├── configuration
│   ├── __init__.py
│   ├── config_parse_hocr_jk.conf
│   └── config_parse_hocr_js.conf
├── logs
│   └── var_occurences.json
├── docs
│   └── img
│       └── docxstruct_logo.png
├── .idea
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   ├── akf-hocrparser.iml
│   ├── workspace.xml
│   ├── codeStyles
│   │   └── Project.xml
│   └── dbnavigator.xml
├── experiments
│   ├── experiments_loop.py
│   ├── experiments_strip.py
│   ├── experiments_number_sizes.py
│   └── experiment_fuzzy_regex.py
├── .gitmodules
├── tests
│   ├── regex_fuzzy_search.py
│   └── strip_if_not_none.py
├── .gitignore
├── lib
│   ├── akf_known_uncategories.py
│   ├── akf_parsing_functions_tables_one.py
│   ├── dictionary_handler.py
│   ├── segment.py
│   ├── additional_info_handler.py
│   ├── snippet_ocr.py
│   ├── akf_parsing_functions_jk.py
│   ├── feature_extractor.py
│   ├── segment_parser.py
│   ├── segment_parser_endobject_factory.py
│   ├── data_helper.py
│   ├── segment_classifier.py
│   └── akf_parsing_functions_one.py
├── additionals
│   └── dictionaries
│       ├── dictionary_income.json
│       └── dictionary_balance.json
├── main_start.py
├── parser.py
├── LICENSE
└── README.md
/configuration/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/logs/var_occurences.json:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/img/docxstruct_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UB-Mannheim/docxstruct/master/docs/img/docxstruct_logo.png
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
[XML content not preserved in this dump]
--------------------------------------------------------------------------------
/experiments/experiments_loop.py:
--------------------------------------------------------------------------------
1 | all_texts = ["asd", "fgh"]
2 |
3 | for text in all_texts:
4 | print("text:", text)
5 | all_texts.append(text)
6 |
7 | print("done", all_texts)
8 |
--------------------------------------------------------------------------------
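Note that the experiment above appends to `all_texts` while iterating over it, so the loop never reaches the final `print` (the list grows as fast as it is consumed). A minimal sketch of the safe variant, iterating over a snapshot of the list:

all_texts = ["asd", "fgh"]

# iterate over a snapshot so the appends cannot feed the loop
for text in list(all_texts):
    print("text:", text)
    all_texts.append(text)

print("done", all_texts)  # done ['asd', 'fgh', 'asd', 'fgh']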
/experiments/experiments_strip.py:
--------------------------------------------------------------------------------
1 | test_text = "This is a test.,.,, "
2 | print("test_text", test_text)
3 | stripped_text = test_text.strip("., ")
4 | print("stripped_text", stripped_text)
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "akf_corelib"]
2 | path = akf_corelib
3 | url = https://github.com/UB-Mannheim/akf-corelib.git
4 | [submodule "hocr_parser"]
5 | path = hocr_parser
6 | url = https://github.com/UB-Mannheim/hocr_parser.git
7 |
--------------------------------------------------------------------------------
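Since nearly every module below imports from `akf_corelib`, a fresh checkout needs these submodules in place: clone with `git clone --recurse-submodules`, or run `git submodule update --init` after a plain clone, before `main_start.py` can run.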
/.idea/misc.xml:
--------------------------------------------------------------------------------
[XML content not preserved in this dump]
--------------------------------------------------------------------------------
/tests/regex_fuzzy_search.py:
--------------------------------------------------------------------------------
1 | # todo: add tests for the regex fuzzy-search implementation (including error_number correctness)
2 |
3 | from akf_corelib.regex_util import RegexUtil as regu
4 |
5 |
6 | text = "my test text"
7 | match, errs = regu.fuzzy_search(r"", text, err_number=0)
--------------------------------------------------------------------------------
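The test above is still a stub (empty pattern, nothing asserted). Judging from `experiments/experiment_fuzzy_regex.py`, `RegexUtil.fuzzy_search` wraps the `regex` package's fuzzy matching; a self-contained sketch of what this test could assert, using the `regex` package directly so it runs without the repo (pattern and error budget are illustrative):

import regex

def fuzzy_search(pattern, text, err_number=2):
    # mirrors the wrapper from the experiments: allow up to err_number edits
    compiled = regex.compile(r"(?:" + pattern + r"){e<=" + str(err_number) + r"}")
    return compiled.search(text)

match = fuzzy_search(r"^Fernschreiber\s?:", "F3rnschreiber:", err_number=1)
assert match is not None
substs, inserts, deletions = match.fuzzy_counts
assert substs + inserts + deletions <= 1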
/.idea/modules.xml:
--------------------------------------------------------------------------------
[XML content not preserved in this dump]
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
[XML content not preserved in this dump]
--------------------------------------------------------------------------------
/.idea/akf-hocrparser.iml:
--------------------------------------------------------------------------------
[XML content not preserved in this dump]
--------------------------------------------------------------------------------
/experiments/experiments_number_sizes.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
4 | def show_sizeof(x, level=0):
5 |
6 | print("\t" * level, x.__class__, sys.getsizeof(x), x)
7 |
8 | if hasattr(x, '__iter__'):
9 | if hasattr(x, 'items'):
10 | for xx in x.items():
11 | show_sizeof(xx, level + 1)
12 | else:
13 | for xx in x:
14 | show_sizeof(xx, level + 1)
15 |
16 |
17 | show_sizeof(None)
18 | show_sizeof(3)
19 | show_sizeof(2**63)
20 | show_sizeof(102947298469128649161972364837164)
21 | show_sizeof(918659326943756134897561304875610348756384756193485761304875613948576297485698417)
22 |
23 | print("One variable test")
24 | gets_bigger = 3
25 | show_sizeof(gets_bigger)
26 |
27 | gets_bigger += 102947298469128649161972364837164
28 | show_sizeof(gets_bigger)
29 |
30 |
31 |
--------------------------------------------------------------------------------
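For context on the numbers this experiment prints: on 64-bit CPython an int is stored as a sequence of 30-bit digits plus a fixed object header, so `sys.getsizeof` grows in 4-byte steps. A sketch of that relation (header and digit width are CPython implementation details, so treat this as a sanity check, not a guarantee):

import sys

def expected_int_size(n):
    # 64-bit CPython: header size = sys.getsizeof(0), plus 4 bytes per 30-bit digit
    ndigits = max(1, (n.bit_length() + 29) // 30)
    return sys.getsizeof(0) + 4 * ndigits

for n in (3, 2**63, 102947298469128649161972364837164):
    print(n, sys.getsizeof(n), expected_int_size(n))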
/.idea/workspace.xml:
--------------------------------------------------------------------------------
[XML content not preserved in this dump; only two stray timestamp values (1532015875915) remained]
--------------------------------------------------------------------------------
/tests/strip_if_not_none.py:
--------------------------------------------------------------------------------
1 | from lib.data_helper import DataHelper as dh
2 |
3 |
4 | # single trailing special character
5 | test_text_1 = "this is my text)"
6 | test_result_1 = dh.strip_if_not_none(test_text_1, ")., ")
7 | test_result_1s = test_text_1.strip(")., ")
8 | test_result_1r = dh.remove_multiple_outbound_chars(test_text_1)
9 |
10 | # multiple trailing special characters
11 | test_text_2 = "this is my text)..."
12 | test_result_2 = dh.strip_if_not_none(test_text_2, ")., ")
13 | test_result_2s = test_text_2.strip(")., ")
14 | test_result_2r = dh.remove_multiple_outbound_chars(test_text_2)
15 |
16 |
17 | # single leading, multiple trailing special characters
18 | test_text_3 = ")this is my text)..."
19 | test_result_3 = dh.strip_if_not_none(test_text_3, ")., ")
20 | test_result_3s = test_text_3.strip(")., ")
21 | test_result_3r = dh.remove_multiple_outbound_chars(test_text_3)
22 |
23 | # multiple leading and multiple trailing special characters
24 | test_text_4 = ")....this is my text)..."
25 | test_result_4 = dh.strip_if_not_none(test_text_4, ")., ")
26 | test_result_4s = test_text_4.strip(")., ")
27 | test_result_4r = dh.remove_multiple_outbound_chars(test_text_4)
28 |
29 |
30 | # with spaces
31 | test_text_5 = ").. ..this is my text). .."
32 | test_result_5 = dh.strip_if_not_none(test_text_5, ")., ")
33 | test_result_5s = test_text_5.strip(")., ")
34 | test_result_5r = dh.remove_multiple_outbound_chars(test_text_5)
35 |
36 |
37 | # a character outside the strip set interrupts the stripping
38 | test_text_6 = ").(...this is my text).(.."
39 | test_result_6 = dh.strip_if_not_none(test_text_6, ")., ")
40 | test_result_6s = test_text_6.strip(")., ")
41 | test_result_6r = dh.remove_multiple_outbound_chars(test_text_6)
42 |
43 | print("done")
44 |
45 |
46 |
47 | # strip for comparison
48 |
49 | test_strip = " u."
50 | test_strip_1 = test_strip.strip(". ")
51 |
52 |
53 | print("done2")
54 |
55 |
--------------------------------------------------------------------------------
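The two helpers imported above live in `lib/data_helper.py`, which is not part of this dump. From the usage in this test, a plausible minimal reading of `strip_if_not_none` is a None-safe wrapper around `str.strip`; this reconstruction is hypothetical, not the repository's actual code:

def strip_if_not_none(text, chars):
    # None-safe strip: pass None through, otherwise strip the given characters from both ends
    if text is None:
        return None
    return text.strip(chars)

assert strip_if_not_none(")....this is my text)...", ")., ") == "this is my text"
assert strip_if_not_none(None, ")., ") is None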
/.gitignore:
--------------------------------------------------------------------------------
1 | output/
2 | output_save/
3 | laptopdata/
4 | AKFII_ocromore_results_local/
5 |
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 |
11 | # C extensions
12 | *.so
13 |
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 |
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 |
67 | # Scrapy stuff:
68 | .scrapy
69 |
70 | # Sphinx documentation
71 | docs/_build/
72 |
73 | # PyBuilder
74 | target/
75 |
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 |
79 | # pyenv
80 | .python-version
81 |
82 | # celery beat schedule file
83 | celerybeat-schedule
84 |
85 | # SageMath parsed files
86 | *.sage.py
87 |
88 | # Environments
89 | .env
90 | .venv
91 | env/
92 | venv/
93 | ENV/
94 | env.bak/
95 | venv.bak/
96 |
97 | # Spyder project settings
98 | .spyderproject
99 | .spyproject
100 |
101 | # Rope project settings
102 | .ropeproject
103 |
104 | # mkdocs documentation
105 | /site
106 |
107 | # mypy
108 | .mypy_cache/
109 |
--------------------------------------------------------------------------------
/experiments/experiment_fuzzy_regex.py:
--------------------------------------------------------------------------------
1 | import re
2 | import regex  # backwards compatible with 're', but with additional functionality
3 | # https://pypi.org/project/regex/ ---> 'fuzzy'-matches
4 | from akf_corelib.regex_util import RegexUtil as regu
5 |
6 | test_texts = [
7 | "Fernschreiber:",
8 | "Fernschreiber :",
9 | "F3rnschreiber:",
10 | "F3pnschreiber:",
11 | "ernschreiber:",
12 | "ernschr3iber:",
13 | "Fernschreiber!",
14 | "asdwevc!"
15 | ]
16 |
17 |
18 | example = regex.fullmatch(r"(?:cats|cat){e<=1}", "cat").fuzzy_counts
19 | print("Example is:", example)
20 |
21 | def regexfuzzy_search(pattern, text, err_number=2):
22 | compiled_wrapper = regex.compile(r"(?:"+pattern+"){e<="+str(err_number)+"}")
23 | result = compiled_wrapper.search(text)
24 | return result
25 |
26 |
27 | # costs of insert, delete and substitute can be defined: {2i+2d+1s<=4} means each insertion costs 2, etc.
28 | def test_1():
29 | for text in test_texts:
30 | compiled = regex.compile(r"(?:^Fernschreiber\s?:){e<=1}")
31 | match_stop = compiled.search(text)
32 | if match_stop is not None:
33 | (substs, inserts, deletions) = match_stop.fuzzy_counts
34 | accumulated_errs = substs + inserts + deletions
35 |
36 | print("Text is:", text, "Match is True", "Errors:", (substs, inserts, deletions) )
37 | else:
38 | print("Text is:", text, "Match is False", "Errors: higher than limit")
39 |
40 |
41 | # search with dynamic wrapper function (better looking regex)
42 | for text in test_texts:
43 |     match_stop = regexfuzzy_search(r"^Fernschreiber\s:", text)
44 | if match_stop is not None:
45 | (substs, inserts, deletions) = match_stop.fuzzy_counts
46 | accumulated_errs = substs + inserts + deletions
47 |
48 | print("Text is:", text, "Match is True", "Errors:", (substs, inserts, deletions))
49 | else:
50 | print("Text is:", text, "Match is False", "Errors: higher than limit")
51 |
52 |
53 |
54 | match_shorter_text, errs = regu.fuzzy_search("^Texte", "Text", err_number=2)
55 | #if match_shorter_text:
56 | #result = match_shorter_text.text
57 |
58 | # jk example
59 | match_shorter_text2, errs2 = regu.fuzzy_search("^rückstellungen$", "rücksstellungen", err_number=2)
60 | if match_shorter_text2:
61 | result = match_shorter_text2.text
--------------------------------------------------------------------------------
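The comment in the file mentions weighted edit costs. A runnable sketch of that syntax: in `{2i+2d+1s<=4}` each insertion and deletion costs 2, each substitution costs 1, and the total cost may not exceed 4; the sample texts are illustrative:

import regex

# substitutions cost 1, insertions/deletions cost 2, total cost budget 4
pattern = regex.compile(r"(?:Fernschreiber){2i+2d+1s<=4}")

for text in ("F3rnschre1ber", "Frnschreibr", "Telefon"):
    match = pattern.search(text)
    print(text, "->", match.fuzzy_counts if match else "no match within cost budget")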
/lib/akf_known_uncategories.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | class KnownUncategories(object):
4 | """
5 | List of known entries in test_data which are no categories,
6 | but are recognized as such
7 | """
8 |
9 | def __init__(self):
10 |
11 |         # un-category regex strings (care for commas)
12 |         self.uc = [
13 |             r"Beteiligung",            # 1956: is part of Beteiligungen
14 |             r"Ferngespräche",          # 1956: is part of Fernruf/Telefon
15 |             r"Kapital",                # 1956: is part of multiple top-level items
16 |             r"Umstellung \d\d?",       # 1956: is part of Grundkapital or other
17 |             r"Dividenden ab \d{4}.*",  # 1956: is part of Dividenden or other (with year or year span)
18 |             r"^Kurs.*",                # 1956: second-level tag
19 |             r"ab \d{4}(\/\d{2})?"      # 1956: e.g. "ab 1949/50" - part of other categories
20 |         ]
21 |
22 |         # non-specific keys (which are not removed from the original-rest in analysis)
23 |         self.nkeys = [
24 |             "street",
25 |             "street_number",
26 |             "additional_info",
27 |             "city",
28 |             "name",
29 |             "title",
30 |             "rest",
31 |             "location",
32 |             "number_Sa.-Nr.",
33 |             "rest_info",
34 |             "bank",
35 |             "amount",
36 |             "ord_number",
37 |             "organization",
38 |         ]
41 |
42 | # create corresponding regexes
43 | self.uc_regex = []
44 | for item in self.uc:
45 | regex_compiled = re.compile(item)
46 | self.uc_regex.append(regex_compiled)
47 |
48 | @property
49 | def uncategories(self):
50 | return self.uc
51 |
52 | @property
53 | def unkeys(self):
54 | return self.nkeys
55 |
56 | def check_uncategories(self, text_to_check):
57 | """
58 | Allows to compare a tag against the existing uncategories
59 | :param text_to_check: tag text
60 | :return: True if un-category, False if not
61 | """
62 | for regex_to_check in self.uc_regex:
63 | match_result = regex_to_check.search(text_to_check)
64 | if match_result is not None:
65 | return True
66 |
67 | return False
--------------------------------------------------------------------------------
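A usage sketch for the class above (assuming `lib` is importable from the project root); note that the patterns are applied with `search`, so entries match as substrings unless anchored like `^Kurs.*`:

from lib.akf_known_uncategories import KnownUncategories

ku = KnownUncategories()
print(ku.check_uncategories("Umstellung 10"))  # True  (matches r"Umstellung \d\d?")
print(ku.check_uncategories("Kursbericht"))    # True  (matches the anchored r"^Kurs.*")
print(ku.check_uncategories("Vorstand"))       # False (no un-category pattern matches)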
/.idea/codeStyles/Project.xml:
--------------------------------------------------------------------------------
[XML content not preserved in this dump]
--------------------------------------------------------------------------------
/lib/akf_parsing_functions_tables_one.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from .data_helper import DataHelper as dh
4 | from .akf_parsing_functions_common import AKFCommonParsingFunctions as cf
5 | from akf_corelib.regex_util import RegexUtil as regu
6 |
7 | import regex
8 |
9 |
10 | class AkfParsingFunctionsTablesOne(object):
11 |
12 | def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
13 | config_handler = ConfigurationHandler(first_init=False)
14 |
15 | self.config = config_handler.get_config()
16 | self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_TABLES_ONE, self.config.PRINT_EXCEPTION_LEVEL,
17 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
18 |
19 | self.cpr.print("init akf parsing functions tables one")
20 |
21 | self.ef = endobject_factory
22 | self.output_analyzer = output_analyzer
23 | self.dictionary_handler = dictionary_handler
24 |
25 |
26 | def parse_aktienkurse(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
27 | # get basic data
28 | element_counter = 0
29 | origpost, origpost_red, element_counter, content_texts = \
30 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
31 |
32 | # logme
33 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
34 |
35 |
36 | def parse_dividenden(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
37 | # get basic data
38 | element_counter = 0
39 | origpost, origpost_red, element_counter, content_texts = \
40 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
41 |
42 | # logme
43 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
44 |
45 |
46 | def parse_dividenden_auf_xyaktien(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
47 | # get basic data
48 | element_counter = 0
49 | origpost, origpost_red, element_counter, content_texts = \
50 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
51 |
52 | # logme
53 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
54 |
--------------------------------------------------------------------------------
/lib/dictionary_handler.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from .data_helper import DataHelper as dh
4 |
5 | import regex
6 | import json
7 | import os
8 |
9 |
10 | class DictionaryHandler(object):
11 |
12 | def __init__(self):
13 | config_handler = ConfigurationHandler(first_init=False)
14 |
15 | self.config = config_handler.get_config()
16 | self.cpr = ConditionalPrint(self.config.PRINT_DICTIONARY_HANDLER, self.config.PRINT_EXCEPTION_LEVEL,
17 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
18 |
19 | self.cpr.print("init dictionary handler")
20 | self.data_functs = None # storage for json object
21 | self.data_titles = None # storage for json object
22 | self.texts_functs = None
23 | self.texts_titles = None
24 | if self.config.USE_DICTIONARIES_FOR_PERSON_PARSING:
25 | self.load_dictionaries()
26 | # get the rows as sorted list of texts longest first
27 | if self.data_functs is not None:
28 | check_tf = self.sort_rows(self.get_rows(self.data_functs))
29 | self.texts_functs = check_tf
30 | if self.data_titles is not None:
31 | check_tt = self.sort_rows(self.get_rows(self.data_titles))
32 | self.texts_titles = check_tt
33 |
34 | def diff_name_title(self, text_to_check):
35 |
36 | len_text_to_check = len(text_to_check)
37 | name_found = text_to_check
38 | title_found = ""
39 |
40 | for entry_index, entry in enumerate(self.texts_titles):
41 | title, tlen = entry
42 |             # accelerate the process by skipping titles that are longer than the text to check
43 | if tlen > len_text_to_check:
44 | continue
45 | # compare the texts
46 | if title in text_to_check:
47 | name_found = text_to_check.replace(title, "", 1).strip()
48 | title_found = title
49 | break
50 |
51 |
52 | return name_found, title_found
53 |
54 | def load_dictionaries(self):
55 | base_dict_path = self.get_dict_path()
56 |
57 | filepath_titles_dict = os.path.join(base_dict_path, "dict_titles.json")
58 | filepath_functs_dict = os.path.join(base_dict_path, "dict_functs.json")
59 |
60 | # load titles
61 | if os.path.exists(filepath_titles_dict):
62 | with open(filepath_titles_dict) as f:
63 | self.data_titles = json.load(f)
64 | else:
65 | self.cpr.printex("dictionary dict_titles.json missing at specificied path",filepath_titles_dict)
66 |
67 | # load functs
68 | if os.path.exists(filepath_functs_dict):
69 | with open(filepath_functs_dict) as f:
70 | self.data_functs = json.load(f)
71 | else:
72 | self.cpr.printex("dictionary dict_functs.json missing at specificied path",filepath_functs_dict)
73 |
74 |
75 | def get_rows(self, dict_data):
76 | rows = dict_data['rows']
77 | final_rows = []
78 | for entry in rows:
79 | text = entry[0]
80 | final_rows.append((text,len(text)))
81 | return final_rows
82 |
83 | def sort_rows(self, rows):
84 | #itemgetter(1),
85 | rows.sort(key=lambda t: len(t[0]), reverse=True)
86 | return rows
87 |
88 | def path(self):
89 | return os.getcwd()
90 |
91 | def get_dict_path(self):
92 | complete = os.path.join(self.path(),"additionals","dictionaries")
93 | return complete
--------------------------------------------------------------------------------
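The longest-first ordering produced by `sort_rows` is what makes `diff_name_title` behave correctly: a longer title such as "Dr. med." must be tried before its prefix "Dr.", otherwise only the prefix would be stripped from the name. A standalone sketch of that matching logic with hypothetical titles:

titles = [("Dr.", 3), ("Dr. med.", 8), ("Dipl.-Ing.", 10)]
titles.sort(key=lambda t: len(t[0]), reverse=True)  # longest first, as in sort_rows

def diff_name_title(text_to_check, titles):
    for title, tlen in titles:
        if tlen > len(text_to_check):
            continue  # skip titles that cannot fit, as in the class above
        if title in text_to_check:
            return text_to_check.replace(title, "", 1).strip(), title
    return text_to_check, ""

print(diff_name_title("Dr. med. Max Mustermann", titles))  # ('Max Mustermann', 'Dr. med.')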
/lib/segment.py:
--------------------------------------------------------------------------------
1 | import abc
2 |
3 |
4 |
5 | class Segment(object):
6 | """
7 | Root segment class for a classification segments,
8 | child specialized sgments are stored in SegmentHolder
9 | class.
10 | """
11 | __metaclass__ = abc.ABCMeta
12 |
13 | def __init__(self, segment_tag):
14 | self.start_was_segmented = False
15 | self.stop_was_segmented = False
16 | self.start_error_number = 0
17 | self.stop_error_number = 0
18 |
19 | self.enabled = True
20 | self.only = False
21 | self.start_line_index = -1
22 | self.stop_line_index = -1
23 | self.key_tag_cindex_start = -1 # character index of keytag: 'Vorstand: Name' ---> 0
24 | self.key_tag_cindex_stop = -1 # character index of keytag: 'Vorstand: Name' ---> 9
25 | self.restcontent_in_start_line = -1
26 | self.segment_tag = segment_tag
27 | self.snippet = None
28 | self.info_handler = None
29 |
30 | def disable(self):
31 | self.enabled = False
32 |
33 | def set_only(self):
34 | self.only = True
35 |
36 | def set_start_error_number(self, start_error_number):
37 | self.start_error_number = start_error_number
38 |
39 | def get_start_error_number(self):
40 | return self.start_error_number
41 |
42 | def set_stop_error_number(self, stop_error_number):
43 | self.stop_error_number = stop_error_number
44 |
45 | def get_stop_error_number(self):
46 | return self.stop_error_number
47 |
48 | def get_start_line_index(self):
49 | return self.start_line_index
50 |
51 | def get_stop_line_index(self):
52 | return self.stop_line_index
53 |
54 | def get_segment_tag(self):
55 | return self.segment_tag
56 |
57 | def do_match_work(self, start_or_stop, match, line_index, match_errors):
58 | if start_or_stop is True: # it's a start match
59 | self.set_keytag_indices(match) # this separates keytag from rest of line
60 | self.start_line_index = line_index
61 | self.start_was_segmented = True
62 | self.set_start_error_number(match_errors)
63 | else:
64 | self.stop_line_index = line_index
65 | self.stop_was_segmented = True
66 | self.set_stop_error_number(match_errors)
67 |
68 | @abc.abstractmethod
69 | def match_start_condition(self, line, line_text, line_index, features, num_lines, prev_line, combined_texts):
70 |         return  # returning 0 would indicate "undefined"; don't return it from overridden conditions
71 |
72 | @abc.abstractmethod
73 | def match_stop_condition(self, line, line_text, line_index, features, num_lines, prev_line, combined_texts):
74 | # by default don't assign any stop condition, leave at initial value
75 | # self.stop_line_index = self.start_line_index
76 |         return  # returning 0 would indicate "undefined"; don't return it from overridden conditions
77 |
78 | def start_or_stop_segmented(self):
79 | if self.start_was_segmented or self.stop_was_segmented:
80 | return True
81 | else:
82 | return False
83 |
84 | def is_start_segmented(self):
85 | return self.start_was_segmented
86 |
87 | def is_stop_segmented(self):
88 | return self.stop_was_segmented
89 |
90 | def set_stop_segmented(self, stop_index):
91 | self.stop_line_index = stop_index
92 | self.stop_was_segmented = True
93 |
94 | def set_start_segmented(self, start_index):
95 | self.start_line_index = start_index
96 | self.start_was_segmented = True
97 |
98 | def set_keytag_indices(self, match):
99 | """
100 | From regex match set the keytag indices, takes 1st occurence,
101 | also checks if there is restcontent besides the match in the
102 | line to check
103 | :param match: regex match
104 | :return:
105 | """
106 | start_m = match.regs[0][0]
107 | stop_m = match.regs[0][1]
108 |
109 | self.key_tag_cindex_start = start_m
110 | self.key_tag_cindex_stop = stop_m
111 | len_match = stop_m-start_m
112 | len_rest = len(match.string)-len_match
113 | if len_rest > 0:
114 | self.restcontent_in_start_line = len_rest
--------------------------------------------------------------------------------
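`set_keytag_indices` uses `match.regs[0]`, the `(start, stop)` span of the whole match, and `match.string`, the full line the regex was run against. A short sketch of the rest-content computation for the docstring's own example:

import re

line = "Vorstand: Name"
match = re.search(r"Vorstand:", line)

start_m, stop_m = match.regs[0]                   # (0, 9): keytag character indices
len_rest = len(match.string) - (stop_m - start_m)
print(start_m, stop_m, len_rest)                  # 0 9 5 -> 5 characters of rest content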
/additionals/dictionaries/dictionary_income.json:
--------------------------------------------------------------------------------
1 | {
2 | "Zusatz":
3 | {
4 | "darunter": "",
5 | "Sonstiges": "",
6 | "Sonst.": "",
7 | "Langfristige": "",
8 | "Langfr." : "",
9 | "Durchlaufende": "",
10 | "dauernde": ""
11 | },
12 | "Hauptpunkte":
13 | {
14 | "rechnungen Löhne und Gehälter": "Löhne und Gehälter",
15 | "Löhne und Gehälter": "Löhne und Gehälter",
16 | "Abschreibungen u. Werberichtigung": "Abschreibungen und Werberichtigung",
17 | "Abschreibungen": "Abschreibungen",
18 | "Jahresrohertrag": "Jahresrohertrag",
19 | "Jahresertrag": "Jahresertrag",
20 | "Steuern": "Steuern",
21 | "Beteiligungserträge": "Beteiligungserträge",
22 | "Gehälter": "Gehälter",
23 | "Ausweispfl. Steuern": "Ausweispfl. Steuern",
24 | "Personalaufwendungen": "Personalaufwendungen",
25 | "Abschreibungen auf Anlagen": "Abschreibungen auf Anlagen",
26 | "auf Anlagen": "Abschreibungen auf Anlagen",
27 | "Steuern u. ähnl. Abgaben": "Steuern und ähnl. Abgaben",
28 | "Steuern und ähnl. Abgaben": "Steuern und ähnl. Abgaben",
29 | "Dividenden aus Beteiligungen":"Dividenden aus Beteiligungen" ,
30 | "Zinsen u. Diskonterträge": "Zinsen u. Diskonterträge",
31 | "Provisionen, Gebühren u.ähnl. Erträge": "Provisionen, Gebühren u.ähnl. Erträge",
32 | "Zins und Diskonterträge": "Zins und Diskonterträge",
33 | "Zinsaufwendungen": "Zinsaufwendungen",
34 | "Zins und Provisionserträge": "Zins und Provisionserträge",
35 | "Betriebsergebnisse": "Betriebsergebnisse",
36 | "Verwaltungskosten": "Verwaltungskosten",
37 | "Steuern und öffentliche Abgaben": "Steuern und öffentliche Abgaben",
38 | "Zahlungen für Versicherungsfälle": "Zahlungen für Versicherungsfälle",
39 | "Beitragseinnahmen": "Beitragseinnahmen",
40 | "Erträge aus Beteiligungen": "Erträge aus Beteiligungen",
41 | "Steuern u.öffentliche Abgaben": "Steuern und öffentliche Abgaben",
42 | "Beteiligungsertrag": "Beteiligungsertrag",
43 | "Rückversicherungsbeiträge": "Rückversicherungsbeiträge",
44 | "Verschiedene Unkosten": "Verschiedene Unkosten",
45 | "Allgemeine Unkosten": "Allgemeine Unkosten",
46 | "Zinserträge": "Zinserträge",
47 | "Ertragssteuern":"Ertragssteuern",
48 | "Abschreibungen und Wertberichtigungen":"Abschreibungen und Wertberichtigungen",
49 | "Wertberichtigung":"Wertberichtigung",
50 | "Gebühren und ähnl. Erträge": "Gebühren und ähnl. Erträge",
51 | "Zinsen und Diskonterträge": "Zinsen und Diskonterträge",
52 | "Allg. Unkosten, Gehälter u. Steuern": "Allg. Unkosten, Gehälter u. Steuern",
53 | "Provisionen und sonst. Erträge": "Provisionen und sonst. Erträge",
54 | "Verschiedene Einnahmen": "Verschiedene Einnahmen",
55 | "Verwaltungsunkosten": "Verwaltungsunkosten",
56 | "Warenkonto": "Warenkonto",
57 | "ähnl. einmalige Erträge": "ähnl. einmalige Erträge",
58 | "Gehälter u. Pensionen": "Gehälter u. Pensionen",
59 | "Zins u. Prov. Einn.": "Zins u. Prov. Einn.",
60 | "Provisionen, Gebühren und ähnl.Erträge": "Provisionen, Gebühren und ähnl.Erträge",
61 | "Anlagevermögen und anderes": "Anlagevermögen und anderes",
62 | "Steuern und soziale Aufwendungen": "Steuern und soziale Aufwendungen",
63 | "sehr.u.aufgenommene Darlehen": "sehr.u.aufgenommene Darlehen",
64 | "Darlehensprovisionen etc.": "Darlehensprovisionen etc.",
65 | "Erstattung der Liquidationskosten": "Erstattung der Liquidationskosten",
66 | "Steuern und Umlagen": "Steuern und Umlagen",
67 | "Provisionen u. sonstige Erträge": "Provisionen und sonst. Erträge",
68 | "Zins u. ähnliche Erträge": "Zins u. ähnliche Erträge",
69 | "Jahreseinnahmen": "Jahreseinnahmen",
70 | "Gehälter und Pensionen": "Gehälter und Pensionen",
71 | "Rohüberschuss": "Rohüberschuss",
72 | "Miet und Pachterträge": "Miet und Pachterträge",
73 | "Nettoverkaufserlös": "Nettoverkaufserlös",
74 | "EEV-Steuern u. LAG": "EEV-Steuern u. LAG",
75 | "Jahresüberschuß": "Jahresüberschuß",
76 | "Jahresfehlbetrag": "Jahresfehlbetrag",
77 | "Herstellungsaufwand":"Herstellungsaufwand",
78 | "Sachaufwand":"Sachaufwand",
79 | "Einmalige Aufwendungen":"Einmalige Aufwendungen",
80 | "Gehälter, Löhne u. Sozialabgaben":"Gehälter, Löhne u. Sozialabgaben",
81 | "Überschuß des Geschäftsjahres":"Überschuß des Geschäftsjahres",
82 | "EEV-Steuern": "EEV-Steuern",
83 | "Besitzsteuern": "Besitzsteuern",
84 | "Gewinnabführung": "Gewinnabführung",
85 | "Materialaufwand, Fremdleistung": "Materialaufwand und Fremdleistung",
86 | "Materialaufwand": "Materialaufwand",
87 | "Verkaufs und allgemeine Unkosten":"Verkaufs und allgemeine Unkosten",
88 | "Vermögensfreigaben im Ausland":"Vermögensfreigaben im Ausland",
89 | "Personalaufwand": "Personalaufwand",
90 | "Betriebsgewinnanteil": "Betriebsgewinnanteil",
91 | "Betriebsausgaben": "Betriebsausgaben",
92 | "Leistungen für Versicherungsfälle": "Leistungen für Versicherungsfälle",
93 | "Erträge des eigenen Verkehrsbetriebes": "Erträge des eigenen Verkehrsbetriebes",
94 | "Produktions u. Verwaltungskosten": "Produktions u. Verwaltungskosten",
95 | "Wertberichtigungen u. Rückstellungen": "Wertberichtigungen u. Rückstellungen",
96 | "Personalaufwendungen ohne Sozialleistungen": "Personalaufwendungen ohne Sozialleistungen",
97 | "Gebührenrohüberschuss": "Gebührenrohüberschuss"
98 | },
99 | "Unterpunkte": {}
100 | }
--------------------------------------------------------------------------------
/lib/additional_info_handler.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 |
4 |
5 | import json
6 | import glob
7 | import pandas as pd
8 | from os import path
9 |
10 |
11 |
12 | class AdditionalInfoHandler(object):
13 |
14 | def __init__(self):
15 | config_handler = ConfigurationHandler(first_init=False)
16 |
17 | self.config = config_handler.get_config()
18 | self.cpr = ConditionalPrint(self.config.PRINT_ADDITIONAL_INFO_HANDLER, self.config.PRINT_EXCEPTION_LEVEL,
19 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
20 | self.cpr.print("init additional info handler")
21 |
22 |
23 |     def write_excel_to_json(self, fileinfo, filepath, filetype, idxcol=None, parse_cols=None, page=0):
24 |         """
25 |         At the moment a little helper script for the Aktienführer project.
26 |         Feel free to modify as you wish.
27 |         """
28 | #if isinstance(parse_cols, list): parse_cols = [parse_cols],
29 | additional_filepath = path.normpath(f"{filepath}/**/*{fileinfo.dbname}.{filetype}")
30 | file = glob.glob(additional_filepath,recursive=True)
31 | if len(file)!= 1: return None
32 | if filetype in ["xlsx","xls"]:
33 | df = pd.read_excel(file[0]).set_index("ProfileID")
34 | jsondata = {fileinfo.dbname:{"Year":fileinfo.dbname}}
35 | jsondf = df.to_dict(orient="index")
36 | jsondata.update(jsondf)
37 | with open(file[0].replace("xlsx","json"),"w") as output:
38 | json.dump(jsondata, output,indent=4)
39 | return None
40 |
41 |     def fetch_additional_information_simple(self, file):
42 |         """
43 |         Same as fetch_additional_information, but the config-related parameters
44 |         are already filled in from the configuration
45 |         :return: additional info
46 |         """
47 | if self.config.ADDITIONAL_INFORMATION:
48 | additional_info = self.fetch_additional_information(file, self.config.INPUT_ADDINFOPATH,
49 | idxcol= self.config.IDXCOL,parse_cols=self.config.PARSE_COLS,
50 | filetype =self.config.INPUT_ADDINFOFILETPYE)
51 | return additional_info
52 |
53 | return None
54 |
55 |     def fetch_additional_information(self, fileinfo, filepath, filetype, idxcol=None, parse_cols=None, page=0):
56 |         """
57 |         Reads an additional file with information.
58 |         It searches for the file whose name matches fileinfo.dbname under filepath
59 |         :param fileinfo: file info object (provides dbname and tablename)
60 |         :param filetype: extension of the info file ("xlsx", "xls" or "json")
61 |         :return: additional info
62 |         """
63 | #if isinstance(parse_cols, list): parse_cols = [parse_cols]
64 | additional_filepath = path.normpath(f"{filepath}/**/*{fileinfo.dbname}.{filetype}")
65 | file = glob.glob(additional_filepath,recursive=True)
66 |
67 | len_files = len(file)
68 | if len_files > 1:
69 | self.cpr.printex("More than one additional information file was found!")
70 | return None
71 | if len_files == 0:
72 | self.cpr.printex("No additional information file was found!")
73 | return None
74 |
75 | file = file[0]
76 | current_db_and_table = {"db": fileinfo.dbname, "table": fileinfo.tablename}
77 | if filetype in ["xlsx","xls"]:
78 | infos = {}
79 | info_df = pd.read_excel(file)#.set_index("ProfileID")
80 | parse_cols.remove(idxcol)
81 | for db_and_table_id, current_db_and_tablename in current_db_and_table.items():
82 | infos[db_and_table_id] = {}
83 | for line, rubric_content in info_df.loc[info_df[idxcol]==current_db_and_tablename][parse_cols].to_dict(orient="index").items():
84 | for rubric, content in rubric_content.items():
85 | if rubric != idxcol:
86 | if infos[db_and_table_id].get(rubric,None) is None:
87 | infos[db_and_table_id][rubric] = content
88 | elif infos[db_and_table_id].get(rubric,None) != content:
89 | if not isinstance(infos[db_and_table_id][rubric], list): infos[db_and_table_id][rubric] = [infos[db_and_table_id][rubric]]
90 | infos[db_and_table_id][rubric].append(content)
91 | elif filetype == "json":
92 | with open(file, "r") as add_info_file:
93 | infos = json.load(add_info_file)
94 |
95 | for possible_db_or_tablenames in reversed(list(infos.keys())):
96 | possible_db_or_tablenames_orig = possible_db_or_tablenames # unchanged name
97 |
98 | if self.config.ADD_INFO_SIMPLIFIED_NAME_COMPARISON:
99 | psplit = possible_db_or_tablenames.split("-")
100 | possible_db_or_tablenames = psplit[0]
101 |
102 | if possible_db_or_tablenames not in current_db_and_table['table']:
103 | del infos[possible_db_or_tablenames_orig]
104 | else:
105 | for db_and_table_id, current_db_and_tablename in current_db_and_table.items():
106 | if possible_db_or_tablenames == current_db_and_tablename:
107 | infos[db_and_table_id] = infos[possible_db_or_tablenames_orig]
108 | del infos[possible_db_or_tablenames_orig]
109 | else:
110 | return None
111 | return infos
--------------------------------------------------------------------------------
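Both fetch methods locate the info file with a recursive glob built from the database name. A minimal sketch of that lookup with hypothetical values (`dbname = "1969"`):

import glob
from os import path

filepath, dbname, filetype = "/media/sf_Transfer/additional_information", "1969", "json"

# '**' plus recursive=True descends into subdirectories, as in fetch_additional_information
pattern = path.normpath(f"{filepath}/**/*{dbname}.{filetype}")
files = glob.glob(pattern, recursive=True)

if len(files) != 1:
    print("expected exactly one additional information file, found", len(files))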
/lib/snippet_ocr.py:
--------------------------------------------------------------------------------
1 | from os import path, makedirs
2 | import numpy as np
3 | from PIL import Image
4 | from tesserocr import PyTessBaseAPI, RIL, iterate_level
5 |
6 | class Snippet(object):
7 |     """This class works with a bbox on the original image:
8 |     - snip the bbox out of the image
9 |     - OCR the snippet with tesseract, giving text and bbox per word and confidences per char
10 |     - store the snippet"""
11 |
12 | def __init__(self):
13 | self.bbox = None
14 | self.imgpath = None
15 | self.imgname = None
16 | self.ftype = None
17 | self.fname = None
18 | self.img = None
19 | self.shape = None
20 | self.snippet = None
21 | self.result = None
22 | self.__ocr_settings = {"lang":"akf3","psm":6,"oem":3}
23 |
24 |
25 | def imread(self, imgpath):
26 | """Loads the image with PIL-Lib"""
27 | try:
28 | self.imgpath = imgpath
29 | self.imgname = path.basename(imgpath)
30 | self.ftype = self.imgname.split(".")[-1]
31 | if self.ftype.lower() not in ["jpg", "png", "bmp", "gif", "tiff"]:
32 | raise NameError
33 | self.img = Image.open(f"{self.imgpath}")
34 | self.snippet = self.img
35 | self.shape = list(self.img.tile[0][1]) #[:2]+self.img.tile[0][1][4:1:-1])
36 | self.bbox = self.shape
37 | except IOError:
38 | print(f"cannot open {self.imgpath}")
39 | except NameError:
40 | print(f"The image filetype {self.ftype} is not supported!")
41 | return True
42 |
43 | def save(self, snippetpath:str):
44 | """Saves the snippet"""
45 | try:
46 | if self.imgname is None:
47 | raise NameError
48 | if not path.exists(snippetpath):
49 | makedirs(snippetpath)
50 | bboxstr = "_".join(str(bboxval) for bboxval in self.bbox)
51 | self.fname = snippetpath + self.imgname.split(".")[0] + "_bbox_" + bboxstr + "." + ".".join(self.imgname.split(".")[1:])
52 | self.snippet.save(self.fname)
53 | except NameError:
54 | print("Please load an image first.")
55 | except Exception as E:
56 | print(f"{self.fname} could not be stored:{E}")
57 | return True
58 |
59 | def crop(self, bbox:list):
60 | """Snip the bboxarea out of the image"""
61 | try:
62 | if self.img is None:
63 | raise NameError
64 |             if not isinstance(bbox, list) or len(bbox) != 4:
65 |                 raise TypeError
66 |             if any(np.less(bbox[:2], self.shape[:2])) or any(np.greater(bbox[2:4], self.shape[2:4])):
67 |                 raise ValueError
68 | if bbox != self.bbox:
69 | self.bbox = bbox[:]
70 | self.snippet = self.img.crop(self.bbox)
71 | except TypeError:
72 | print("The bbox has not the right type or format.")
73 | except NameError:
74 | print("Please load an image first.")
75 | except ValueError as E:
76 | print(f"The bbox shape doesnt match the image shape. {E}")
77 | except Exception as E:
78 | print(E)
79 | else:
80 | return True
81 | return False
82 |
83 | @property
84 | def ocr_settings(self):
85 | return self.__ocr_settings
86 |
87 |     @ocr_settings.setter
88 |     def ocr_settings(self, settings: dict):
89 |         """Set the tesseract parameters; a property setter takes a single value, so lang/psm/oem are passed as a dict"""
90 |         for key in ("lang", "psm", "oem"):
91 |             if settings.get(key) is not None:
92 |                 self.__ocr_settings[key] = settings[key]
97 |
98 | def to_text(self):
99 | """Performs tesseract on the snippet"""
100 | try:
101 | if self.bbox is None:
102 | raise ValueError
103 | with PyTessBaseAPI(**self.ocr_settings) as api:
104 | api.SetImage(self.snippet)
105 | api.Recognize()
106 | ri = api.GetIterator()
107 | conf = []
108 | line = -1
109 | self.result=[]
110 | for r in iterate_level(ri, RIL.SYMBOL):
111 | if r.Empty(RIL.TEXTLINE):continue
112 | if r.IsAtBeginningOf(RIL.TEXTLINE):
113 | line += 1
114 | self.result.append({"text":"","words":[],"charconf":[],"bbox":[]})
115 | self.result[line]["text"] = r.GetUTF8Text(RIL.TEXTLINE)
116 | #print(r.GetUTF8Text(RIL.TEXTLINE))
117 | if r.IsAtFinalElement(RIL.WORD,RIL.SYMBOL):
118 | self.result[line]["words"].append(r.GetUTF8Text(RIL.WORD))
119 | self.result[line]["bbox"].append(r.BoundingBoxInternal(RIL.WORD))
120 | self.result[line]["charconf"].append(conf)
121 | conf = []
122 | conf.append(r.Confidence(RIL.SYMBOL))
123 | if conf:
124 | self.result[line]["charconf"].append(conf)
125 | except ValueError:
126 | print("Please first set the bbox value with snip_bbox.")
127 | return True
128 |
129 | @property
130 | def text(self):
131 | if self.result:
132 | text = ""
133 | for line in self.result:
134 | text += line["text"]
135 | return text
136 | else:
137 | return ""
138 |
139 |
140 |
141 |
142 |
143 |
144 |
--------------------------------------------------------------------------------
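A usage sketch for the class above; it assumes tesseract and the `tesserocr` bindings are installed and that the `akf3` traineddata set as the default `lang` is available (the image path is illustrative):

from lib.snippet_ocr import Snippet

snippet = Snippet()
snippet.imread("/media/sf_ShareVB/img/0585_1969.png")  # hypothetical image path
snippet.crop([100, 200, 400, 260])   # bbox as [x0, y0, x1, y1] on the original image
snippet.save("./img/snippets/")      # stores e.g. 0585_1969_bbox_100_200_400_260.png
snippet.to_text()                    # runs tesseract on the cropped snippet
print(snippet.text)                  # concatenated text of all recognized lines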
/configuration/config_parse_hocr_jk.conf:
--------------------------------------------------------------------------------
1 |
2 | INPUT_FILETYPES = [hocr, untype]
3 | #INPUT_FILEGLOB = ./AKFII_ocromore_results_local/msa_best/**/*. # local test folder
4 | # INPUT_FILEGLOB = /media/johannes/AKFII/AKF/AKFII_ocromore_results/msa_best/**/*. # this is the hocr-output of ocromore
5 | # INPUT_FILEGLOB = /media/sf_Transfer/testfiles_hocr/**/*.
6 | # INPUT_FILEGLOB = laptopdata/testfiles_hocr/**/*.
7 | INPUT_FILEGLOB = /media/sf_ShareVB/msa_best/all_years/**/*. # jk this is the hocr-output of ocromore
8 |
9 | USE_SNIPPET = True # Use the snippet tool for re-OCRing snippets of the original image
10 | IMAGE_PATH = /media/sf_ShareVB/ # Storing path
11 | DRAW_SEPARATOR = False # Save table cutouts with the separator drawn
12 | SAVE_SNIPPET = False # Use toolbox methods (requires tesseract and tesserocr to be installed)
13 | IMGPATH = ./img/ # ./ -> relative to input path
14 | OPATH = ./img/snippets/ # ./ -> relative to input path
15 |
16 | INPUT_TABLE_DICTIONARY = ./additionals/dictionaries/ # Path to dictionaries
17 | USE_TABLE_DICTIONARY = True # Use the dictionaries to correct, split and find the order level
18 |
19 | STORE_OCCURENCES = True # Store occurrences of item names (tables)
20 | OCCURENCES_TABLETYPE = all # Table type to store [datatable_income, datatable_balance, all]
21 |
22 |
23 | [Additional informations settings]
24 | ADDITIONAL_INFORMATION = True
25 | INPUT_ADDINFOPATH = /media/sf_ShareVB/many_years_firmprofiles/additional_information/ #Additional information files
26 | #INPUT_ADDINFOPATH = /media/sf_Transfer/additional_information/ #Additional information files
27 | INPUT_ADDINFOFILETPYE = json
28 | IDXCOL = ProfileID # Column name which is matched with the table name
29 | PARSE_COLS = [LABEL,ProfileID] # Columns which should be parsed to the add info
30 |
31 | TABLENAME_POS = 1 # in example '0585_...hocr'
32 | OCR_PROFILE_POS = 3 # in example: 'default'
33 | OCR_POS = 4 # in example: 'tess'
34 | DBPATH_POS = 2 # in example: '1969'
35 |
36 |
37 | OUTPUT_ROOT_PATH = ./output/
38 |
39 | [Segmentation settings]
40 | ADD_INFO_SIMPLIFIED_NAME_COMPARISON = True # in the additional info handler, simplify the table name comparison
41 | REMATCH_START_CONDITION_UNTIL_ZERO_ERROR = True
42 | MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION = True # do the index matching until the next start tag or, if defined, until the next explicitly recognized stop tag; if False, only start tags are set to the index field
43 | FILTER_UNCATEGORIES_OVERALL = True # filter the tags which are in known_uncategories in the accumulated segmentation report
44 | #todo add multimatch output in logging
45 | [Parsing settings]
46 | ADD_FULLTEXT_ENTRY = True # adds an entry at the start of json which contains the complete text to parse for verification
47 | ADD_ADDITIONAL_INFO = True # adds the additional information to the output file
48 | ADD_INFO_ENTRY_TO_OUTPUT = True # add entries to output, which contain general information about the parsed segment
49 | REMOVE_TAGS_IN_ORIG_DIFF = True # try to remove leading tags from rest in parsed output to original difference
50 | REMOVE_SPACES_IN_ORIGIN_DIFF = True # removes all spaces from rest and comparison values because spaces are often a problem in subtracting the rests
51 | USE_DICTIONARIES_FOR_PERSON_PARSING = True # uses dictionaries for function and title for the parsing and better recognition of persons
52 |
53 |
54 | [Analysis Settings]
55 | LOG_PARSED_SEGMENTED_OUTPUT = True # logs the parsed results in a file for each segmentation tag
56 | LOG_SIMPLE = False # Just simple and fast logging (without table recognition)
57 | LOG_PARSED_TO_ORIG_DIFF_PER_CATEGORY = True # logs the difference of parsed result and original segmented output for specific category
58 | LOG_PARSED_TO_ORIG_ADD_OUTPUT_JSON = False # in above logging add the output-json to the diff files
59 | LOG_PARSED_TO_ORIG_DIFF_ACCUMULATED = True # creates an accumulated report for differences from parsed to segmented output for each folder/akf-year
60 | LOG_SEGMENTED_TO_ORIG_DIFF_PER_FILE = True # (needs ADD_FULLTEXT_ENTRY enabled) logs the difference of segmented result and original segmented output for specific file/akf-table
61 | LOG_SEGMENTED_TO_ORIG_ADD_OUTPUT_JSON = True # in above logging add the output-json to the diff files
62 | LOG_SEGMENTED_TO_ORIG_DIFF_ACCUMULATED = True # creates an accumulated report for differences from segmented to original output for each folder/akf-year
63 | JOIN_SEGMENTED_TEXTS_IN_ORIG_DIFF_PER_CATEGORY = True # the segmented texts get joined by algorithm which removes dashes and so on
64 |
65 |
66 | [Print and logging settings]
67 | PRINT_WARNING_LEVEL = True # print warnings independently of the per-class print settings
68 | PRINT_EXCEPTION_LEVEL = True # print exceptions independently of the per-class print settings
69 |
70 | PRINT_MAIN = True
71 | PRINT_FEATURE_EXTRACTOR = False
72 | PRINT_ADDITIONAL_INFO_HANDLER = True
73 | PRINT_SEGMENT_CLASSIFIER = True
74 | PRINT_SEGMENT_PARSER = True
75 | PRINT_SEGMENT_PARSER_AKF_FN_ONE = False # print parsing functions related to AKF (File one)
76 | PRINT_SEGMENT_PARSER_AKF_FN_TWO = False # print parsing functions related to AKF (File two)
77 | PRINT_SEGMENT_PARSER_AKF_FN_THREE = True # print parsing functions related to AKF (File three)
78 | PRINT_SEGMENT_PARSER_AKF_FN_TABLES_ONE = True # print parsing functions related to AKF (Table specific one)
79 | PRINT_OUTPUT_ANALYSIS = False
80 | PRINT_DICTIONARY_HANDLER = True # print output related to dictionary handler
--------------------------------------------------------------------------------
/configuration/config_parse_hocr_js.conf:
--------------------------------------------------------------------------------
1 |
2 | INPUT_FILETYPES = [hocr, untype]
3 | #INPUT_FILEGLOB = ./AKFII_ocromore_results_local/msa_best/**/*. # local test folder
4 | # INPUT_FILEGLOB = /media/johannes/AKFII/AKF/AKFII_ocromore_results/msa_best/**/*. # this is the hocr-output of ocromore
5 | INPUT_FILEGLOB = /media/sf_Transfer/AKFII_results/**/*.
6 | # INPUT_FILEGLOB = laptopdata/testfiles_hocr/**/*.
7 | # INPUT_FILEGLOB = /media/sf_ShareVB/many_years_firmprofiles_output/AKFII/long/**/*. # jk this is the hocr-output of ocromore
8 |
9 | USE_SNIPPET = True # Use the snippet tool for re-OCRing snippets of the original image
10 | IMAGE_PATH = /media/sf_ShareVB/ # Storing path
11 | DRAW_SEPARATOR = False # Save table cutouts with the separator drawn
12 | SAVE_SNIPPET = False # Use toolbox methods (requires tesseract and tesserocr to be installed)
13 | IMGPATH = ./img/ # ./ -> relative to input path
14 | OPATH = ./img/snippets/ # ./ -> relative to input path
15 |
16 | INPUT_TABLE_DICTIONARY = ./additionals/dictionaries/ # Path to dictionaries
17 | USE_TABLE_DICTIONARY = True # Use the dictionaries to correct, split and find the order level
18 |
19 | STORE_OCCURENCES = False # Store occurrences of item names (tables)
20 | OCCURENCES_TABLETYPE = datatable_income # Table type to store [datatable_income, datatable_balance]
21 |
22 | [Additional informations settings]
23 | ADDITIONAL_INFORMATION = True
24 | # INPUT_ADDINFOPATH = /media/sf_ShareVB/many_years_firmprofiles/additional_information/ #Additional information files
25 | INPUT_ADDINFOPATH = /media/sf_Transfer/additional_information/ #Additional information files
26 | INPUT_ADDINFOFILETPYE = json
27 | IDXCOL = ProfileID # Column name which is matched with the table name
28 | PARSE_COLS = [LABEL,ProfileID] # Columns which should be parsed to the add info
29 |
30 | TABLENAME_POS = 1 # in example '0585_...hocr'
31 | OCR_PROFILE_POS = 3 # in example: 'default'
32 | OCR_POS = 4 # in example: 'tess'
33 | DBPATH_POS = 2 # in example: '1969'
34 |
35 |
36 | OUTPUT_ROOT_PATH = ./output/
37 |
38 | [Segmentation settings]
39 | ADD_INFO_SIMPLIFIED_NAME_COMPARISON = True # in the additional info handler, simplify the table name comparison
40 | REMATCH_START_CONDITION_UNTIL_ZERO_ERROR = True
41 | MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION = True # do the index matching until the next start tag or, if defined, until the next explicitly recognized stop tag; if False, only start tags are set to the index field
42 | FILTER_UNCATEGORIES_OVERALL = True # filter the tags which are in known_uncategories in the accumulated segmentation report
43 | #todo add multimatch output in logging
44 | [Parsing settings]
45 | ADD_FULLTEXT_ENTRY = True # adds an entry at the start of json which contains the complete text to parse for verification
46 | ADD_ADDITIONAL_INFO = True # adds the additional information to the output file
47 | ADD_INFO_ENTRY_TO_OUTPUT = True # add entries to output, which contain general information about the parsed segment
48 | REMOVE_TAGS_IN_ORIG_DIFF = True # try to remove leading tags from rest in parsed output to original difference
49 | REMOVE_SPACES_IN_ORIGIN_DIFF = True # removes all spaces from rest and comparison values because spaces are often a problem in subtracting the rests
50 | USE_DICTIONARIES_FOR_PERSON_PARSING = True # uses dictionaries for function and title for the parsing and better recognition of persons
51 |
52 |
53 | [Analysis Settings]
54 | LOG_PARSED_SEGMENTED_OUTPUT = True # logs the parsed results in a file for each segmentation tag
55 | LOG_SIMPLE = True # Just simple and fast logging (without table recognition)
56 | LOG_PARSED_TO_ORIG_DIFF_PER_CATEGORY = True # logs the difference of parsed result and original segmented output for specific category
57 | LOG_PARSED_TO_ORIG_ADD_OUTPUT_JSON = False # in above logging add the output-json to the diff files
58 | LOG_PARSED_TO_ORIG_DIFF_ACCUMULATED = True # creates an accumulated report for differences from parsed to segmented output for each folder/akf-year
59 | LOG_SEGMENTED_TO_ORIG_DIFF_PER_FILE = True # (needs ADD_FULLTEXT_ENTRY enabled) logs the difference of segmented result and original segmented output for specific file/akf-table
60 | LOG_SEGMENTED_TO_ORIG_ADD_OUTPUT_JSON = True # in above logging add the output-json to the diff files
61 | LOG_SEGMENTED_TO_ORIG_DIFF_ACCUMULATED = True # creates an accumulated report for differences from segmented to original output for each folder/akf-year
62 | JOIN_SEGMENTED_TEXTS_IN_ORIG_DIFF_PER_CATEGORY = True # the segmented texts get joined by algorithm which removes dashes and so on
63 |
64 | [Print and logging settings]
65 | PRINT_WARNING_LEVEL = True # print warnings independently of the per-class print settings
66 | PRINT_EXCEPTION_LEVEL = True # print exceptions independently of the per-class print settings
67 |
68 | PRINT_MAIN = True
69 | PRINT_FEATURE_EXTRACTOR = False
70 | PRINT_ADDITIONAL_INFO_HANDLER = True
71 | PRINT_SEGMENT_CLASSIFIER = True
72 | PRINT_SEGMENT_PARSER = True
73 | PRINT_SEGMENT_PARSER_AKF_FN_ONE = False # print parsing functions related to AKF (File one)
74 | PRINT_SEGMENT_PARSER_AKF_FN_TWO = False # print parsing functions related to AKF (File two)
75 | PRINT_SEGMENT_PARSER_AKF_FN_THREE = True # print parsing functions related to AKF (File three)
76 | PRINT_SEGMENT_PARSER_AKF_FN_TABLES_ONE = True # print parsing functions related to AKF (Table specific one)
77 | PRINT_OUTPUT_ANALYSIS = False
78 | PRINT_DICTIONARY_HANDLER = True # print output related to dictionary handler
--------------------------------------------------------------------------------
/additionals/dictionaries/dictionary_balance.json:
--------------------------------------------------------------------------------
1 | {
2 | "Zusatz":
3 | {
4 | "Aktiva":"",
5 | "Passiva": "",
6 | "darunter": "",
7 | "Sonstige": "",
8 | "Sonstiges": "",
9 | "Sonst.": "",
10 | "Langfristige": "",
11 | "Langfr." : "",
12 | "Kurzfristige":"",
13 | "Durchlaufende": "",
14 | "dauernde": ""
15 | },
16 | "Hauptpunkte":{
17 | "Eigenkapital": "Eigenkapital",
18 | "Fremdkapital": "Fremdkapital",
19 | "Gewinn nach Vortrag": "Gewinn nach Vortrag",
20 | "Anlagevermögen": "Anlagevermögen",
21 | "Umlaufvermögen": "Umlaufvermögen",
22 | "Verlust ohne Vortrag": "Verlust ohne Vortrag",
23 | "Verlust nach Vortrag": "Verlust nach Vortrag",
24 | "Passiva Einlagen": "Passiva Einlagen",
25 | "Aufgenommene Gelder": "Aufgenommene Gelder",
26 | "Barreserve":"Barreserve",
27 | "Nostroguthaben":"Nostroguthaben",
28 | "Betriebserträge":"Betriebserträge",
29 | "Uraltguthaben": "Uraltguthaben",
30 | "Wertpapiere": "Wertpapiere",
31 | "Konsortialbeteiligungen": "Konsortialbeteiligungen",
32 | "Debitoren": "Debitoren",
33 | "Deckungsforderungen": "Deckungsforderungen",
34 | "Sonstige Aktiva": "Sonstige Aktiva",
35 | "Beteiligungen": "Beteiligungen",
36 | "Ausgleichsforderungen": "Ausgleichsforderungen",
37 | "Ausleihungen": "Ausleihungen",
38 | "Schuldverschreibungen": "Schuldverschreibungen",
39 | "Zinsen hierauf": "Zinsen hierauf",
40 | "Ausstehende Einlagen auf A.-K.": "Ausstehende Einlagen auf A.-K.",
41 | "Rückstellungen": "Rückstellungen",
42 | "Schuldner": "Schuldner",
43 | "Anzahlungen": "Anzahlungen",
44 | "Anlagen": "Anlagen",
45 | "Vorräte": "Vorräte",
46 | "Gläubiger": "Gläubiger",
47 | "Verbindlichkeiten": "Verbindlichkeiten",
48 | "Forderungen": "Forderungen",
49 | "Gewinn ohne Vortrag": "Gewinn ohne Vortrag",
50 | "Kapitalentwertungskonto": "Kapitalentwertungskonto",
51 | "Vermögen": "Vermögen",
52 | "Konzernunternehmen": "Konzernunternehmen",
53 | "Einlagen": "Einlagen",
54 | "Eigene Akzepte und Solawechsel": "Eigene Akzepte und Solawechsel",
55 | "Kapitalausgleichskonto": "Kapitalausgleichskonto",
56 | "Löhne und Gehälter": "Löhne und Gehälter",
57 | "Abschreibungen": "Abschreibungen",
58 | "Steuern": "Steuern",
59 | "Jahresertrag": "Jahresertrag",
60 | "Beteiligungserträge": "Beteiligungserträge",
61 | "Abwicklungsvermögen": "Abwicklungsvermögen",
62 | "Zinsen": "Zinsen",
63 | "Schuldverschreibungen im Umlauf": "Schuldverschreibungen im Umlauf",
64 | "Zinsen v. Ausleihungen": "Zinsen v. Ausleihungen",
65 | "Kapitalverlustkonto gemn. DMBG": "Kapitalverlustkonto gemn. DMBG",
66 | "ao. Kapitalentwertungskonto": "Kapitalentwertungsskonto",
67 | "Pensions-Rückstellungen": "Pensions-Rückstellungen",
68 | "Anleihen im Umlauf": "Anleihen im Umlauf",
69 | "Grundstücke und Gebäude": "Grundstücke und Gebäude",
70 | "Hypotheken u. Kommunaldarlehen": "Hypotheken u. Kommunaldarlehen",
71 | "Aufgenommene Darlehen": "Aufgenommene Darlehen",
72 | "Anlagewerte": "Anlagewerte",
73 | "Rechnungsabgrenzung": "Rechnungsabgrenzung",
74 | "Liquidationskapital": "Liquidationskapital",
75 | "Ao.Kap.-Entw.Konto": "Ao.Kap.-Entw.Konto",
76 | "Hypotheken und Darlehen": "Hypotheken und Darlehen",
77 | "Kommunaldarlehen": "Kommunaldarlehen",
78 | "Aufgenommene langfr.Darlehen": "Aufgenommen langfr. Darlehen",
79 | "Grundkapital u. ges.Rücklage": "Grundkapital u. ges. Rücklage",
80 | "Bilanzsumme": "Bilanzsumme",
81 | "Guthaben bei Kreditinstituten": "Guthaben bei Kreditinstituten",
82 | "Kredite": "Kredite",
83 | "langfristige Darlehen u.Anleihen": "langfristige Darlehen u.Anleihen",
84 | "lagen auf das Grundkapital": "Ausstehende Einlagen auf das Grundkapital",
85 | "Abschreibungen auf Anlagen":"Abschreibungen auf Anlagen",
86 | "Technische Rückstellungen":"Technische Rückstellungen",
87 | "Allgemeine Rückstellungen":"Allgemeine Rückstellungen",
88 | "Kapital": "Kapital",
89 | "Rücklagen": "Rücklagen",
90 | "Reingewinn": "Reingewinn",
91 | "Eigenmittel": "Eigenmittel",
92 | "Bilanzgewinn": "Bilanzgewinn",
93 | "Hauptpunkte":"Hauptpunkte",
94 | "Restliche Passiva":"Restliche Passiva",
95 | "Spareinlagen":"Spareinlagen",
96 | "Bilanzverlust":"Bilanzverlust",
97 | "Forderungen an Kreditinstitute":"Forderungen an Kreditinstitute",
98 | "Wertberichtigungen":"Wertberichtigungen",
99 | "Ausstehende Einlagen auf das Grundkapital":"Ausstehende Einlagen auf das Grundkapital",
100 | "Rückstellung für LAG-Vermögensabgabe": "Rückstellungen",
101 | "Rückstellung für Pensionsverpfl.": "Rückstellungen",
102 | "Gewinn einschl. Vortrag": "Gewinn nach Vortrag",
103 | "Entwertungskonto": "Entwertungskonto",
104 | "Kapitalverlustkonto": "Kapitalverlustkonto",
105 | "Bankguthaben": "Bankguthaben",
106 | "Vermögensunterdeckung":"Vermögensunterdeckung",
107 | "Versicherungstechnische Rückstellungen":"Versicherungstechnische Rückstellungen",
108 | "Wertpapieranlagen": "Wertpapieranlagen",
109 | "Nichtversicherungstechnische Rückstellungen":"Nichtversicherungstechnische Rückstellungen",
110 | "Grunkapital":"Grundkapital",
111 | "Einbehaltene Gewinne":"Einbehaltene Gewinne",
112 | "ohne Vortrag": "ohne Vortrag"
113 | },
114 | "Unterpunkte":{
115 | "davon A.-K.": "davon A.-K.",
116 | "davon AK": "davon A.-K.",
117 | "Vorräte": "Vorräte",
118 | "Lieferforderungen": "Lieferforderungen",
119 | "Barmittel": "Barmittel",
120 | "Barmittel einschl. Wertpapiere": "Barmittel",
121 | "Flüssige Mittel": "Flüssige Mittel",
122 | "Beteiligungen": "Beteiligungen",
123 | "Grundkapital": "Grundkapital",
124 | "Aktien und Beteiligungen": "Aktien und Beteiligungen",
125 | "Betriebsstoffe und Waren": "Betriebsstoffe und Waren",
126 | "Forderungen aus Mieten": "Forderungen aus Mieten"
127 | }
128 | }
--------------------------------------------------------------------------------
/main_start.py:
--------------------------------------------------------------------------------
1 | # custom imports
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from akf_corelib.database_handler import DatabaseHandler
4 | from akf_corelib.conditional_print import ConditionalPrint
5 | from lib.dictionary_handler import DictionaryHandler
6 | from lib.feature_extractor import FeatureExtractor
7 | from lib.segment_classifier import SegmentClassifier
8 | from lib.segment_parser import SegmentParser
9 | from lib.output_analysis import OutputAnalysis
10 | from lib.additional_info_handler import AdditionalInfoHandler
11 |
12 | # load configuration and printer
13 | CODED_CONFIGURATION_PATH = './configuration/config_parse_hocr_js.conf'
14 | config_handler = ConfigurationHandler(first_init=True, fill_unkown_args=True, \
15 | coded_configuration_paths=[CODED_CONFIGURATION_PATH])
16 | config = config_handler.get_config()
17 | cpr = ConditionalPrint(config.PRINT_MAIN, config.PRINT_EXCEPTION_LEVEL,
18 | config.PRINT_WARNING_LEVEL, leading_tag="main_start")
19 |
20 | # Basic steps:
21 | feature_extractor = FeatureExtractor()
22 | add_info_handler = AdditionalInfoHandler()
23 | dictionary_handler = DictionaryHandler()
24 | segment_classifier = SegmentClassifier()
25 | output_analyzer = OutputAnalysis()
26 | segment_parser = SegmentParser(output_analyzer, dictionary_handler)
27 |
28 |
29 | dh = DatabaseHandler(dbdir="")
30 | dh.set_dirpos(tablename_pos=config.TABLENAME_POS,ocr_profile_pos=config.OCR_PROFILE_POS,\
31 | ocr_pos=config.OCR_POS, dbname_pos=config.DBPATH_POS)
32 |
33 | dh.fetch_files(config.INPUT_FILEGLOB, config.INPUT_FILETYPES)
34 | # get the files list (grouped by year key)
35 | hocr_files = dh.get_files()
36 |
37 | accumulated_tags = {}
38 |
39 | # main iteration loop
40 | for key in hocr_files:
41 | #if "1956" not in key:
42 | # continue
43 | int_key = int(key)
44 |     if int_key != 1973:  # debug filter: only process the 1973 volume
45 | continue
46 |
47 | accumulated_diff_info = output_analyzer.AccumulatedInfo()
48 | accumulated_diff_info_categories = {}
49 | accumulated_diff_info_orig_to_segment = {}
50 |
51 | ocromore_data = None
52 | ctr_test = 1
53 |
54 | my_list = hocr_files[key]
55 | for file in my_list:
56 | #if "msa_best" not in file.ocr_profile:
57 | # continue
58 |
59 | # only check files which are relevant (comment out if not used)
60 | # Sitz ok: 72, 207,671, 731, 733
61 | # Sitz faulty: 270,454
62 | if ctr_test not in [151]:
63 | ctr_test += 1
64 | continue
65 |
66 | #split = file.name.split("_")
67 | #if int(split[1]) < 1968:
68 | # continue
69 | #if int(split[0])<300:
70 | # continue
71 | #if not "_1956" in file.name:
72 | # continue
73 | # fetch additional information for current file (if toggled in info)
74 | additional_info = add_info_handler.fetch_additional_information_simple(file)
75 |
76 | # fetch basic data for current file
77 | ocromore_data = dh.fetch_ocromore_data(file, additional_info=additional_info)
78 | output_analyzer.set_current_data(ocromore_data) # prepare output analyzer
79 |
80 | cpr.print("Checking file:", ocromore_data['file_info'].path)
81 |
82 | # extract features from basic data
83 | ocromore_data = feature_extractor.extract_file_features(ocromore_data)
84 | # line segmentation
85 | ocromore_data = segment_classifier.classify_file_segments(ocromore_data)
86 | # segment parsing
87 | ocromore_data = segment_parser.parse_segments(ocromore_data)
88 | # output file synthesis
89 | segment_parser.write_result_to_output(True, ocromore_data)
90 | # todo
91 | # output analysis steps
92 | output_analyzer.log_segmentation_simple(ocromore_data) # log the recognized segmentation
93 | output_analyzer.log_parsed_output(ocromore_data) # log the parsed segments into tag-based files
94 | diff_info_orig_to_segment = output_analyzer.log_original_to_segment_diff(ocromore_data, use_delimiters=False) # log the difference of segmented data to original data
95 | diff_info_categories = output_analyzer.log_segmentation_diff_orig_to_parsed_output(ocromore_data) # log the segmentation
96 | diff_info = output_analyzer.log_unsegmentated(ocromore_data)
97 | accumulated_diff_info_categories = \
98 | output_analyzer.accumulate_diff_info_output_to_orig(diff_info_categories, accumulated_diff_info_categories)
99 | accumulated_diff_info_orig_to_segment = \
100 | output_analyzer.accumulate_diff_info_orig_to_segmentation(diff_info_orig_to_segment, accumulated_diff_info_orig_to_segment)
101 |
102 | accumulated_diff_info = output_analyzer.accumulate_diff_info(ocromore_data, diff_info, accumulated_diff_info)
103 | accumulated_tags = output_analyzer.log_tags(ocromore_data, accumulated_tags)
104 | ctr_test += 1
105 |
106 | if ctr_test >= 30:
107 | break
108 |
109 | # clear the current result in segment_parser cache to parse the next one
110 | segment_parser.clear_result(output_analyzer, dictionary_handler)
111 |
112 | # output analysis:
113 | # print diff info for this year (accumulated over all tables/year)
114 | output_analyzer.log_accumulated_unsegmentated(accumulated_diff_info, ocromore_data)
115 | # print the amount of chars which is left for each category after parsing for this year
116 | output_analyzer.log_accumulated_orig_to_parsed_output(accumulated_diff_info_categories, ocromore_data)
117 | # print diff info for this year between original and segmentation
118 | output_analyzer.log_accumulated_orig_to_segment(accumulated_diff_info_orig_to_segment, ocromore_data)
119 |
120 |
121 | output_analyzer.log_accumulated_tags(accumulated_tags)
122 |
--------------------------------------------------------------------------------
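Stripped of the year and file-counter debug filters, the per-file pipeline of
main_start.py reduces to the following sketch (same objects and calls as above,
condensed for readability):

for key in hocr_files:
    for file in hocr_files[key]:
        # fetch optional per-file information and the basic hocr data
        additional_info = add_info_handler.fetch_additional_information_simple(file)
        ocromore_data = dh.fetch_ocromore_data(file, additional_info=additional_info)
        output_analyzer.set_current_data(ocromore_data)

        ocromore_data = feature_extractor.extract_file_features(ocromore_data)    # line features
        ocromore_data = segment_classifier.classify_file_segments(ocromore_data)  # segmentation
        ocromore_data = segment_parser.parse_segments(ocromore_data)              # parsing
        segment_parser.write_result_to_output(True, ocromore_data)                # JSON output

        # reset the end-object factory before parsing the next file
        segment_parser.clear_result(output_analyzer, dictionary_handler)

--------------------------------------------------------------------------------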
/parser.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Rafa Haro '
2 |
3 | from abc import ABCMeta, abstractmethod
4 | from bs4 import BeautifulSoup
5 | import re
6 |
7 |
8 | class HOCRElement:
9 |
10 | __metaclass__ = ABCMeta
11 |
12 |     COORDINATES_PATTERN = re.compile(r"bbox\s(-?[0-9]+)\s(-?[0-9]+)\s(-?[0-9]+)\s(-?[0-9]+)")
13 |
14 | def __init__(self, hocr_html, parent, next_tag, next_attribute, next_class):
15 | self.__coordinates = (0, 0, 0, 0)
16 | self._hocr_html = hocr_html
17 | self._id = None
18 | self._parent = parent
19 | self._elements = self._parse(next_tag, next_attribute, next_class)
20 |
21 |     def _parse(self, next_tag, next_attribute, next_class):
22 |
23 | try:
24 | self._id = self._hocr_html['id']
25 | except KeyError:
26 | self._id = None
27 |
28 | try:
29 | title = self._hocr_html['title']
30 | match = HOCRElement.COORDINATES_PATTERN.search(title)
31 | if match:
32 | self.__coordinates = (int(match.group(1)),
33 | int(match.group(2)),
34 | int(match.group(3)),
35 | int(match.group(4)))
36 | else:
37 | raise ValueError("The HOCR element doesn't contain a valid title property")
38 | except KeyError:
39 | self.__coordinates = (0, 0, 0, 0)
40 |
41 | elements = []
42 | if next_tag is not None and next_class is not None:
43 |             for html_element in self._hocr_html.find_all(next_tag, {'class': next_attribute}):
44 | elements.append(next_class(self, html_element))
45 | return elements
46 |
47 | @property
48 | def coordinates(self):
49 | return self.__coordinates
50 |
51 | @property
52 | def html(self):
53 | return self._hocr_html.prettify()
54 |
55 | @property
56 | def id(self):
57 | return self._id
58 |
59 | @property
60 | def parent(self):
61 | return self._parent
62 |
63 | def __hash__(self):
64 | return hash(self._id)
65 |
66 | def __eq__(self, other):
67 | if not isinstance(other, HOCRElement):
68 | return False
69 | else:
70 | return self._id == other._id
71 |
72 | @property
73 | @abstractmethod
74 | def ocr_text(self):
75 | pass
76 |
77 | class HOCRDocument(HOCRElement):
78 |
79 | def __init__(self, source, is_path=False):
80 |
81 | if not is_path:
82 | hocr_html = BeautifulSoup(source, 'html.parser')
83 | else:
84 |             with open(source, 'r', encoding="utf-8") as source_file:
85 |                 hocr_html = BeautifulSoup(source_file.read(), 'html.parser')
86 | super(HOCRDocument, self).__init__(hocr_html, None, 'div', Page.HOCR_PAGE_TAG, Page)
87 |
88 | @property
89 | def ocr_text(self):
90 | output = ""
91 | for element in self._elements[:-1]:
92 | output += element.ocr_text
93 | output += "\n\n"
94 | output += self._elements[-1].ocr_text
95 | return output
96 |
97 | @property
98 | def pages(self):
99 | return self._elements
100 |
101 | @property
102 | def npages(self):
103 | return len(self._elements)
104 |
105 | @property
106 | def ocr(self):
107 | for tag in self._hocr_html.find_all("meta"):
108 | if "esseract" in tag.get("content",None):
109 | return "Tess"
110 | if "cropy" in tag.get("content",None):
111 | return "Ocro"
112 | if "ABBYY" in tag.get("content",None):
113 | return "Abbyy"
114 | return "Abbyy"
115 |
116 | class Page(HOCRElement):
117 |
118 | HOCR_PAGE_TAG = "ocr_page"
119 |
120 | def __init__(self, parent, hocr_html):
121 | super(Page, self).__init__(hocr_html, parent, 'div', Area.HOCR_AREA_TAG, Area)
122 |
123 | @property
124 | def ocr_text(self):
125 | output = ""
126 | for element in self._elements[:-1]:
127 | output += element.ocr_text
128 | output += "\n\n"
129 | output += self._elements[-1].ocr_text
130 | return output
131 |
132 | @property
133 | def areas(self):
134 | return self._elements
135 |
136 | @property
137 | def nareas(self):
138 | return len(self._elements)
139 |
140 | class Area(HOCRElement):
141 |
142 | HOCR_AREA_TAG = "ocr_carea"
143 |
144 | def __init__(self, parent, hocr_html):
145 | super(Area, self).__init__(hocr_html, parent, 'p', Paragraph.HOCR_PAR_TAG, Paragraph)
146 |
147 | @property
148 | def paragraphs(self):
149 | return self._elements
150 |
151 | @property
152 | def nparagraphs(self):
153 | return len(self._elements)
154 |
155 | @property
156 | def ocr_text(self):
157 | output = ""
158 | for element in self._elements[:-1]:
159 | output += element.ocr_text
160 | output += "\n"
161 | output += self._elements[-1].ocr_text
162 | return output
163 |
164 | class Paragraph(HOCRElement):
165 |
166 | HOCR_PAR_TAG = "ocr_par"
167 |
168 | def __init__(self, parent, hocr_html):
169 | super(Paragraph, self).__init__(hocr_html, parent, 'span', Line.HOCR_LINE_TAG, Line)
170 |
171 | @property
172 | def lines(self):
173 | return self._elements
174 |
175 | @property
176 | def nlines(self):
177 | return len(self._elements)
178 |
179 | @property
180 | def ocr_text(self):
181 | output = ""
182 | for element in self._elements[:-1]:
183 | output += element.ocr_text
184 | output += "\n"
185 | output += self._elements[-1].ocr_text
186 | return output
187 |
188 | class Line(HOCRElement):
189 |
190 | HOCR_LINE_TAG = "ocr_line"
191 |
192 | def __init__(self, parent, hocr_html):
193 | super(Line, self).__init__(hocr_html, parent, 'span', Word.HOCR_WORD_TAG, Word)
194 | self._ocr_text_normalized = None # custom property, none if not assigned
195 |
196 |
197 | @property
198 | def words(self):
199 | return self._elements
200 |
201 | @property
202 | def nwords(self):
203 | return len(self._elements)
204 |
205 | @property
206 | def ocr_text(self):
207 | output = ""
208 | for element in self._elements[:-1]:
209 | output += element.ocr_text
210 | output += " "
211 | output += self._elements[-1].ocr_text
212 | return output
213 |
214 | @property
215 | def ocr_text_normalized(self):
216 | return self._ocr_text_normalized
217 |
218 | @ocr_text_normalized.setter
219 | def ocr_text_normalized(self, new_text):
220 | self._ocr_text_normalized = new_text
221 |
222 | class Word(HOCRElement):
223 |
224 | HOCR_WORD_TAG = "ocrx_word"
225 | _xwconf = None
226 | _xconfs = None
227 |
228 | def __init__(self, parent, hocr_html):
229 | super(Word, self).__init__(hocr_html, parent, None, None, None)
230 | title = hocr_html.attrs['title']
231 | titlesplit = title.split(';')
232 | for element in titlesplit:
233 | if 'x_wconf' in element:
234 | self._xwconf = element.strip().split(' ')[1]
235 | if "x_confs" in element:
236 | self._xconfs = element.strip().split(' ')[1:]
237 | break
238 |
239 |
240 | @property
241 | def ocr_text(self):
242 | word = self._hocr_html.string
243 | if word is not None:
244 | return word
245 | else:
246 | return ""
--------------------------------------------------------------------------------
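A minimal usage sketch for the hOCR object model above ("page.hocr" is a
hypothetical input file; the repository root is assumed to be on sys.path):

from parser import HOCRDocument

document = HOCRDocument("page.hocr", is_path=True)
print(document.ocr)     # heuristic engine guess: "Tess", "Ocro" or "Abbyy"
print(document.npages)
for page in document.pages:
    for area in page.areas:
        for paragraph in area.paragraphs:
            for line in paragraph.lines:
                # bounding box as (x0, y0, x1, y1) plus the joined word texts
                print(line.coordinates, line.ocr_text)

--------------------------------------------------------------------------------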
/lib/akf_parsing_functions_jk.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from .akf_parsing_functions_common import AKFCommonParsingFunctions as cf
4 | from lib.table_parser import Datatable, Sharetable, Dividendtable
5 | import time
6 |
7 | def timeit(method):
8 | def timed(*args, **kw):
9 | ts = time.time()
10 | result = method(*args, **kw)
11 | te = time.time()
12 |
13 | if 'log_time' in kw:
14 | name = kw.get('log_name', method.__name__.upper())
15 | kw['log_time'][name] = int((te - ts) * 1000)
16 | else:
17 | print('%r %2.2f ms' % \
18 | (method.__name__, (te - ts) * 1000))
19 | return result
20 |
21 | return timed
22 |
23 | class AkfParsingFunctionsJK(object):
24 |
25 | def __init__(self, endobject_factory, output_analyzer, dictionary_handler, ocromore_data=None):
26 | config_handler = ConfigurationHandler(first_init=False)
27 |
28 | self.config = config_handler.get_config()
29 | self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_THREE, self.config.PRINT_EXCEPTION_LEVEL,
30 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
31 |
32 | self.cpr.print("init akf parsing functions three")
33 |
34 | self.ef = endobject_factory
35 | self.output_analyzer = output_analyzer
36 | self.ocromore_data = ocromore_data
37 | self.dictionary_handler = dictionary_handler
38 |
39 | def parse_bilanzen(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
40 | # get basic data
41 | element_counter = 0
42 | origpost, origpost_red, element_counter, content_texts = \
43 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
44 |
45 | # logme
46 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
47 |
48 | # init
49 | only_add_if_string = True
50 | if self.config.LOG_SIMPLE:
51 | geschaeftslage = origpost_red.replace("- ", "")
52 |
53 | #parsing
54 | self.ef.add_to_my_obj("balances", geschaeftslage, object_number=element_counter,only_filled=only_add_if_string)
55 | return True
56 | #parsing
57 | table = Datatable(snippet=segmentation_class.snippet)
58 | table.analyse_structure(content_lines,feature_lines, template="datatable_balance")
59 | table.extract_content(content_lines, feature_lines, template="datatable_balance")
60 |
61 | # Write information for income table parsing
62 | segmentation_class.info_handler["income"] = {}
63 | segmentation_class.info_handler["income"]["amount"] = table.info.amount
64 | segmentation_class.info_handler["income"]["col"] = table.info.col
65 | segmentation_class.info_handler["income"]["separator"] = table.info.separator
66 |
67 |         # Parse the tables based on whitespace and the count of numbers in each group
68 |         # This should be the last resort for parsing (error-prone)
69 | self.ef.add_to_my_obj("balances", table.content, object_number=element_counter,only_filled=only_add_if_string)
70 |
71 | def parse_gewinn_und_verlust(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
72 | # get basic data
73 | element_counter = 0
74 | origpost, origpost_red, element_counter, content_texts = \
75 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
76 |
77 | # logme
78 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
79 |
80 | # init
81 | only_add_if_string = True
82 | if self.config.LOG_SIMPLE:
83 | geschaeftslage = origpost_red.replace("- ", "")
84 |
85 | #parsing
86 | self.ef.add_to_my_obj("income", geschaeftslage, object_number=element_counter,only_filled=only_add_if_string)
87 | return True
88 |
89 | # parsing
90 | table = Datatable(snippet=segmentation_class.snippet)
91 | table.analyse_structure(content_lines, feature_lines, template="datatable_income")
92 | if segmentation_class.info_handler and "income" in set(segmentation_class.info_handler.keys()):
93 | table.info.col = segmentation_class.info_handler["income"]["col"]
94 | table.info.amount = segmentation_class.info_handler["income"]["amount"]
95 | table.info.separator = segmentation_class.info_handler["income"]["separator"]
96 |
97 | table.extract_content(content_lines, feature_lines, template="datatable_income")
98 |
99 |
100 | #parsing
101 | self.ef.add_to_my_obj("income", table.content, object_number=element_counter,
102 | only_filled=only_add_if_string)
103 |
104 | def parse_aktienkurse(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
105 | # get basic data
106 | element_counter = 0
107 | origpost, origpost_red, element_counter, content_texts = \
108 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
109 |
110 | # logme
111 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
112 |
113 | # init
114 | only_add_if_string = True
115 | #self.config.LOG_SIMPLE = False
116 | if self.config.LOG_SIMPLE:
117 | # self.config.LOG_SIMPLE = False
118 | skip = origpost_red.replace("- ", "")
119 |
120 | # parsing
121 | self.ef.add_to_my_obj("shares", skip, object_number=element_counter,
122 | only_filled=only_add_if_string)
123 | return True
124 |
125 | # parsing
126 | table = Sharetable(snippet=segmentation_class.snippet)
127 | table.analyse_structure(content_lines, feature_lines)
128 | table.extract_content(content_lines, feature_lines)
129 | #from timeit import timeit
130 | #print(timeit(test))
131 | # parsing
132 | self.ef.add_to_my_obj("shares", table.content, object_number=element_counter,
133 | only_filled=only_add_if_string)
134 |
135 | def parse_dividend(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
136 | # get basic data
137 | element_counter = 0
138 | origpost, origpost_red, element_counter, content_texts = \
139 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
140 |
141 | # logme
142 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
143 |
144 | # init
145 | only_add_if_string = True
146 | # self.config.LOG_SIMPLE = True
147 | if self.config.LOG_SIMPLE:
148 | # self.config.LOG_SIMPLE = False
149 | skip = origpost_red.replace("- ", "")
150 |
151 | # parsing
152 | self.ef.add_to_my_obj("dividende", skip, object_number=element_counter,
153 | only_filled=only_add_if_string)
154 | return True
155 |
156 | # parsing
157 | table = Dividendtable(snippet=segmentation_class.snippet)
158 | table.analyse_structure(content_lines, feature_lines)
159 | table.extract_content(content_lines, feature_lines)
160 | # from timeit import timeit
161 | # print(timeit(test))
162 | # parsing
163 | self.ef.add_to_my_obj("dividende", table.content, object_number=element_counter,
164 | only_filled=only_add_if_string)
165 |
--------------------------------------------------------------------------------
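The @timeit decorator above forwards its keyword arguments to the wrapped
function, so a function timed with log_time must tolerate the extra kwargs.
A minimal sketch (slow_parse and its timings are hypothetical):

import time
from lib.akf_parsing_functions_jk import timeit

@timeit
def slow_parse(**kw):
    time.sleep(0.05)

slow_parse()  # no log_time given: prints something like 'slow_parse' 50.12 ms

timings = {}
slow_parse(log_time=timings, log_name="PARSE")  # records instead of printing
print(timings)  # e.g. {'PARSE': 50}

--------------------------------------------------------------------------------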
/lib/feature_extractor.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from akf_corelib.random import Random
4 |
5 | import numpy as np
6 |
7 |
8 | class LineFeatures():
9 | counter_special_chars = -1
10 | counter_alphanumerical_chars = -1
11 | counter_numbers = -1
12 | counter_chars = -1
13 | counter_alphabetical = -1
14 | counter_words = -1
15 | counter_spaces = -1
16 | counters_alphabetical_ratios = []
17 | counters_wordlengths = []
18 | counters_numbers = []
19 | special_chars_ratio = -1
20 | alphanumerical_chars_ratio = -1
21 | alphabetical_ratio = -1
22 | spaces_ratio = -1
23 | numbers_ratio = -1
24 |
25 | x_box_sizes = []
26 | x_gaps = []
27 |
28 | maximum_x_gap = None
29 | mean_x_gap = None
30 | median_x_gap = None
31 |
32 | many_numbers_in_first_word = False
33 | many_alphabetical_in_middle_words = False
34 | many_alphabetical_in_last_word = False
35 |
36 | def __init__(self, cpr):
37 | self.cpr = cpr
38 |
39 | def print_me(self):
40 | self.cpr.print("alle cntr:", self.counter_chars)
41 | self.cpr.print("spec cntr:", self.counter_special_chars, "ratio", self.special_chars_ratio)
42 | self.cpr.print("alnr cntr:", self.counter_alphanumerical_chars, "ratio", self.alphanumerical_chars_ratio)
43 | self.cpr.print("albt cntr:", self.counter_alphabetical, "ratio", self.alphabetical_ratio)
44 | self.cpr.print("spce cntr:", self.counter_spaces, "ratio", self.spaces_ratio)
45 | self.cpr.print("nmbr cntr:", self.counter_numbers, "ratio", self.numbers_ratio)
46 | self.cpr.print("x_box_sizes", self.x_box_sizes)
47 | self.cpr.print("x_gaps", self.x_gaps)
48 | self.cpr.print("x_gap_max_size", self.maximum_x_gap)
49 | self.cpr.print("x_gaps_mean", self.mean_x_gap)
50 | self.cpr.print("x_gaps_median", self.median_x_gap)
51 |
52 | class FeatureExtractor():
53 |
54 | def __init__(self):
55 | config_handler = ConfigurationHandler(first_init=False)
56 |
57 | self.config = config_handler.get_config()
58 | self.cpr = ConditionalPrint(self.config.PRINT_FEATURE_EXTRACTOR, self.config.PRINT_EXCEPTION_LEVEL,
59 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__ )
60 |
61 | self.filter_start_words = ["Fernruf:", "Vorstand:", "Fernschreiber:",
62 | "von","Gründung:", "Ordnungsnr.", "Ordnungsnr",
63 | "Grundkapital:","Umstellung"]
64 |
65 |
66 | def extract_file_features(self, ocromore_data):
67 | all_line_features = []
68 | for line in ocromore_data['lines']:
69 | current_line_features = self.extract_line_features(line)
70 | all_line_features.append(current_line_features)
71 |
72 | ocromore_data['line_features'] = all_line_features
73 |
74 | return ocromore_data
75 |
76 |
77 | def extract_line_features(self, line):
78 |
79 | final_line_features = {}
80 |
81 | whole_text = line['text']
82 |
83 | self.cpr.print("recognizing text:", whole_text)
84 |
85 | # counters
86 | counter_special_chars = 0
87 | counter_alphanumerical_chars = 0
88 | counter_numbers = 0
89 | counter_chars = len(whole_text)
90 | counter_alphabetical = 0
91 | counter_words = 0
92 | counters_alphabetical_ratios = []
93 | counters_wordlengths = []
94 | counters_numbers = []
95 |
96 | character_index = 0
97 | # special conditions
98 | ultimo_is_first_word = False
99 | first_word_no_table_indicator = False
100 | starts_with_parenthesis = False
101 | ends_with_parenthesis = False
102 |
103 | last_xstop = 0
104 | x_box_sizes = []
105 | x_gaps = []
106 | for word_obj in line['words']:
107 | word_index = word_obj['word_index']
108 | word_text = word_obj['text']
109 | hocr_coordinates = word_obj['hocr_coordinates']
110 |
111 | word_xstart = hocr_coordinates[0]
112 | word_xstop = hocr_coordinates[2]
113 | word_box_size = word_xstop - word_xstart
114 | x_box_sizes.append(word_box_size)
115 |
116 | if word_index >= 1:
117 |                 x_gap = word_xstop - last_xstop  # stop-to-stop distance (gap plus the word's own width)
118 | x_gaps.append(x_gap)
119 |
120 | #line.data['word_x0']
121 | if word_text is None or word_text == "":
122 | continue
123 |
124 | if word_index == 0:
125 | if word_text in self.filter_start_words:
126 | first_word_no_table_indicator = True
127 | if word_text.lower() == "ultimo":
128 | ultimo_is_first_word = True
129 | if word_text[0] == "(":
130 | starts_with_parenthesis = True
131 |
132 |
133 |             if word_index == len(line['words']) - 1:  # last word of the line
134 | if word_text[-1] == ")":
135 | ends_with_parenthesis = True
136 |
137 |
138 |
139 | counter_alphabetical_chars_word = 0
140 | counter_alphanumerical_chars_word = 0
141 | counter_numbers_word = 0
142 |
143 |
144 | counter_words += 1
145 |
146 | word_list = list(word_text)
147 | for char in word_list:
148 | if Random.is_special_character(char):
149 | counter_special_chars += 1
150 | elif Random.is_alphanumerical_character(char):
151 | counter_alphanumerical_chars += 1
152 | counter_alphanumerical_chars_word += 1
153 | if char.isdigit():
154 | counter_numbers += 1
155 | counter_numbers_word += 1
156 |
157 | counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
158 | ratio_alphabetical_word = np.round(counter_alphabetical_word/len(word_text), 2)
159 | counters_alphabetical_ratios.append(ratio_alphabetical_word)
160 | counters_wordlengths.append(len(word_text))
161 | counters_numbers.append(counter_numbers_word)
162 | character_index += len(word_text)
163 | last_xstop = word_xstop
164 |
165 |
166 | # get number of spaces
167 | len_whole_unspace = len(whole_text.replace(" ", ""))
168 | counter_spaces = counter_chars - len_whole_unspace
169 | # set alphabetical counter
170 | counter_alphabetical = counter_alphanumerical_chars - counter_numbers
171 |
172 |
173 | if counter_chars == 0:
174 | self.cpr.printw("no chars in line:", str(line['line_index']),"no features here")
175 | return False
176 |
177 |         special_chars_ratio = counter_special_chars / counter_chars
178 |         alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
179 |         alphabetical_ratio = counter_alphabetical / counter_chars
180 |         spaces_ratio = counter_spaces / counter_chars
181 |         numbers_ratio = counter_numbers / counter_chars
182 |
183 |
184 | maximum_x_gap = None
185 | mean_x_gap = None
186 | median_x_gap = None
187 |
188 | if len(x_gaps) >= 1:
189 | maximum_x_gap = max(x_gaps)
190 | mean_x_gap = np.mean(x_gaps)
191 | median_x_gap = np.median(x_gaps)
192 |
193 | many_numbers_in_first_word = False
194 | many_alphabetical_in_middle_words = False
195 | many_alphabetical_in_last_word = False
196 |
197 | # check some middle and last word conditions
198 | for counter_index, counter in enumerate(counters_wordlengths):
199 | if counter_index == 0:
200 | ctr_numbers = counters_numbers[counter_index]
201 | numbers_ratio_word = np.round(ctr_numbers/counter,2)
202 | if numbers_ratio_word > 0.8:
203 | many_numbers_in_first_word = True
204 | elif counter_index == len(counters_wordlengths)-1:
205 | if counter >= 4:
206 | alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
207 | if alphabetical_ratio_word >= 0.75:
208 | many_alphabetical_in_last_word = True
209 |
210 | else:
211 | if counter >= 4:
212 | alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
213 | if alphabetical_ratio_word >= 0.75:
214 | many_alphabetical_in_middle_words = True
215 |
216 |
217 |
218 |
219 |
220 | final_line_features = LineFeatures(cpr=self.cpr)
221 | final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word
222 |
223 | final_line_features.counter_special_chars = counter_special_chars
224 | final_line_features.counter_chars = counter_chars
225 | final_line_features.counter_spaces = counter_spaces
226 | final_line_features.counter_numbers = counter_numbers
227 | final_line_features.counter_alphabetical = counter_alphabetical
228 | final_line_features.counter_alphanumerical_chars = counter_alphanumerical_chars
229 | final_line_features.counter_words = counter_words
230 |
231 | final_line_features.counters_numbers = counters_numbers
232 | final_line_features.counters_wordlengths = counters_wordlengths
233 | final_line_features.counters_alphabetical_ratios = counters_alphabetical_ratios
234 |
235 | final_line_features.numbers_ratio = numbers_ratio
236 | final_line_features.alphabetical_ratio = alphabetical_ratio
237 | final_line_features.alphanumerical_chars_ratio = alphanumerical_chars_ratio
238 | final_line_features.special_chars_ratio = special_chars_ratio
239 | final_line_features.spaces_ratio = spaces_ratio
240 |
241 | final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word
242 | final_line_features.many_alphabetical_in_middle_words = many_alphabetical_in_middle_words
243 | final_line_features.many_numbers_in_first_word = many_numbers_in_first_word
244 | final_line_features.x_box_sizes = x_box_sizes
245 | final_line_features.x_gaps = x_gaps
246 |
247 | final_line_features.maximum_x_gap = maximum_x_gap
248 | final_line_features.mean_x_gap = mean_x_gap
249 | final_line_features.median_x_gap = median_x_gap
250 |
251 |
252 |
253 | return final_line_features
--------------------------------------------------------------------------------
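A sketch of the line structure extract_line_features() consumes (all values are
hypothetical; hocr_coordinates are (x0, y0, x1, y1) boxes, and a
ConfigurationHandler must already be initialized, as in main_start.py):

from lib.feature_extractor import FeatureExtractor

line = {
    'line_index': 0,
    'text': 'Bilanzsumme 1234 5678',
    'words': [
        {'word_index': 0, 'text': 'Bilanzsumme', 'hocr_coordinates': (100, 50, 380, 70)},
        {'word_index': 1, 'text': '1234', 'hocr_coordinates': (520, 50, 600, 70)},
        {'word_index': 2, 'text': '5678', 'hocr_coordinates': (760, 50, 840, 70)},
    ],
}

extractor = FeatureExtractor()
features = extractor.extract_line_features(line)
features.print_me()  # dumps counters, ratios and x-gap statistics

--------------------------------------------------------------------------------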
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2017 Universitätsbibliothek Mannheim
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/lib/segment_parser.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from .akf_parsing_functions_one import AkfParsingFunctionsOne
4 | from .akf_parsing_functions_two import AkfParsingFunctionsTwo
5 | from .akf_parsing_functions_three import AkfParsingFunctionsThree
6 | from .akf_parsing_functions_jk import AkfParsingFunctionsJK
7 |
8 | from .akf_parsing_functions_tables_one import AkfParsingFunctionsTablesOne
9 |
10 | from .data_helper import DataHelper
11 | from .segment_parser_endobject_factory import EndobjectFactory
12 | from lib.data_helper import DataHelper as dh
13 | from lib.snippet_ocr import Snippet
14 | import glob
15 | import os
16 |
17 |
18 | class FunctionMapAKF(object):
19 | """
20 |     This is a holder class which maps segment
21 |     tags to parsing functions (here for the AKF project);
22 |     it can be swapped out for other projects
23 | """
24 |
25 | def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
26 | self.ef = endobject_factory
27 | self.akf_one = AkfParsingFunctionsOne(endobject_factory, output_analyzer, dictionary_handler)
28 | self.akf_two = AkfParsingFunctionsTwo(endobject_factory, output_analyzer, dictionary_handler)
29 | self.akf_three = AkfParsingFunctionsThree(endobject_factory, output_analyzer, dictionary_handler)
30 | self.akf_jk = AkfParsingFunctionsJK(endobject_factory, output_analyzer, dictionary_handler)
31 |
32 | self.akf_tables_one = AkfParsingFunctionsTablesOne(endobject_factory, output_analyzer, dictionary_handler)
33 |
34 | # for the keys use the keys from 'akf_segment_holder' or similar
35 |
36 | self.function_map = {
37 | "Firmenname": self.akf_one.parse_firmenname,
38 | "Sitz": self.akf_one.parse_sitz,
39 | "Verwaltung": self.akf_one.parse_verwaltung,
40 | "Telefon/Fernruf": self.akf_one.parse_telefon_fernruf,
41 | "Vorstand": self.akf_one.parse_vorstand,
42 | "Aufsichtsrat": self.akf_one.parse_aufsichtsrat,
43 | "Gründung": self.akf_one.parse_gruendung,
44 | "Arbeitnehmervertreter": self.akf_one.parse_arbeitnehmervertreter,
45 | "Tätigkeitsgebiet": self.akf_one.parse_taetigkeitsgebiet,
46 | "Zahlstellen": self.akf_two.parse_zahlstellen,
47 | "Grundkapital": self.akf_two.parse_grundkapital,
48 | "OrdnungsNrAktien": self.akf_two.parse_ordnungsnrdaktien,
49 | "Großaktionär": self.akf_two.parse_grossaktionaer,
50 | "Geschäftsjahr": self.akf_two.parse_geschaeftsjahr,
51 | "StimmrechtAktien": self.akf_two.parse_stimmrechtaktien,
52 | "Börsennotiz": self.akf_two.parse_boersennotiz,
53 | "Stückelung": self.akf_two.parse_stueckelung,
54 | "Aktienkurse": self.akf_jk.parse_aktienkurse,
55 | "Dividenden": self.akf_jk.parse_dividend, # is table
56 | "DividendenAufXYaktien": self.akf_jk.parse_dividend, # is table
57 | "BeratendeMitglieder": self.akf_three.parse_beratende_mitglieder,
58 | "Gesellschafter": self.akf_three.parse_gesellschafter,# not in first 500 files 1956??
59 | "Sekretäre": self.akf_three.parse_sekretaere, # not in first 500 files 1956??
60 | "Geschäftsleitung": self.akf_three.parse_geschaeftsleitung, # not in first 500 files 1956??
61 | "Generaldirektion": self.akf_three.parse_generaldirektion, # not in first 500 files 1956??
62 | "Direktionskomitee": self.akf_three.parse_direktionskomitee, # not in first 500 files 1956??
63 | "Vizegeneraldirektoren": self.akf_three.parse_vizegeneraldirektoren, # not in first 500 files 1956??
64 | "Fernschreiber": self.akf_three.parse_fernschreiber,
65 | "Filialen": self.akf_three.parse_filialen, # not a category in 1956 -> #todo maybe use later
66 | "Auslandsvertretungen": self.akf_three.parse_auslandsvertretungen, # not a category in 1956 -> #todo maybe use later
67 | "KommanditeUndBank": self.akf_three.parse_kommandite_und_bank, # not a category in 1956 -> #todo maybe use later
68 | "Niederlassungen": self.akf_three.parse_niederlassungen,
69 | "Erzeugnisse": self.akf_three.parse_erzeugnisse,
70 | "Haupterzeugnisse": self.akf_three.parse_haupterzeugnisse,
71 | "Spezialitäten": self.akf_three.parse_spezialitaeten,
72 | "Anlagen": self.akf_three.parse_anlagen,
73 | "Zweigniederlassungen": self.akf_three.parse_zweigniederlassungen,
74 | "Werke/Betriebsstätten": self.akf_three.parse_werke_betriebsstaetten,
75 | "Betriebsanlagen": self.akf_three.parse_betriebsanlagen,
76 | "Beteiligungsgesellschaften": self.akf_three.parse_beteiligungsgesellschaften, # not a category in 1956 -> #todo maybe use later
77 | "Beteiligungen": self.akf_three.parse_beteiligungen,
78 | "Tochtergesellschaften": self.akf_three.parse_tochtergesellschaften,
79 | "Wertpapier-Kenn-Nr": self.akf_three.parse_wertpapier_kenn_nr, # not a category in 1956 -> #todo maybe use later
80 | "RechteVorzugsaktien": self.akf_three.parse_rechte_und_vorzugsaktien,
81 | "Aktionäre": self.akf_three.parse_aktionaere,
82 | "Anleihen": self.akf_three.parse_anleihen,
83 | "KursVonZuteilungsrechten": self.akf_three.parse_kurse_v_zuteilungsrechten,
84 | "Emissionsbetrag": self.akf_three.parse_emissionsbetrag,
85 | "AusDenKonsolidiertenBilanzen": self.akf_jk.parse_bilanzen, # table
86 | "AusDenBilanzen": self.akf_jk.parse_bilanzen, # table
87 | "Konsolid.Gewinn-u.Verlustrechnungen": self.akf_jk.parse_gewinn_und_verlust, # table
88 | "AusGewinnVerlustrechnungen": self.akf_jk.parse_gewinn_und_verlust, # @jk last element works now
89 | "Bezugsrechte": self.akf_three.parse_bezugsrechte,
90 | "ZurGeschäftslage": self.akf_three.parse_geschaeftslage
91 | }
92 |
93 | def get_function_map(self):
94 | return self.function_map
95 |
96 |
97 |
98 |
99 | class SegmentParser(object):
100 | """
101 |     Parse the classified segments one by one; for each
102 |     segment tag the parser dispatches to the mapped parsing function.
103 | """
104 |
105 | def __init__(self, output_analyzer, dictionary_handler, ocromore_data=None):
106 |
107 | self.ef = EndobjectFactory()
108 | self.dictionary_handler = dictionary_handler
109 |
110 |         # map which maps tags to parsing functions -> change the constructor for other projects
111 | fmap = FunctionMapAKF(self.ef, output_analyzer, dictionary_handler)
112 |
113 | config_handler = ConfigurationHandler(first_init=False)
114 |
115 | self.config = config_handler.get_config()
116 | self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER, self.config.PRINT_EXCEPTION_LEVEL,
117 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
118 |
119 | self.function_map = fmap.get_function_map()
120 | self.result_root = self.config.OUTPUT_ROOT_PATH + "/results/"
121 |
122 | def clear_result(self, output_analyzer, dictionary_handler, ocromore_data=None):
123 | # create a new end object factory, new content
124 | self.ef = EndobjectFactory()
125 | # map to the new ef object which has been recreated
126 | fmap = FunctionMapAKF(self.ef, output_analyzer, dictionary_handler)
127 | self.function_map = fmap.get_function_map()
128 |
129 |
130 | def parse_segments(self, ocromore_data):
131 | self.ocromore_data = ocromore_data
132 | segmentation = ocromore_data['segmentation']
133 | segmentation_classes = segmentation.my_classes
134 |
135 |         # add all text from the original file if activated (e.g. for debugging purposes)
136 | if self.config.ADD_FULLTEXT_ENTRY:
137 | all_texts = self.get_all_text(ocromore_data)
138 | self.ef.set_current_main_list("overall_info")
139 | self.ef.add_to_my_obj("fulltexts",all_texts)
140 | # add additional info to result
141 | if self.config.ADDITIONAL_INFORMATION and self.config.ADD_ADDITIONAL_INFO:
142 | if not self.config.ADD_FULLTEXT_ENTRY:
143 | self.ef.set_current_main_list("Information")
144 | self.ef.add_to_my_obj("additionals", ocromore_data["additional_info"])
145 |         # add a duplicate of the original text from which, in the analysis below, the parsed parts get subtracted
146 | if self.config.LOG_SEGMENTED_TO_ORIG_DIFF_PER_FILE:
147 | if self.config.ADD_FULLTEXT_ENTRY:
148 | ocromore_data['analysis_to_orig'] = {}
149 | original_rest, complete_text = self.get_all_text(ocromore_data, join_separated_lines=True)
150 | ocromore_data['analysis_to_orig']['original_rest'] = original_rest
151 | ocromore_data['analysis_to_orig']['original_length_initial'] = len(complete_text)
152 | else:
153 | self.cpr.printw("activated segment to orig diff, but no saving of origin activate ADD_FULLTEXT_ENTRY "
154 | "in config for this functionality")
155 |
156 |
157 |
158 |         # init toolbox (optional image snippet support)
159 | snippet = None
160 | if self.config.USE_SNIPPET:
161 | if "./" in self.config.IMGPATH:
162 | ipath = os.path.dirname(ocromore_data["file_info"].path)+self.config.IMGPATH[1:]
163 | else:
164 | ipath = os.path.normcase(self.config.IMGPATH)
165 | results = glob.glob(ipath+ocromore_data["file_info"].name.split(".")[0].replace("_msa_best","")+"*",recursive=True)
166 | if results:
167 | snippet = Snippet()
168 | snippet.imread(results[0])
169 | else:
170 | self.config.USE_TOOLBBOX = False
171 | info_handler = {}
172 | # start parsing for each successfully segmented area
173 | for segmentation_class in segmentation_classes:
174 |
175 | # if the class segment was recognized ...
176 | if segmentation_class.is_start_segmented():
177 | # get the unique identifier for this class
178 | segment_tag = segmentation_class.get_segment_tag()
179 | segmentation_class.snippet = snippet
180 | segmentation_class.info_handler = info_handler
181 | self.trigger_mapped_function(segment_tag, segmentation_class, ocromore_data)
182 |
183 |
184 | # add and return result
185 | ocromore_data['results'] = self.ef
186 | return ocromore_data
187 |
188 | def trigger_mapped_function(self, segment_tag, segmentation_class, ocromore_data):
189 |
190 | if segment_tag not in self.function_map.keys():
191 | return
192 | #todo: fileinfo -> parsing
193 | real_start_tag, content_texts, content_lines, feature_lines = self.prepare_parsing_info(segmentation_class, ocromore_data)
194 |
195 | # switch the object to save context
196 | segment_tag = segmentation_class.segment_tag
197 | self.ef.set_current_main_list(segment_tag)
198 |
199 | # call the mapped function, which fills the end-factory
200 | self.function_map[segment_tag].__call__(real_start_tag, content_texts, content_lines, feature_lines, segmentation_class)
201 |
202 | def prepare_parsing_info(self, segmentation_class, ocromore_data):
203 | lines = ocromore_data['lines']
204 | line_features = ocromore_data['line_features']
205 | real_start_tag, content_texts, content_lines, feature_lines = \
206 | DataHelper.get_content(lines,line_features, segmentation_class)
207 |
208 | return real_start_tag, content_texts, content_lines, feature_lines
209 |
210 | def get_all_text(self, ocromore_data, join_separated_lines=False):
211 | """
212 | Gets all text lines in ocromore_data as
213 | array and as joined string
214 | :param ocromore_data: data from which the text is extracted
215 | :return: texts list, complete text
216 | """
217 | all_texts = []
218 | complete_text = ""
219 | for line in ocromore_data['lines']:
220 | text = line['text']
221 | all_texts.append(text)
222 | complete_text += text
223 |
224 | if join_separated_lines:
225 | complete_text = ""
226 | all_texts = dh.join_separated_lines(all_texts)
227 | for text in all_texts:
228 | complete_text += text
229 |
230 | return all_texts, complete_text
231 |
232 | def write_result_to_output(self, as_json, ocromore_data):
233 | if as_json is True:
234 | my_json = self.ef.export_as_json()
235 | my_json_lines = my_json.split("\n")
236 | dh.write_array_to_root("result_json/", my_json_lines, ocromore_data, self.result_root)
--------------------------------------------------------------------------------
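Adding a new category to the dispatch above takes one parsing method plus one
function_map entry. A hedged sketch (the "MeinSegment" tag and the holder class
are hypothetical, modeled on the AkfParsingFunctions* classes):

class AkfParsingFunctionsCustom(object):
    def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
        self.ef = endobject_factory
        self.output_analyzer = output_analyzer

    def parse_mein_segment(self, real_start_tag, content_texts, content_lines,
                           feature_lines, segmentation_class):
        # signature matches the __call__ in SegmentParser.trigger_mapped_function
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)
        self.ef.add_to_my_obj("raw_texts", content_texts, object_number=0,
                              only_filled=True)

# registration, e.g. inside FunctionMapAKF.__init__:
#   self.akf_custom = AkfParsingFunctionsCustom(endobject_factory, output_analyzer, dictionary_handler)
#   self.function_map["MeinSegment"] = self.akf_custom.parse_mein_segment

--------------------------------------------------------------------------------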
/lib/segment_parser_endobject_factory.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pprint
3 | from akf_corelib.conditional_print import ConditionalPrint
4 | from akf_corelib.configuration_handler import ConfigurationHandler
5 | from lib.akf_known_uncategories import KnownUncategories
6 |
7 | class EndobjectFactory(object):
8 | """
9 | Creates an object with the following structure and provides exporting methods:
10 |
11 | segment_tag_1: [ ---> this level is created by set_current_main_list
12 | {
13 | type: "Sitz" ---> add this level entries with add_to_my_object object_number=0
14 | city: "Neustadt"
15 | },
16 | {
17 | type: "Sitz" ---> add this level entries with add_to_my_object object_number=0
18 | city: "Neustadt"
19 | }
20 |
21 | ],
22 | segment_tag_2: [
23 | {
24 | ...
25 | }
26 | ...
27 | ]
28 | """
29 | def __init__(self):
30 | self.my_object = {}
31 | self.current_main_list = None
32 | self.pp = pprint.PrettyPrinter(indent=5)
33 |
34 | config_handler = ConfigurationHandler(first_init=False)
35 |
36 | self.config = config_handler.get_config()
37 | self.cpr = ConditionalPrint(self.config.PRINT_OUTPUT_ANALYSIS, self.config.PRINT_EXCEPTION_LEVEL,
38 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
39 |
40 | if self.config.REMOVE_TAGS_IN_ORIG_DIFF:
41 | self.known_uc = KnownUncategories()
42 |
43 | def set_current_main_list(self, segment_tag):
44 | if segment_tag not in self.my_object.keys():
45 | self.my_object[segment_tag] = [] # create the main list (all subsequent entries are stored here)
46 |
47 | self.current_main_list = self.my_object[segment_tag] # create a short link on the main list
48 |
49 | def add_to_my_obj(self, key, value, object_number=0, only_filled=False):
50 |
51 |         if only_filled is True and (value is None or value == "" or value == [] or value == {}):
52 | return False
53 |
54 | # fill main list if object index not in
55 | len_list = len(self.current_main_list)
56 | if len_list < object_number+1:
57 | for index in range(len_list,object_number+1):
58 | self.current_main_list.append({})
59 |
60 | self.cpr.print("Adding value to List,- ObjectNr.:", object_number,"Key:", key, "Value:", value)
61 | # add or insert to the main_list
62 | self.current_main_list[object_number][key] = value
63 | return True
64 |
65 | def print_me_and_return(self):
66 | print("my_object is:")
67 | self.pp.pprint(self.my_object)
68 | return self.my_object
69 |
70 | def print_current_main(self):
71 | print("current_main:")
72 | self.pp.pprint(self.current_main_list)
73 |
74 | def export_as_json(self):
75 | my_obj_json = json.dumps(self.my_object, indent=5, ensure_ascii=False)
76 | return my_obj_json
77 |
78 | def export_as_json_at_key(self, key, remove_first_object=False):
79 |
80 | if key not in self.my_object.keys():
81 | return None
82 |
83 | my_obj = self.my_object[key]
84 | if remove_first_object:
85 | if len(my_obj) >= 1:
86 |                 my_obj = my_obj[1:]  # remove the first object, which usually contains generic info
87 |
88 | my_obj_json = json.dumps(my_obj, indent=5, ensure_ascii=False)
89 | return my_obj_json
90 |
91 | @staticmethod
92 | def fetch_subentries_recursive_check(entry):
93 | """
94 | Fetches all subentries (values) from an entry and writes them to a list of texts
95 |         This gets called recursively within the function until all subentries
96 | are found
97 | :param entry: entry to fetch the subentries from
98 | :return: list of subentries
99 | """
100 | final_texts = []
101 |
102 | for item in entry:
103 | if isinstance(entry, list):
104 | value = item
105 | else:
106 | # item is a key
107 | value = entry[item]
108 | if isinstance(value, str):
109 | final_texts.append(value)
110 | elif isinstance(value, int):
111 | final_texts.append(str(value))
112 |             elif isinstance(value, object):  # always true: acts as a catch-all for container values
113 | obj_size = len(value)
114 | if obj_size > 0:
115 | recursive_texts = EndobjectFactory.fetch_subentries_recursive_check(value)
116 | final_texts.extend(recursive_texts)
117 |
118 | return final_texts
119 |
120 | @staticmethod
121 | def fetch_keys_recusive_check(entry, final_keys, create_multiple=True):
122 | """
123 |         Fetches all keys in an object and its sub-objects,
124 |         calls itself recursively until all keys are found and
125 |         writes them to the final_keys array, which is returned
126 | :param entry: object to fetch the sub-keys from
127 | :param final_keys: list of final keys (initial state)
128 | :param create_multiple: if the same key occurs multiple times it still gets added
129 | :return: final_keys with added keys from object
130 | """
131 |
132 | if isinstance(entry, list):
133 | for item in entry:
134 | final_keys = EndobjectFactory.fetch_keys_recusive_check(item, final_keys, create_multiple)
135 | return final_keys
136 | elif not isinstance(entry, dict):
137 | # just return if there are no keys (cause no dictionary)
138 | return final_keys
139 |
140 | for key in entry:
141 | value = entry[key]
142 | if create_multiple or key not in final_keys:
143 | if isinstance(key, int):
144 | continue
145 | final_keys.append(key)
146 | final_keys = EndobjectFactory.fetch_keys_recusive_check(value, final_keys)
147 | return final_keys
148 |
149 | def diff_seg_to_orig_at_key(self, key):
150 | """
151 | def fetch_subentries_recursive(entry):
152 | final_texts = []
153 |
154 | for item in entry:
155 | if isinstance(entry, list):
156 | value = item
157 | else:
158 | # item is a key
159 | value = entry[item]
160 | if isinstance(value, str):
161 | final_texts.append(value)
162 | elif isinstance(value, int):
163 | final_texts.append(str(value))
164 | elif isinstance(value, object):
165 | obj_size = len(value)
166 | if obj_size > 0:
167 | recursive_texts = fetch_subentries_recursive(value)
168 | final_texts.extend(recursive_texts)
169 |
170 | return final_texts
171 | """
172 | if key not in self.my_object.keys():
173 | return None
174 |
175 | my_data = self.my_object[key]
176 |
177 | # check if the orig-post property can exist warn if not
178 | if not self.config.ADD_INFO_ENTRY_TO_OUTPUT:
179 | self.cpr.printw("trying to fetch original data, but original data was not added to the results")
180 | self.cpr.printw("set ADD_INFO_ENTRY_TO_OUTPUT to True in the config")
181 | if len(my_data) <= 0:
182 | self.cpr.printw("no data to process, returning")
183 | return
184 |
185 | return # todo this seems to be wrong
186 | # copy orig string
187 | original_text = my_data[0]['origpost']
188 | rest_text = original_text
189 |
190 | # fetch parsed entries for diff
191 | all_final_entries = [] # array of final entries
192 | for index in range(1, len(my_data)):
193 | entry = my_data[index]
194 | final_entries = EndobjectFactory.fetch_subentries_recursive_check(entry)
195 | all_final_entries.extend(final_entries)
196 |
197 | # order diff data after length
198 | all_final_entries.sort(key=lambda x: len(x))
199 | all_final_entries.reverse()
200 |
201 | # subtract
202 | for text in all_final_entries:
203 | rest_text = rest_text.replace(text, "")
204 |
205 | rest_text = rest_text.strip()
206 |
207 | return rest_text, original_text
208 |
209 | def diff_parsed_to_orig_at_key(self, key):
210 | """
211 | def fetch_subentries_recursive(entry):
212 | final_texts = []
213 |
214 | for item in entry:
215 | if isinstance(entry, list):
216 | value = item
217 | else:
218 | # item is a key
219 | value = entry[item]
220 | if isinstance(value, str):
221 | final_texts.append(value)
222 | elif isinstance(value, int):
223 | final_texts.append(str(value))
224 | elif isinstance(value, object):
225 | obj_size = len(value)
226 | if obj_size > 0:
227 | recursive_texts = fetch_subentries_recursive(value)
228 | final_texts.extend(recursive_texts)
229 |
230 | return final_texts
231 |
232 | def fetch_keys_recusive(entry, final_keys, create_multiple=True):
233 | # just return if there are no keys (cause no dictionary)
234 | if not isinstance(entry, dict):
235 | return final_keys
236 |
237 | for key in entry:
238 | value = entry[key]
239 | if create_multiple or key not in final_keys:
240 | if isinstance(key, int):
241 | continue
242 | final_keys.append(key)
243 | final_keys = fetch_keys_recusive(value, final_keys)
244 | return final_keys
245 | """
246 | if key not in self.my_object.keys():
247 | return None
248 |
251 |
252 | my_data = self.my_object[key]
253 |
254 | # check if the orig-post property can exist warn if not
255 | if not self.config.ADD_INFO_ENTRY_TO_OUTPUT:
256 | self.cpr.printw("trying to fetch original data, but original data was not added to the results")
257 | self.cpr.printw("set ADD_INFO_ENTRY_TO_OUTPUT to True in the config")
258 | if len(my_data) <= 0:
259 | self.cpr.printw("no data to process, returning")
260 | return
261 | # copy orig string
262 | original_text = my_data[0]['origpost']
263 | rest_text = original_text
264 |
265 | # fetch parsed entries for diff
266 | pool_entries = [] # array of final entries
267 | for index in range(1, len(my_data)):
268 | entry = my_data[index]
269 | final_entries = EndobjectFactory.fetch_subentries_recursive_check(entry)
270 | pool_entries.extend(final_entries)
271 |
272 | if self.config.REMOVE_SPACES_IN_ORIGIN_DIFF is True:
273 | # removes all spaces from rest and comparison values because spaces are often
274 | # a problem in subtracting the rests
275 | rest_text = rest_text.replace(" ", "")
276 | for index in range(0,len(pool_entries)):
277 | pool_entries[index] = pool_entries[index].replace(" ", "")
278 |
279 | all_final_entries = []
280 |
281 | # add the entries to the complete subtraction and tag them with '1'
282 | for pentry in pool_entries:
283 | all_final_entries.append((pentry, 1))
284 |
285 | # if keys shall be subtracted as well, add them too
286 | if self.config.REMOVE_TAGS_IN_ORIG_DIFF:
287 | pool_keys = [] # gets multiple of the same key for later 1 by 1 subtraction
288 | for index in range(1, len(my_data)):
289 | pool_keys = EndobjectFactory.fetch_keys_recusive_check(my_data[index], pool_keys, create_multiple=True)
290 |
291 | # also remove spaces in keys
292 | if self.config.REMOVE_SPACES_IN_ORIGIN_DIFF is True:
293 | for index in range(0, len(pool_keys)):
294 | pool_keys[index] = pool_keys[index].replace(" ", "")
295 |
296 | final_keys = []
297 | for pkey in pool_keys:
298 | final_keys.append((pkey, 2))
299 |
300 | all_final_entries.extend(final_keys)
301 |
302 | # order diff data after length
303 | all_final_entries.sort(key=lambda x: len(x[0]))
304 | all_final_entries.reverse()
305 |
306 | # subtract
307 | for entry in all_final_entries:
308 | text = entry[0]
309 | text_or_key = entry[1]
310 | if text_or_key == 2:
311 | if text in self.known_uc.unkeys:
312 | continue
313 | text_stripped = text.strip() # remove spaces so texts better fit in
314 | rest_text = rest_text.replace(text_stripped, "", 1)
315 | rest_text = rest_text.strip()
316 |
317 | return rest_text, original_text
318 |
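# Sketch of the subtraction idea used above (values made up for illustration):
# removing the longest parsed values first prevents short values which are
# substrings of longer ones from punching holes into the original text.
#   >>> rest = "Tel. 123 Mannheim"
#   >>> for value in sorted(["123", "Mannheim"], key=len, reverse=True):
#   ...     rest = rest.replace(value, "", 1)
#   >>> rest.strip()
#   'Tel.'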
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Docxstruct
2 | ============
3 |
4 |
5 | Docxstruct parses .hocr-output of [ocromore][ocromore-link] to get a content-classified .json output
6 | for further database export. It is part of the [Aktienführer-Datenarchiv work process][akf-link],
7 | but can also be used independently.
8 |
9 | # Installation
10 |
11 | To initialize the git submodules (git version ~2.7.4):
12 |
13 | `
14 | git submodule update --init --recursive
15 | `
16 |
17 | For development, the PyCharm IDE 2017.3 Community Edition was used.
18 |
19 |
20 | If you use the PyCharm IDE to look at accumulated segmentation analysis files, adapt the IDE settings to get a proper view.
21 | This is done in the idea.properties file, which can be found e.g. via Help->Edit Custom Properties in PyCharm:
22 |
23 |
24 | `
25 | editor.soft.wrap.force.limit=10000
26 | `
27 |
28 |
29 | # Handling Code
30 | `Docxstruct` is made to be adapted for parsing other kinds of content
31 | than *Aktienführer data*. It can be used as a generic text-content recognizer and classifier
32 | and therefore provides a lot of analysis and structure for that purpose.
33 |
34 | Usually all akf-specific content is stored in files named *akf_XXX*;
35 | these are the parts where you might want to put your custom functionality.
36 |
37 | Ways to do that are described in the following documentation parts.
38 |
39 | # Input/output example
40 | The following example with Aktienführer data explains the i/o. The basic input is usually an hOCR file:
41 | ```
42 | OCR Results
43 | ...
44 | Überlandwerk
45 | Unterfranken
46 | Aktiengesellschatt
47 | ...
48 | ```
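Docxstruct turns such input into content-classified JSON. An abbreviated,
illustrative output entry (field names as produced by the *Sitz* parser) could
look like this:

```
"Sitz": [
    {
        "type": "Sitz",
        "street": "Mergenthalerallee",
        "street_number": "79-81",
        "zip": "65760",
        "city": "Eschborn"
    }
]
```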
--------------------------------------------------------------------------------
/lib/data_helper.py:
--------------------------------------------------------------------------------
30 | if len_text >= 1 and text[len_text-1] == ":":
31 | text = text[0:len_text-1]
32 |
33 | return text
34 |
35 |
36 | @staticmethod
37 | def get_rest_content_start_line(segmentation_class, start_line, trim=True):
38 | text = start_line['text']
39 | stop = segmentation_class.key_tag_cindex_stop
40 | rest_start = text[stop:]
41 | if trim:
42 | rest_start = rest_start.strip()
43 | return rest_start
44 |
45 | @staticmethod
46 | def remove_multiple_outbound_chars(text):
47 | """
48 | Strips the left and the right side of special characters in a string
49 | and returns the stripped version then:
50 | example ".;my text is;,,," returns "my text is"
51 | :param text: input text
52 | :return: filtered text
53 | """
54 | # print("input:", text)
55 |
56 | text_to_change = text
57 |
58 | # filter left side
59 | match_l = regex.search(r"^[^\w\s]*(?P<tag>.*)", text_to_change)
60 | if match_l:
61 | rest = match_l.group("tag")
62 | text_to_change = rest
63 |
64 | if text_to_change == "":
65 | return text_to_change
66 |
67 | # filter right side
68 | match_r2 = regex.search(r"(?P<right_rest>[^\w\s]*)$", text_to_change)
69 |
70 | if match_r2:
71 | rest = match_r2.group("right_rest")
72 | text_to_change = DataHelper.rreplace(text_to_change, rest)
73 |
74 | # print("output:", text_to_change)
75 | return text_to_change
76 |
77 | @staticmethod
78 | def rreplace(text, replace_text):
79 | """
80 | Removes the right-most occurrence of 'replace_text' from 'text'
81 | by reversing the strings
82 | :param text: input text
83 | :return: filtered text
84 | """
85 | reverse_text = text[::-1]
86 | reverse_replace_text = replace_text[::-1]
87 | new_reverse_text = reverse_text.replace(reverse_replace_text, "", 1) # count=1: only the right-most occurrence, not all
88 | new_text = new_reverse_text[::-1].strip()
89 |
90 | return new_text
91 |
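# Usage sketch (illustrative): only the right-most occurrence is removed.
#   >>> DataHelper.rreplace("a-b-c", "-")
#   'a-bc'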
92 |
93 | @staticmethod
94 | def get_content(segment_lines, feature_lines, segmentation_class):
95 | start_index = segmentation_class.get_start_line_index()
96 | stop_index = segmentation_class.get_stop_line_index()
97 | selected_start_line = segment_lines[start_index]
98 | feature_start_line = feature_lines[start_index]
99 | real_tag = DataHelper.get_real_tag_from_segment(segmentation_class, selected_start_line)
100 | rest_content_start_line = DataHelper.get_rest_content_start_line(segmentation_class, selected_start_line)
101 |
102 | # if there are no further lines, return the obtained content
103 | if start_index == stop_index:
104 | return real_tag, [rest_content_start_line], [selected_start_line], [feature_start_line]
105 |
106 | # otherwise fetch the rest of the content
107 | other_rest_content_texts = []
108 | other_rest_content_lines = []
109 | other_rest_feature_lines = []
110 |
111 | other_rest_content_texts.append(rest_content_start_line)
112 | other_rest_content_lines.append(selected_start_line)
113 | other_rest_feature_lines.append(feature_start_line)
114 |
115 | for current_index in range(start_index+1, stop_index+1):
116 | current_line = segment_lines[current_index]
117 | current_feature_lines = feature_lines[current_index]
118 | other_rest_content_texts.append(current_line['text'])
119 | other_rest_content_lines.append(current_line)
120 | other_rest_feature_lines.append(current_feature_lines)
121 |
122 | return real_tag, other_rest_content_texts, other_rest_content_lines, other_rest_feature_lines
123 |
124 |
125 | @staticmethod
126 | def write_array_to_root_simple(base_path, tag, text_lines, analysis_root, append_mode=False):
127 | full_dir = analysis_root + base_path + "/"
128 | full_path = full_dir + tag + ".txt"
129 |
130 | fh.create_directory_tree(full_dir)
131 | # write append or normal
132 | if append_mode is True:
133 | my_file = io.open(full_path, 'a', encoding='utf8')
134 | else:
135 | my_file = io.open(full_path, 'w', encoding='utf8')
136 |
137 | for text_line in text_lines:
138 | my_file.write(text_line+"\n")
139 |
140 | my_file.close()
141 |
142 | @staticmethod
143 | def write_array_to_root(base_path, text_lines, ocromore_data, analysis_root, accumulated=False):
144 | """
145 | Writes a line-array to the base path in root path with ocromore data file and db name
146 | :param base_path:
147 | :param text_lines:
148 | :param ocromore_data:
149 | :param analysis_root: root path in base directory
150 | :param accumulated: file is accumulated file naming different
151 | :return:
152 | """
153 |
154 | dbpath = ocromore_data['file_info'].dbpath
155 | tablename = ocromore_data['file_info'].tablename
156 |
157 | full_dir = analysis_root + base_path + dbpath+"/"
158 | if accumulated is False:
159 | full_path = full_dir + tablename + ".txt"
160 | else:
161 | full_path = full_dir +"accumulated_report"+".txt"
162 |
163 | fh.create_directory_tree(full_dir)
164 |
165 | my_file = io.open(full_path, 'w', encoding='utf8')
166 |
167 | for text_line in text_lines:
168 | my_file.write(text_line+"\n")
169 |
170 | my_file.close()
171 |
172 | @staticmethod
173 | def create_stringified_linearray(array_of_texts):
174 | final_string = ""
175 | for line_text in array_of_texts:
176 | final_string += line_text+"\n"
177 |
178 | final_string = final_string.strip()
179 | return final_string, final_string.replace("\n", " ")
180 |
181 | @staticmethod
182 | def strip_if_not_none(text, strip_pattern):
183 | if text is None:
184 | return text
185 | else:
186 | if strip_pattern != "":
187 | return text.strip(strip_pattern)
188 | else:
189 | return text.strip()
190 |
191 | @staticmethod
192 | def join_joined_lines(joined_texts, add_spaces=True):
193 | """
194 | Takes the output from 'join_separated_lines' and joins the lines to one
195 | string
196 | :param joined_texts: array of texts
197 | :param add_spaces: add a space between joined texts
198 | :return: joined string
199 | """
200 | return_text = ""
201 |
202 | for text in joined_texts:
203 | if add_spaces is True:
204 | return_text += " "+text
205 | else:
206 | return_text += text
207 |
208 | return_text = return_text.strip()
209 |
210 | return return_text
211 |
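# Usage sketch (illustrative):
#   >>> DataHelper.join_joined_lines(["Der zusammen", "gefügte Text"])
#   'Der zusammen gefügte Text'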
212 |
213 | @staticmethod
214 | def join_separated_lines(content_texts):
215 | """
216 | Joins dash separated lines in the text list (reduces the number of entries, if
217 | there are such lines)
218 | :param content_texts: text list to join
219 | :return: text array where all dash separated lines are joined
220 | """
221 |
222 | # final array with joined texts
223 | joined_texts = []
224 | # intermediate array for storing tagged lines (normal line:0 or separator_line:1)
225 | NORMAL_LINE = 0
226 | SEPARATOR_LINE = 1
227 | LAST_LINE = 2
228 |
229 | tagged_texts = []
230 |
231 | len_content_texts = len(content_texts)
232 |
235 |
236 | # iterate the given texts
237 | for text_index, text in enumerate(content_texts):
238 | if text is None:
239 | continue
242 |
243 | # if there is one, get the follow up text
244 | next_text = None
245 | if text_index < len_content_texts - 1:
246 | next_text = content_texts[text_index + 1].strip()
247 |
248 | # detect line with separator
249 | if (len(text) >= 2 and "-" in text[-1]):
250 | line_ends_with_amount = False
251 |
252 | # this is a line which ends with an amount indicator like '6 500 000. -'
253 | # and is therefore no separator
254 | if len(text) >= 3 and "-" in text[-1] and " " in text[-2] and "." in text[-3]:
255 | line_ends_with_amount = True
256 | elif len(text) >= 2 and "-" in text[-1] and "." in text[-2]:
257 | line_ends_with_amount = True
258 | elif len(text) >= 2 and "-" in text[-1] and text[-2].isdigit():
259 | line_ends_with_amount = True # not an amount, but a similar case: a timespan like '1996-1997' split over two lines
260 |
261 | if not line_ends_with_amount and next_text is not None and len(next_text) >= 1:
262 |
263 | # if the next starting letter is uppercase, don't do the joining (assuming it's a
264 | # '-'-separated name like "Jan-Phillipp")
265 | if not next_text[0].isupper():
266 | # fetch the next text in current and remove separator
267 | text = text[0:len(text) - 1]
268 | # store in tagged texts
269 | tagged_texts.append((text, SEPARATOR_LINE))
270 | continue
271 |
272 | if text_index == len_content_texts - 1: # this is the last line
273 | tagged_texts.append((text, LAST_LINE))
274 | break
275 |
276 | # append to tagged texts
277 | tagged_texts.append((text, NORMAL_LINE))
278 |
279 | # join the tagged texts
280 |
281 | for current_index, ttext_info in enumerate(tagged_texts):
282 | if ttext_info is None:
283 | continue # line was already joined
284 |
285 | current_ttext, current_id = ttext_info
286 | if current_id == NORMAL_LINE or current_id == LAST_LINE: # keep the final line as well
287 | joined_texts.append(current_ttext)
288 | elif current_id == SEPARATOR_LINE:
289 | # check all follow up lines
290 | for follow_up_index in range(current_index+1, len(tagged_texts)):
291 | follow_ttext, follow_id = tagged_texts[follow_up_index]
292 | current_ttext = current_ttext + follow_ttext
293 | tagged_texts[follow_up_index] = None
294 | if follow_id == NORMAL_LINE or follow_id == LAST_LINE:
295 | # update my new array
296 | joined_texts.append(current_ttext)
297 | break # done escape the inner loop
298 | elif follow_id == SEPARATOR_LINE:
299 | continue # continue inner loop
300 |
301 | # return the modified list
302 | return joined_texts
303 |
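# Usage sketch (illustrative): a trailing dash joins a line with its successor,
# unless the successor starts uppercase (a name like "Jan-Phillipp") or the dash
# belongs to an amount like '6 500 000. -'.
#   >>> DataHelper.join_separated_lines(["Aktiengesell-", "schaft", "Sitz Mannheim"])
#   ['Aktiengesellschaft', 'Sitz Mannheim']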
304 | @staticmethod
305 | def join_separated_lines_parenthesis(content_texts):
306 | next_lines_is_ending_parenthesis = False # indicator - the next line closes a previously opened parenthesis block
307 | next_closing_ordinal = -1 # indicator - the n-th closing parenthesis closes the previous block
308 | change = False
309 | final_entries = []
310 |
311 | len_content_texts = len(content_texts)
312 | for text_index, text in enumerate(content_texts):
313 |
314 | # if such a case was detected, add this line to the previous one instead of appending it as a new line
315 | if next_lines_is_ending_parenthesis:
316 |
317 | text_split = text.split(')')
318 | text_to_add = ""
319 | rest_text = ""
320 | # determine the closing ordinal to use; todo: this is not 100% accurate, it can sometimes overflow
321 | used_closing_ordinal = 0
322 | if next_closing_ordinal > 0:
323 | used_closing_ordinal = next_closing_ordinal
324 | for tf_index, text_fragment in enumerate(text_split):
325 | if tf_index <= used_closing_ordinal:
326 | text_to_add += " " + text_fragment+")"
327 | else:
328 | if text_fragment.strip() != "":
329 | # only add delimiters if not at end of split
330 | if tf_index == len(text_split)-1:
331 | rest_text += " " + text_fragment
332 | else:
333 | rest_text += " " + text_fragment+")"
334 |
335 | final_entries[-1] += " " + text_to_add.strip() # add until parenthesis end then go on
336 | next_lines_is_ending_parenthesis = False
337 | change = True # change debugging indicator
338 | # change current text to only rest
339 | text = rest_text.strip()
340 | #print(final_entries)
341 | if text == ")":
342 | continue
343 |
344 | # check if there is more opening parenthesis
345 | opening_parenthesis = text.count("(")
346 | closing_parenthesis = text.count(")")
347 |
348 | if opening_parenthesis <= closing_parenthesis:
349 | final_entries.append(text)
350 | continue
351 |
352 | # assign next text otherwise continue
353 | next_text = None
354 | if text_index+1 < len_content_texts:
355 | next_text = content_texts[text_index + 1]
356 | else:
357 | final_entries.append(text)
358 | continue
359 |
360 | next_opening_parenthesis = next_text.count("(")
361 | next_closing_parenthesis = next_text.count(")")
362 |
363 | if next_closing_parenthesis == 0:
364 | final_entries.append(text)
365 | continue
366 |
367 | # if code ran until here the lines are a concat case
368 | final_entries.append(text)
369 | next_lines_is_ending_parenthesis = True
370 | next_closing_ordinal = opening_parenthesis-closing_parenthesis - next_closing_parenthesis
371 |
374 |
375 | return final_entries
376 |
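# Usage sketch (illustrative): a line with an unclosed '(' swallows the part of
# the following line up to the matching ')'.
#   >>> DataHelper.join_separated_lines_parenthesis(["Dividende (siehe", "Anhang) 4 %"])
#   ['Dividende (siehe Anhang)', '4 %']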
377 | @staticmethod
378 | def filter_special_chars(text, remove_spaces=True):
379 | """
380 | Remove special characters from input text
381 | :param text: input text
382 | :param remove_spaces: if true also removes spaces
383 | :return: filtered text
384 | """
385 |
386 | if remove_spaces:
387 | text_filtered = re.sub('[^A-Za-z0-9]+', '', text)
388 | else:
389 | text_filtered = re.sub('[^A-Za-z0-9\s]+', '', text)
390 |
391 | return text_filtered
--------------------------------------------------------------------------------
/lib/segment_classifier.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from lib.akf_segment_holder import SegmentHolder
4 | from lib.data_helper import DataHelper as dh
5 | import inspect
6 |
7 | class SegmentClassifier(object):
8 | """
9 | This is the basic handler for classification,
10 | which gets accessed from root/outside classes.
11 | """
12 |
13 | def __init__(self):
14 |
15 | config_handler = ConfigurationHandler(first_init=False)
16 |
17 | self.config = config_handler.get_config()
18 | self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_CLASSIFIER, self.config.PRINT_EXCEPTION_LEVEL,
19 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
20 | self.cpr.print("init segment classifier")
21 |
22 | def classify_file_segments(self, ocromore_data):
23 | lines = ocromore_data['lines']
24 | feats = ocromore_data['line_features']
25 | file_info = ocromore_data['file_info']
26 | all_file_segments = AllSegments(len(lines), self.cpr, self.config)
27 |
28 | prev_line = None
29 | prev_text = None
30 | for current_line_index, current_line in enumerate(lines):
31 | current_features = feats[current_line_index]
32 | current_text = current_line['text']
33 | current_index = current_line['line_index']
34 | # create a combined line object with the line separation removed
35 | combined_line = None
36 | if prev_line is not None:
37 | combined_lines = dh.join_separated_lines([prev_text, current_text])
38 | combined_line = dh.join_joined_lines(combined_lines)
39 | else:
40 | combined_line = current_text
41 | # pass parameters to matching functions
42 | all_file_segments.match_my_segments(current_line, current_text, current_index, current_features,
43 | prev_line, combined_line)
44 | prev_line = current_line
45 | prev_text = current_text
46 |
47 |
50 | if self.config.MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION:
51 | self.adapt_non_explicit_indices(all_file_segments)
52 | else:
53 | all_file_segments.correct_overlaps_index_field(only_start_tags=True)
54 |
55 | self.adapt_stop_index_in_last_segment(all_file_segments)
56 |
57 |
58 | # does the last steps in segment matching
59 | all_file_segments.finish_segment_matching(lines, feats, file_info)
60 |
61 | # do again after final step
62 | if self.config.MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION:
63 | self.adapt_non_explicit_indices(all_file_segments)
64 | else:
65 | all_file_segments.correct_overlaps_index_field(only_start_tags=True)
66 |
67 | self.adapt_stop_index_in_last_segment(all_file_segments)
68 |
72 | ocromore_data['segmentation'] = all_file_segments
73 |
74 | return ocromore_data
75 |
76 |
77 | def adapt_stop_index_in_last_segment(self, all_file_segments):
78 | """
79 | Sets the stop_index for the last recognized segment, which
80 | is a special case and is usually not filled beforehand, because
81 | there is no next start index
82 | :param all_file_segments: holder object for segment classes and other info
83 | :return: None
84 | """
85 |
86 | # search for last segment
87 | saved_start_index = -1
88 | saved_last_segment = None
89 | for segment in all_file_segments.my_classes:
90 | # only count segmented segments
91 | if segment.start_was_segmented is False:
92 | continue
93 |
94 | if segment.start_line_index >= saved_start_index:
95 | saved_start_index = segment.start_line_index
96 | saved_last_segment = segment
97 |
98 | if saved_last_segment is None:
99 | return
100 |
101 | # adapt the last stop index of last segment
102 | saved_last_segment.stop_line_index = all_file_segments.number_of_lines-1
103 | saved_last_segment.stop_was_segmented = True # todo think about if this is necessary?
104 |
105 |
109 | def adapt_non_explicit_indices(self, all_file_segments):
110 |
111 | # update start and explicit stop tags first
112 | all_file_segments.correct_overlaps_index_field(only_start_tags=True)
113 |
114 | # fill undefined stop regions until next start region
115 | all_file_segments.fill_start_index_until_next_stop()
116 |
117 |
118 | class AllSegments(object):
119 | """
120 | Accessor class for the segmentation of a file
121 | """
122 |
123 | def __init__(self, number_of_lines, cpr, config):
124 | # init all internal-classification classes
125 | self.index_field = []
126 | self.my_classes = []
127 | self.my_only_indices = []
128 | self.instantiate_classification_classes()
129 | self.number_of_lines = number_of_lines
130 | self.initialize_index_field(number_of_lines)
131 | self.cpr = cpr
132 | self.config = config
133 | self.get_only_classes()
134 |
135 | def get_only_classes(self):
136 | """
137 | Get all classes which are tagged by the only flag
138 | :return:
139 | """
140 | for segment_index, segment_class in enumerate(self.my_classes):
141 | if segment_class.only is True:
142 | self.my_only_indices.append(segment_index)
143 |
144 | if len(self.my_only_indices) >= 1:
145 | self.cpr.print("using only indices, since there is at least one class set to only")
146 |
147 | def initialize_index_field(self, number_of_lines):
148 | self.index_field = []
149 |
150 | for ctr in range(0, number_of_lines):
151 | self.index_field.append(False)
152 |
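# The index field is a per-line tag map: index_field[i] stays False while line i
# is unassigned and is overwritten with a segment tag during matching, e.g.
# (illustrative): [False, 'Sitz', 'Sitz', False, 'Vorstand', ...]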
153 | def correct_overlaps_index_field(self, only_start_tags=False):
154 | """
155 | Debugging function to correct areas whose stop tag overlaps the next start tag
156 | Attention: This reinitializes (overwrites) the existing index field
157 | :return:
158 | """
159 |
160 | # reinitialize index field
161 | self.initialize_index_field(self.number_of_lines)
162 |
163 | # iterate classes - this does not use the 'only' classes since it's meant for bigger sets of classes
164 | for segment_class_index, segment_class in enumerate(self.my_classes):
165 | if not segment_class.enabled:
166 | continue
167 | # todo check here ok ?
168 | self.update_index_field(segment_class, only_start_tags=True)
169 |
170 | if only_start_tags is True:
171 | return self
172 |
173 | # iterate again and update the stop tags in manner that they are only updated until the next start tag
174 | for segment_class_index, segment_class in enumerate(self.my_classes):
175 | if not segment_class.enabled:
176 | continue
177 | if not segment_class.is_start_segmented():
178 | continue
179 |
180 | self.update_stop_tags(segment_class)
181 |
182 |
183 | return self
184 |
185 | def fill_start_index_until_next_stop(self):
186 | """
187 | Fills all segments start to next segments stop, if they don't have explicitly defined stop tags
188 | Adapts index field and the segment stop properties
189 | :return:
190 | """
191 | for segment_class_index, segment_class in enumerate(self.my_classes):
192 | if not segment_class.enabled:
193 | continue
194 | if segment_class.is_start_segmented() is False:
195 | # the segment wasn't found at all so no filling needed
196 | continue
197 | if segment_class.is_stop_segmented() is True:
198 | # class already has stop and therefore doesn't need to be filled
199 | continue
200 |
201 | # search until next found tag
202 | for index in range(segment_class.start_line_index+1, len(self.index_field)):
203 | current_field_item = self.index_field[index]
204 | if current_field_item is not False:
205 | # next item begins, done with filling
206 | segment_class.set_stop_segmented(index-1) # toggles stop_segmented, sets index
207 | break
208 | else:
209 | # field item is False, fill with the current segment tag
210 | self.index_field[index] = segment_class.segment_tag
211 |
212 |
213 | def update_index_field(self, segmentation_class, only_start_tags=False):
214 | segment_tag = segmentation_class.segment_tag
215 | start_line_index = segmentation_class.start_line_index
216 | stop_line_index = segmentation_class.stop_line_index
217 |
218 | # if no start condition set - no update
219 | if start_line_index == -1:
220 | return
221 |
222 | # if start condition but no endcondition just update 1st line
223 | if stop_line_index == -1:
224 | stop_line_index = start_line_index + 1
225 |
226 | # fix some index glitches
227 | if start_line_index > stop_line_index:
228 | stop_line_index = start_line_index
229 |
230 | if start_line_index == stop_line_index:
231 | stop_line_index = start_line_index + 1
232 |
233 | # special option for debugging purposes
234 | if only_start_tags is True:
235 | stop_line_index = start_line_index
236 |
237 | for index in range(start_line_index, stop_line_index+1):
238 | self.index_field[index] = segment_tag
239 |
240 | def update_stop_tags(self, segmentation_class):
241 | segment_tag = segmentation_class.segment_tag
242 | start_line_index = segmentation_class.start_line_index
243 | stop_line_index = segmentation_class.stop_line_index
244 | index_field_len = len(self.index_field)
247 |
248 | for index in range(start_line_index+1, index_field_len):
249 |
250 | # update until the next defined field appears
251 | if self.index_field[index] is not False:
252 | break
253 |
254 | self.index_field[index] = segment_tag
255 |
256 | def instantiate_classification_classes(self):
257 | dict_test = SegmentHolder.__dict__.items()
258 |
259 | for key, value in dict_test:
260 | if inspect.isclass(value):
261 | my_instance = value()
262 | self.my_classes.append(my_instance)
263 |
264 | def finish_segment_matching(self, lines, feats, file_info):
265 | """
266 | Final step in segmentation; covers special segmentation cases which can only be done
267 | after everything else is segmented.
268 | :param lines:
269 | :param feats:
270 | :param file_info:
271 | :return:
272 | """
273 |
274 | # special case: in end match firmenname
275 | for segment_class_index, segment_class in enumerate(self.my_classes):
276 | if not isinstance(segment_class, SegmentHolder.SegmentFirmenname):
277 | continue # skip everything except firmenname here; it gets matched at the very end
278 |
279 | start_updated = segment_class.match_start_condition(lines, lines, self.index_field, feats, len(lines), file_info,None)
280 |
281 | start_updated = False # self.number_of_lines, prev_line, combined_line)
282 | if start_updated:
283 | # there was a change -> update the indices fields
284 | self.update_index_field(segment_class)
285 |
286 | break # this only occurs once
287 |
288 |
289 | # overall function for iterating over all matches
290 | def match_my_segments(self, line, line_text, line_index, features, prev_line, combined_line):
291 |
292 | # 'only'-tagged class usage
293 | using_only_classes = False
294 | if len(self.my_only_indices) >= 1:
295 | using_only_classes = True
296 |
297 | # iterate classes
298 | for segment_class_index, segment_class in enumerate(self.my_classes):
299 | if not segment_class.enabled:
300 | continue
301 |
302 | if using_only_classes:
303 | # if at least one class was tagged only, skip all other classes who are only tagged
304 | if segment_class_index not in self.my_only_indices:
305 | continue
306 |
307 |
308 | if isinstance(segment_class, SegmentHolder.SegmentFirmenname):
309 | continue # skip firmenname for now, it will be matched at the very end
310 |
311 |
312 | start_updated = False
313 | stop_updated = False
314 |
315 |
316 | if self.config.REMATCH_START_CONDITION_UNTIL_ZERO_ERROR is True:
317 | # do segmenting until error rate of zero is reached
318 | start_error_number_before_match = segment_class.get_start_error_number()
319 | if not segment_class.is_start_segmented() or segment_class.get_start_error_number() >= 1:
320 | start_updated = segment_class.match_start_condition(line, line_text, line_index, features,
321 | self.number_of_lines, prev_line, combined_line)
322 | start_error_number_after_match = segment_class.get_start_error_number()
323 | if start_error_number_before_match <= start_error_number_after_match:
324 | # only update if the recognized number is lower
325 | start_updated = False
326 |
327 | stop_error_number_before_match = segment_class.get_stop_error_number()
328 | if not segment_class.is_stop_segmented() or segment_class.get_stop_error_number() >= 1:
329 | stop_updated = segment_class.match_stop_condition(line, line_text, line_index, features,
330 | self.number_of_lines, prev_line, combined_line)
331 | stop_error_number_after_match = segment_class.get_stop_error_number()
332 | if stop_error_number_before_match <= stop_error_number_after_match:
333 | # only update if the recognized number is lower
334 | stop_updated = False
335 |
336 | else:
337 | # just hit the first match and stop matching then -> standard mode
338 | if not segment_class.is_start_segmented():
339 | start_updated = segment_class.match_start_condition(line, line_text, line_index, features,
340 | self.number_of_lines, prev_line, combined_line)
341 | if not segment_class.is_stop_segmented():
342 | stop_updated = segment_class.match_stop_condition(line, line_text, line_index, features,
343 | self.number_of_lines, prev_line, combined_line)
344 |
345 | if start_updated or stop_updated:
346 |
347 | if stop_updated:
348 | start_line_index = segment_class.start_line_index
349 | stop_line_index = segment_class.stop_line_index
350 | for segment in self.my_classes:
351 | if type(segment) == type(segment_class):
352 | continue
353 | current_start_line_index = segment.start_line_index
354 | current_stop_line_index = segment.stop_line_index
355 |
356 | if current_start_line_index != -1 and (current_start_line_index >= start_line_index and current_start_line_index <=stop_line_index):
357 | segment.set_start_segmented(-1)
358 | segment.start_was_segmented = False
359 | if current_stop_line_index != -1 and (current_stop_line_index >= start_line_index and current_stop_line_index <=stop_line_index):
360 | segment.set_stop_segmented(-1)
361 | segment.stop_was_segmented = False
362 |
363 |
364 | # there was a change -> update the indices fields
365 | self.update_index_field(segment_class)
366 |
--------------------------------------------------------------------------------
/lib/akf_parsing_functions_one.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from .data_helper import DataHelper as dh
4 | from .akf_parsing_functions_common import AKFCommonParsingFunctions as cf
5 |
6 | import regex
7 |
8 |
9 | class AkfParsingFunctionsOne(object):
10 |
11 | def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
12 | config_handler = ConfigurationHandler(first_init=False)
13 |
14 | self.config = config_handler.get_config()
15 | self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_ONE, self.config.PRINT_EXCEPTION_LEVEL,
16 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
17 |
18 | self.cpr.print("init akf parsing functions one")
19 |
20 | self.ef = endobject_factory
21 | self.output_analyzer = output_analyzer
22 | self.dictionary_handler = dictionary_handler
23 |
24 |
25 | def parse_firmenname(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
26 | # get basic data
27 | element_counter = 0
28 |
29 | origpost, origpost_red, element_counter, content_texts = \
30 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
31 |
32 | # get relevant info
33 | accumulated_text = ""
34 | for text in content_texts:
35 | accumulated_text += " " + text
36 |
37 | only_add_if_value = False
38 | accumulated_text = accumulated_text.strip()
39 | self.ef.add_to_my_obj("Firmenname", accumulated_text, object_number=element_counter, only_filled=only_add_if_value)
40 |
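# Sketch (illustrative input): content_texts like ["Überlandwerk", "Unterfranken AG"]
# are accumulated into a single entry {"Firmenname": "Überlandwerk Unterfranken AG"}.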
41 |
42 | def parse_sitz(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
43 | """
44 | "Sitz": [
45 | {
46 | "origpost": "Mergenthalerallee 79-81, 65760 Eschborn Telefon:(069) 7 50 06-0 Telefax:(069) 7 50 06-111 e-mail:info@3u.net Internetseite:http://www.3u.net ",
47 | "type": "Sitz",
48 | "street": "Mergenthalerallee",
49 | "street_number": "79-81",
50 | "zip": "65760",
51 | "city": "Eschborn",
52 | "phone": "(069) 7 50 06-0",
53 | "fax": "(069) 7 50 06-111",
54 | "email": [
55 | "info@3u.net"
56 | ],
57 | "www": [
58 | "http://www.3u.net"
59 | ]
60 | }
61 | ],
62 | """
63 | # get basic data
64 | element_counter = 0
65 |
66 | origpost, origpost_red, element_counter, content_texts = \
67 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
68 |
69 | # get relevant info
70 | num_id, city, street, street_number, additional_info = cf.parse_id_location(origpost_red)
71 |
72 | # add stuff to ef
73 | only_add_if_value = True
74 | self.ef.add_to_my_obj("numID", num_id, object_number=element_counter, only_filled= only_add_if_value)
75 | self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled= only_add_if_value)
76 | self.ef.add_to_my_obj("street", street, object_number=element_counter, only_filled= only_add_if_value)
77 | self.ef.add_to_my_obj("street_number", street_number, object_number=element_counter, only_filled= only_add_if_value)
78 | self.ef.add_to_my_obj("additional_info", additional_info, object_number=element_counter, only_filled= only_add_if_value)
79 |
80 | return True
81 |
82 | def parse_verwaltung(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
83 | # kmy_obj_2 = self.ef.print_me_and_return()
84 | # get basic data
85 | element_counter = 0
86 | origpost, origpost_red, element_counter, content_texts = \
87 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
88 |
89 | # logme
90 | # self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
91 |
92 | if "srat" in real_start_tag:
93 | # Verwaltungsrat ..
94 | persons_final = cf.parse_persons(origpost_red, self.dictionary_handler,
95 | self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
96 | only_add_if_filed = True
97 | for entry in persons_final:
98 | name, first_name, last_name, city, title, funct, rest_info = entry
99 | self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled=only_add_if_filed)
100 | self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter,
101 | only_filled=only_add_if_filed)
102 | self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter,
103 | only_filled=only_add_if_filed)
104 |
105 | self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_filed)
106 | self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_filed)
107 | self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled=only_add_if_filed)
108 | self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
109 |
110 | element_counter += 1
111 | return True
112 | elif "Verw." in real_start_tag:
113 | # Verw.
114 | num_id, city, street, street_number, additional_info = cf.parse_id_location(origpost_red)
115 |
116 | # add stuff to ef
117 | only_add_if_value = True
118 | self.ef.add_to_my_obj("numID", num_id, object_number=element_counter, only_filled=only_add_if_value)
119 | self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_value)
120 | self.ef.add_to_my_obj("street", street, object_number=element_counter, only_filled=only_add_if_value)
121 | self.ef.add_to_my_obj("street_number", street_number, object_number=element_counter,
122 | only_filled=only_add_if_value)
123 | self.ef.add_to_my_obj("additional_info", additional_info, object_number=element_counter,
124 | only_filled=only_add_if_value)
125 |
126 | return True
127 | else:
128 | # Verwaltung
129 | final_items = cf.parse_general_and_keys(content_texts,
130 | join_separated_lines=False,
131 | current_key_initial_value="General_Info")
132 | for key in final_items.keys():
133 | value = final_items[key]
134 | if value is None or value == "":
135 | continue
136 | self.ef.add_to_my_obj(key, value, object_number=element_counter, only_filled=True)
137 | element_counter += 1
138 | return True
139 |
140 | def parse_telefon_fernruf(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
141 |
142 | # get basic data
143 | origpost, origpost_red, element_counter, content_texts = cf.add_check_element(self, content_texts,
144 | real_start_tag, segmentation_class, 0)
145 | # do special match: Verwaltung und Betriebshof
146 | split_post = []
147 |
148 | match_special = regex.match(r"(?P<Verw>Verwaltung.*)"
149 | r"(?P<Betr>Betriebshof.*)"
150 | , origpost_red)
151 | if match_special:
152 | betriebshof = match_special.group("Betr")
153 | verwaltung = match_special.group("Verw")
154 | origpost_red = origpost_red.replace(betriebshof, "")
155 | origpost_red = origpost_red.replace(verwaltung, "")
156 | split_post.append(betriebshof)
157 | split_post.append(verwaltung)
158 | # do special match: Ortsgespräche and Ferngespräche
159 |
160 | match_special2 = regex.match(r"(?P<og>Ortsgespräche.*)"
161 | r"(?P<fg>Ferngespräche.*)"
162 | , origpost_red)
163 | if match_special2:
164 | ortsgespr = match_special2.group("og")
165 | ferngespr = match_special2.group("fg")
166 | origpost_red = origpost_red.replace(ortsgespr, "")
167 | origpost_red = origpost_red.replace(ferngespr, "")
168 | split_post.append(ortsgespr)
169 | split_post.append(ferngespr)
170 |
173 | # do special match: Ortsverkehr and Fernverkehr
174 |
175 | match_special3 = regex.match(r"(?P<ov>Ortsverkehr.*)"
176 | r"(?P<fv>Fernverkehr.*)"
177 | , origpost_red)
178 | if match_special3:
179 | ortsverkehr = match_special3.group("ov")
180 | fernverkehr = match_special3.group("fv")
181 | origpost_red = origpost_red.replace(ortsverkehr, "")
182 | origpost_red = origpost_red.replace(fernverkehr, "")
183 | split_post.append(ortsverkehr)
184 | split_post.append(fernverkehr)
185 |
186 | # do special match: check if only numbers
187 | origpost_red_new = origpost_red
189 | test_split = regex.split("\su\.|\sund\s|,|;", origpost_red)
190 | for number in test_split:
191 | # additional parenthesis block
192 | match_parenthesis = regex.search("\(.*\)", number)
193 | parenthesis = None
194 | if match_parenthesis:
195 | parenthesis = match_parenthesis.group()
196 | number = number.replace(parenthesis,"") # remove number
197 | self.ef.add_to_my_obj("vorwahl", parenthesis, object_number=element_counter, only_filled=True)
198 |
199 |
200 | match_word_num = regex.search("(?P<word>[^\d]*)(?P<num>[\d\s\-/]*)", number)
201 | if match_word_num is None:
202 | continue
203 |
204 | word = match_word_num.group("word")
205 | num = match_word_num.group("num")
206 | if "Sa." in word and "Nr" in word:
207 | continue
208 | number_stripped = num.strip(" ./").replace("/", "").replace("-", "").replace(" ", "")
209 | if number_stripped.isdigit():
210 | origpost_red_new = origpost_red_new.replace(number, "") # remove number
211 | origpost_red_new = origpost_red_new.replace(word, "") # remove word found
212 |
213 | change1 = self.ef.add_to_my_obj("number_Sa.-Nr.", num.strip(), object_number=element_counter, only_filled=True)
214 | change2 = self.ef.add_to_my_obj("location", word.strip(), object_number=element_counter, only_filled=True)
215 | if change1 or change2:
216 | element_counter += 1
217 |
220 |
221 | origpost_red = origpost_red_new
222 | # substitute in a separator char to integrate delimiters in next step
223 | origpost_red = regex.sub(r"(\d\.)", r"\1~~~~", origpost_red)
224 |
225 | # do further matches (sc-separated)
226 | split_post.extend(regex.split(';|~~~~|\su\.', origpost_red))
227 |
228 | for index, entry in enumerate(split_post):
229 | if entry is None:
230 | continue
231 | entry_stripped = entry.strip()
232 | if entry_stripped == "":
233 | continue
234 |
235 | # additional parenthesis block
236 | match_parenthesis = regex.search("\(.*\)", entry_stripped)
237 | parenthesis = None
238 | if match_parenthesis:
239 | parenthesis = match_parenthesis.group()
240 | entry_stripped = entry_stripped.replace(parenthesis, "") # remove entry
241 | self.ef.add_to_my_obj("vorwahl", parenthesis, object_number=element_counter, only_filled=True)
242 |
243 |
244 |
245 | match_word = regex.match(r"(?P<Tag>\D*)"
246 | r"(?P<Numbers>[\d\s\W]*)"
247 | , entry_stripped)
248 | if match_word is not None:
249 | # fetch match results
250 | tag_match = match_word.group("Tag")
251 | numbers_match = match_word.group("Numbers")
252 | rest_from_entry_str = entry_stripped.replace(tag_match, "", 1)
253 | rest_from_entry_str = rest_from_entry_str.replace(numbers_match, "", 1)
254 |
255 | tag = dh.strip_if_not_none(tag_match, "")
256 | match_tag = regex.match(r"(?P<rest_bef>.*)(?P<sanr>Sa\.?\-Nr\.?)(?P<rest_end>.*)", tag)
257 | location = ""
258 | if match_tag is not None:
259 | rest_tag = match_tag.group('rest_bef')
260 | rest_tag_2 = match_tag.group('rest_end')
261 | # sanr = match_tag.group('sanr') # this is the filtered group
262 | location = dh.strip_if_not_none(rest_tag + " " + rest_tag_2, ":., ")
263 | else:
264 | # if there are no real descriptors in tag then tag is usually location (like Düsseldorf 1 36 62.)
265 | location = tag
266 |
267 | if "und" in location:
268 | location = regex.sub("[^\w]und[^\w]", "", location)
269 |
270 | number = dh.strip_if_not_none(numbers_match, "., ")
271 | self.ef.add_to_my_obj("number_Sa.-Nr.", number.strip(), object_number=element_counter, only_filled=True)
272 | self.ef.add_to_my_obj("location", location.strip(), object_number=element_counter, only_filled=True)
273 | additional_info_entry_level = dh.strip_if_not_none(rest_from_entry_str, ",. ")
274 | self.ef.add_to_my_obj("additional_info", additional_info_entry_level.strip(),
275 | object_number=element_counter, only_filled=True)
276 | element_counter += 1
277 |
278 | origpost_red = origpost_red.replace(number, "", 1)
279 | origpost_red = origpost_red.replace(location, "", 1)
280 |
281 | origpost_red = origpost_red.replace("Sa.-Nr", "").replace("~~~~", "")
282 | origpost_red_end = dh.remove_multiple_outbound_chars(origpost_red)
283 |
284 | if len(origpost_red_end) > 3:
285 | self.ef.add_to_my_obj("additional_info_unparsed", origpost_red_end.strip(), object_number=element_counter)
286 |
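# Note on the '~~~~' marker used above (example made up for illustration): the
# regex.sub inserts it after every 'digit.' so that entries separated only by a
# terminating number can still be split, e.g.
#   'Mannheim 2 33 41. Heidelberg 4 55 66.'
#   -> 'Mannheim 2 33 41.~~~~ Heidelberg 4 55 66.~~~~'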
287 | def parse_vorstand(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
288 |
289 | # get basic data
290 | element_counter = 0
291 | origpost, origpost_red, element_counter, content_texts = \
292 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
293 |
294 | persons_final = cf.parse_persons(origpost_red, self.dictionary_handler,
295 | self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
296 |
297 | only_add_if_filed = True
298 | for entry in persons_final:
299 | name, first_name, last_name, city, title, funct, rest_info = entry
300 | self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled=only_add_if_filed)
301 | self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter,
302 | only_filled=only_add_if_filed)
303 | self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter, only_filled=only_add_if_filed)
304 | self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_filed)
305 | self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_filed)
306 | self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled=only_add_if_filed)
307 | self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
308 | element_counter += 1
309 | """
310 | # do matches (;-separated)
311 | split_post = origpost_red.split(';')
312 |
313 | for index, entry in enumerate(split_post):
314 | entry_stripped = entry.strip()
315 |
316 | if index == len(split_post)-1:
317 | matchend = regex.match("^[Aa]lle", entry_stripped)
318 | if matchend:
319 | self.ef.add_to_my_obj("additional_info", entry_stripped, object_number=element_counter)
320 | element_counter += 1
321 | continue
322 |
323 | match = regex.match(r"(?P<Name>.*)[,]" # find location string
324 | r"(?P<Rest>.*+)", # just get the rest, which is usually street name and number, but has other possibilities
325 | entry_stripped)
326 | if match is None:
327 | name = dh.strip_if_not_none(entry_stripped, ", ")
328 | self.ef.add_to_my_obj("name", name, object_number=element_counter)
329 | element_counter += 1
330 | continue
331 |
332 | name = dh.strip_if_not_none(match.group("Name"), ", ")
333 | rest = dh.strip_if_not_none(match.group("Rest"), ",. ")
334 | name_split = name.split(',')
335 | if len(name_split) > 1:
336 | position = rest
337 | name = name_split[0]
338 | city = name_split[1]
339 | else:
340 | city = rest
341 | position = ""
342 |
343 | self.ef.add_to_my_obj("name", name, object_number=element_counter)
344 | self.ef.add_to_my_obj("city", city, object_number=element_counter)
345 | self.ef.add_to_my_obj("position", position, object_number=element_counter)
346 | element_counter += 1
347 | """
348 |
349 | return True
350 |
351 | def parse_aufsichtsrat(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
352 |
353 | # get basic data
354 | element_counter = 0
355 | origpost, origpost_red, element_counter, content_texts = \
356 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
357 |
358 | #Try to fix +) problems
359 | origpost_red = origpost_red.replace("; +)","+);").replace(";+)","+);").replace("')","").replace("*)","")
360 |
361 | persons_final = cf.parse_persons(origpost_red, self.dictionary_handler, self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
362 |
363 | only_add_if_filed = True
364 | for entry in persons_final:
365 | name, first_name, last_name, city, title, funct, rest_info = entry
366 | self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled= only_add_if_filed)
367 | self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter, only_filled= only_add_if_filed)
368 | self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter, only_filled= only_add_if_filed)
369 | self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled= only_add_if_filed)
370 | self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled= only_add_if_filed)
371 | self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled= only_add_if_filed)
372 | self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
373 | element_counter += 1
374 |
375 |
376 | return True
377 |
378 | def parse_arbeitnehmervertreter(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
379 | # get basic data
380 | element_counter = 0
381 | origpost, origpost_red, element_counter, content_texts = \
382 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
383 |
384 | persons_final = cf.parse_persons(origpost_red, self.dictionary_handler, self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
385 | only_add_if_filed = True
386 | for entry in persons_final:
387 | name, first_name, last_name, city, title, funct, rest_info = entry
388 | self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled= only_add_if_filed)
389 | self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter, only_filled= only_add_if_filed)
390 | self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter, only_filled= only_add_if_filed)
391 | self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled= only_add_if_filed)
392 | self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled= only_add_if_filed)
393 | self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled= only_add_if_filed)
394 | self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
395 |
396 | element_counter += 1
397 |
398 | return True
399 |
400 | # Gruendung
401 | def parse_gruendung(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
402 | # get basic data
403 | element_counter = 0
404 | origpost, origpost_red, element_counter, content_texts = \
405 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
406 | match_year = regex.search("^\d+", origpost_red.strip()) # require at least one digit, otherwise the empty match always succeeds
407 | if match_year:
408 | result = match_year.group()
409 | origpost_red_new = origpost_red.replace(result, "", 1)
410 | year = dh.strip_if_not_none(result, ".,() ")
411 | rest_info = dh.strip_if_not_none(origpost_red_new, ".,() ")
412 | self.ef.add_to_my_obj("rest_info", rest_info, object_number=element_counter, only_filled=True)
413 | self.ef.add_to_my_obj("year", year, object_number=element_counter, only_filled=True)
414 | else:
415 | rest_info = dh.strip_if_not_none(origpost_red, ".,() ")
416 | self.ef.add_to_my_obj("rest_info", rest_info, object_number=element_counter, only_filled=True)
417 |
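# Sketch (illustrative): for origpost_red "1921 (als AG)." the year match above
# yields year "1921" and rest_info "als AG".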
418 | # Tätigkeitsgebiet
419 | def parse_taetigkeitsgebiet(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
420 | # get basic data
421 | element_counter = 0
422 | origpost, origpost_red, element_counter, content_texts = \
423 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
424 |
425 | final_items = cf.parse_general_and_keys(content_texts,
426 | join_separated_lines=False,
427 | current_key_initial_value="General_Info")
428 |
429 | for key in final_items.keys():
430 | value = final_items[key]
431 | if value is None or len(value) == 0:
432 | continue
433 | self.ef.add_to_my_obj(key, value, object_number=element_counter, only_filled=True)
434 | element_counter += 1
--------------------------------------------------------------------------------