├── configuration
│   ├── __init__.py
│   ├── config_parse_hocr_jk.conf
│   └── config_parse_hocr_js.conf
├── logs
│   └── var_occurences.json
├── docs
│   └── img
│       └── docxstruct_logo.png
├── .idea
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   ├── akf-hocrparser.iml
│   ├── workspace.xml
│   ├── codeStyles
│   │   └── Project.xml
│   └── dbnavigator.xml
├── experiments
│   ├── experiments_loop.py
│   ├── experiments_strip.py
│   ├── experiments_number_sizes.py
│   └── experiment_fuzzy_regex.py
├── .gitmodules
├── tests
│   ├── regex_fuzzy_search.py
│   └── strip_if_not_none.py
├── .gitignore
├── lib
│   ├── akf_known_uncategories.py
│   ├── akf_parsing_functions_tables_one.py
│   ├── dictionary_handler.py
│   ├── segment.py
│   ├── additional_info_handler.py
│   ├── snippet_ocr.py
│   ├── akf_parsing_functions_jk.py
│   ├── feature_extractor.py
│   ├── segment_parser.py
│   ├── segment_parser_endobject_factory.py
│   ├── data_helper.py
│   ├── segment_classifier.py
│   └── akf_parsing_functions_one.py
├── additionals
│   └── dictionaries
│       ├── dictionary_income.json
│       └── dictionary_balance.json
├── main_start.py
├── parser.py
├── LICENSE
└── README.md

/configuration/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/logs/var_occurences.json:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/docs/img/docxstruct_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UB-Mannheim/docxstruct/master/docs/img/docxstruct_logo.png
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
--------------------------------------------------------------------------------
/experiments/experiments_loop.py:
--------------------------------------------------------------------------------
1 | all_texts = ["asd", "fgh"]
2 | 
3 | for text in all_texts:
4 |     print("text:", text)
5 |     all_texts.append(text)  # appending while iterating: the list keeps growing, so this loop never terminates
6 | 
7 | print("done", all_texts)  # never reached, the loop above does not terminate
--------------------------------------------------------------------------------
/experiments/experiments_strip.py:
--------------------------------------------------------------------------------
1 | test_text = "This is a test.,.,, "
2 | print("test_text", test_text)
3 | stripped_text = test_text.strip("., ")
4 | print("stripped_text", stripped_text)
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "akf_corelib"]
2 | 	path = akf_corelib
3 | 	url = https://github.com/UB-Mannheim/akf-corelib.git
4 | [submodule "hocr_parser"]
5 | 	path = hocr_parser
6 | 	url = https://github.com/UB-Mannheim/hocr_parser.git
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
--------------------------------------------------------------------------------
/tests/regex_fuzzy_search.py:
--------------------------------------------------------------------------------
1 | # TODO: add tests and checks for the regex fuzzy-search implementation (including err_number correctness)
2 | 
3 | from akf_corelib.regex_util import RegexUtil as regu
4 | 
5 | 
6 | text = "my test text"
7 | 
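# A first sanity check that could grow into the missing tests; this is only a
# sketch, and it assumes regu.fuzzy_search(pattern, text, err_number) returns
# a (match, error_count) pair, exactly as it is unpacked below:
#
#   exact, errs_exact = regu.fuzzy_search(r"^my test", text, err_number=0)
#   assert exact is not None and errs_exact == 0
#
#   fuzzy, errs_fuzzy = regu.fuzzy_search(r"^my tast", text, err_number=1)
#   assert fuzzy is not None and errs_fuzzy == 1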
8 | match, errs = regu.fuzzy_search(r"", text, err_number=0)  # note: an empty pattern matches trivially; real assertions still to be added (see TODO above)
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
--------------------------------------------------------------------------------
/.idea/akf-hocrparser.iml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
9 | 
10 | 
--------------------------------------------------------------------------------
/experiments/experiments_number_sizes.py:
--------------------------------------------------------------------------------
1 | import sys
2 | 
3 | 
4 | def show_sizeof(x, level=0):
5 | 
6 |     print("\t" * level, x.__class__, sys.getsizeof(x), x)
7 | 
8 |     if hasattr(x, '__iter__'):
9 |         if hasattr(x, 'items'):
10 |             for xx in x.items():
11 |                 show_sizeof(xx, level + 1)
12 |         else:
13 |             for xx in x:
14 |                 show_sizeof(xx, level + 1)
15 | 
16 | 
17 | show_sizeof(None)
18 | show_sizeof(3)
19 | show_sizeof(2**63)
20 | show_sizeof(102947298469128649161972364837164)
21 | show_sizeof(918659326943756134897561304875610348756384756193485761304875613948576297485698417)
22 | 
23 | print("One variable test")
24 | gets_bigger = 3
25 | show_sizeof(gets_bigger)
26 | 
27 | gets_bigger += 102947298469128649161972364837164
28 | show_sizeof(gets_bigger)
29 | 
30 | 
31 | 
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
11 | 
12 | 
17 | 
18 | 
19 | 
20 | 
21 | 
22 | 1532015875915
23 | 
27 | 
28 | 
29 | 
30 | 
31 | 
32 | 
33 | 
--------------------------------------------------------------------------------
/tests/strip_if_not_none.py:
--------------------------------------------------------------------------------
1 | from lib.data_helper import DataHelper as dh
2 | 
3 | 
4 | # single trailing special char ("sc")
5 | test_text_1 = "this is my text)"
6 | test_result_1 = dh.strip_if_not_none(test_text_1, ")., ")
7 | test_result_1s = test_text_1.strip(")., ")
8 | test_result_1r = dh.remove_multiple_outbound_chars(test_text_1)
9 | 
10 | # multi trail sc
11 | test_text_2 = "this is my text)..."
12 | test_result_2 = dh.strip_if_not_none(test_text_2, ")., ")
13 | test_result_2s = test_text_2.strip(")., ")
14 | test_result_2r = dh.remove_multiple_outbound_chars(test_text_2)
15 | 
16 | 
17 | # single start sc multi trail sc
18 | test_text_3 = ")this is my text)..."
19 | test_result_3 = dh.strip_if_not_none(test_text_3, ")., ")
20 | test_result_3s = test_text_3.strip(")., ")
21 | test_result_3r = dh.remove_multiple_outbound_chars(test_text_3)
22 | 
23 | # multi start sc multi trail sc
24 | test_text_4 = ")....this is my text)..."
25 | test_result_4 = dh.strip_if_not_none(test_text_4, ")., ")
26 | test_result_4s = test_text_4.strip(")., ")
27 | test_result_4r = dh.remove_multiple_outbound_chars(test_text_4)
28 | 
29 | 
30 | # with spaces
31 | test_text_5 = ").. ..this is my text). .."
32 | test_result_5 = dh.strip_if_not_none(test_text_5, ")., ")
33 | test_result_5s = test_text_5.strip(")., ")
34 | test_result_5r = dh.remove_multiple_outbound_chars(test_text_5)
35 | 
36 | 
37 | # non-pattern break
38 | test_text_6 = ").(...this is my text).(.."
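# "(" is not part of the strip set ")., ", so all three variants below should
# stop at the first "(" they reach from either end; this case presumably
# guards against over-stripping past foreign characters.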
39 | test_result_6 = dh.strip_if_not_none(test_text_6, ")., ")
40 | test_result_6s = test_text_6.strip(")., ")
41 | test_result_6r = dh.remove_multiple_outbound_chars(test_text_6)
42 | 
43 | print("done")
44 | 
45 | 
46 | 
47 | # strip for comparison
48 | 
49 | test_strip = " u."
50 | test_strip_1 = test_strip.strip(". ")
51 | 
52 | 
53 | print("done2")
54 | 
55 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | output/
2 | output_save/
3 | laptopdata/
4 | AKFII_ocromore_results_local/
5 | 
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 | 
11 | # C extensions
12 | *.so
13 | 
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 | 
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 | 
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 | 
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | .hypothesis/
52 | .pytest_cache/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | 
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 | 
67 | # Scrapy stuff:
68 | .scrapy
69 | 
70 | # Sphinx documentation
71 | docs/_build/
72 | 
73 | # PyBuilder
74 | target/
75 | 
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 | 
79 | # pyenv
80 | .python-version
81 | 
82 | # celery beat schedule file
83 | celerybeat-schedule
84 | 
85 | # SageMath parsed files
86 | *.sage.py
87 | 
88 | # Environments
89 | .env
90 | .venv
91 | env/
92 | venv/
93 | ENV/
94 | env.bak/
95 | venv.bak/
96 | 
97 | # Spyder project settings
98 | .spyderproject
99 | .spyproject
100 | 
101 | # Rope project settings
102 | .ropeproject
103 | 
104 | # mkdocs documentation
105 | /site
106 | 
107 | # mypy
108 | .mypy_cache/
--------------------------------------------------------------------------------
/experiments/experiment_fuzzy_regex.py:
--------------------------------------------------------------------------------
1 | import re
2 | import regex  # backwards-compatible with 're', but with additional functionality
3 | # https://pypi.org/project/regex/ ---> 'fuzzy'-matches
4 | from akf_corelib.regex_util import RegexUtil as regu
5 | 
6 | test_texts = [
7 |     "Fernschreiber:",
8 |     "Fernschreiber :",
9 |     "F3rnschreiber:",
10 |     "F3pnschreiber:",
11 |     "ernschreiber:",
12 |     "ernschr3iber:",
13 |     "Fernschreiber!",
14 |     "asdwevc!"
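    # the two entries above probe the limits: "Fernschreiber!" is one
    # substitution away and should still match with e<=1, while "asdwevc!"
    # should fail even with fuzzy matching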
15 | ]
16 | 
17 | 
18 | example = regex.fullmatch(r"(?:cats|cat){e<=1}", "cat").fuzzy_counts
19 | print("Example is:", example)
20 | 
21 | def regexfuzzy_search(pattern, text, err_number=2):
22 |     compiled_wrapper = regex.compile(r"(?:" + pattern + "){e<=" + str(err_number) + "}")
23 |     result = compiled_wrapper.search(text)
24 |     return result
25 | 
26 | 
27 | # costs of insert, delete, substitute can be defined {2i+2d+1s<=4}: each insertion costs 2 etc.
28 | def test_1():
29 |     for text in test_texts:
30 |         compiled = regex.compile(r"(?:^Fernschreiber\s?:){e<=1}")
31 |         match_stop = compiled.search(text)
32 |         if match_stop is not None:
33 |             (substs, inserts, deletions) = match_stop.fuzzy_counts
34 |             accumulated_errs = substs + inserts + deletions
35 | 
36 |             print("Text is:", text, "Match is True", "Errors:", (substs, inserts, deletions))
37 |         else:
38 |             print("Text is:", text, "Match is False", "Errors: higher than limit")
39 | 
40 | 
41 | # search with dynamic wrapper function (better looking regex)
42 | for text in test_texts:
43 |     match_stop = regexfuzzy_search("^Fernschreiber\s:", text)
44 |     if match_stop is not None:
45 |         (substs, inserts, deletions) = match_stop.fuzzy_counts
46 |         accumulated_errs = substs + inserts + deletions
47 | 
48 |         print("Text is:", text, "Match is True", "Errors:", (substs, inserts, deletions))
49 |     else:
50 |         print("Text is:", text, "Match is False", "Errors: higher than limit")
51 | 
52 | 
53 | 
54 | match_shorter_text, errs = regu.fuzzy_search("^Texte", "Text", err_number=2)
55 | #if match_shorter_text:
56 | #    result = match_shorter_text.text
57 | 
58 | # jk example
59 | match_shorter_text2, errs2 = regu.fuzzy_search("^rückstellungen$", "rücksstellungen", err_number=2)
60 | if match_shorter_text2:
61 |     result = match_shorter_text2.text
--------------------------------------------------------------------------------
/lib/akf_known_uncategories.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | class KnownUncategories(object):
4 |     """
5 |     List of known entries in test_data which are not categories,
6 |     but are recognized as such
7 |     """
8 | 
9 |     def __init__(self):
10 | 
11 |         # un-category regex strings (care for commas)
12 |         self.uc = [
13 |             "Beteiligung",  # 1956: is part of Beteiligungen
14 |             "Ferngespräche",  # 1956: is part of Fernruf/Telefon
15 |             "Kapital",  # 1956: is part of multiple top-level items
16 |             "Umstellung \d\d?",  # 1956: is part of Grundkapital or other
17 |             "Dividenden ab \d{4}.*",  # 1956: is part of Dividenden or other (with year or yearspan)
18 |             "^Kurs.*",  # 1956: second level tag
19 |             "ab \d{4}(\/\d{2})?"  # 1956: i.e. "ab 1949/50"-part of other categories
20 |         ]
21 | 
22 |         # non-specific keys (which do not get removed from original-rest in analysis)
23 |         self.nkeys = [
24 |             "street",
25 |             "street_number",
26 |             "additional_info",
27 |             "city",
28 |             "name",
29 |             "title",
30 |             "rest",
31 |             "location",
32 |             "number_Sa.-Nr.",
33 |             "rest_info",
34 |             "bank",
35 |             "title",  # note: duplicate of the entry above
36 |             "amount",
37 |             "ord_number",
38 |             "organization",
39 | 
40 |         ]
41 | 
42 |         # create corresponding regexes
43 |         self.uc_regex = []
44 |         for item in self.uc:
45 |             regex_compiled = re.compile(item)
46 |             self.uc_regex.append(regex_compiled)
47 | 
48 |     @property
49 |     def uncategories(self):
50 |         return self.uc
51 | 
52 |     @property
53 |     def unkeys(self):
54 |         return self.nkeys
55 | 
56 |     def check_uncategories(self, text_to_check):
57 |         """
58 |         Allows to compare a tag against the existing uncategories
59 |         :param text_to_check: tag text
60 |         :return: True if un-category, False if not
61 |         """
62 |         for regex_to_check in self.uc_regex:
63 |             match_result = regex_to_check.search(text_to_check)
64 |             if match_result is not None:
65 |                 return True
66 | 
67 |         return False
--------------------------------------------------------------------------------
/.idea/codeStyles/Project.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
11 | 
12 | 
13 | 
14 | 
15 | 
21 | 
22 | 
26 | 
27 | 
28 | 
29 | 
35 | 
36 | 
37 | 
38 | 
39 | 
45 | 
46 | 
50 | 
51 | 
52 | 
--------------------------------------------------------------------------------
/lib/akf_parsing_functions_tables_one.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from .data_helper import DataHelper as dh
4 | from .akf_parsing_functions_common import AKFCommonParsingFunctions as cf
5 | from akf_corelib.regex_util import RegexUtil as regu
6 | 
7 | import regex
8 | 
9 | 
10 | class AkfParsingFunctionsTablesOne(object):
11 | 
12 |     def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
13 |         config_handler = ConfigurationHandler(first_init=False)
14 | 
15 |         self.config = config_handler.get_config()
16 |         self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_TABLES_ONE, self.config.PRINT_EXCEPTION_LEVEL,
17 |                                     self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
18 | 
19 |         self.cpr.print("init akf parsing functions tables one")
20 | 
21 |         self.ef = endobject_factory
22 |         self.output_analyzer = output_analyzer
23 |         self.dictionary_handler = dictionary_handler
24 | 
25 | 
26 |     def parse_aktienkurse(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
27 |         # get basic data
28 |         element_counter = 0
29 |         origpost, origpost_red, element_counter, content_texts = \
30 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
31 | 
32 |         # logme
33 |         self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
34 | 
35 | 
36 |     def parse_dividenden(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
37 |         # get basic data
38 |         element_counter = 0
39 |         origpost, origpost_red, element_counter, content_texts = \
40 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
41 | 
42 |         # logme
43 |         self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
44 | 
45 | 
46 |     def parse_dividenden_auf_xyaktien(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
47 |         # get basic data
48 |         element_counter = 0
49 |         origpost, origpost_red, element_counter, content_texts = \
50 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
51 | 
52 |         # logme
53 |         self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
54 | 
--------------------------------------------------------------------------------
/lib/dictionary_handler.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from .data_helper import DataHelper as dh
4 | 
5 | import regex
6 | import json
7 | import os
8 | 
9 | 
10 | class DictionaryHandler(object):
11 | 
12 |     def __init__(self):
13 |         config_handler = ConfigurationHandler(first_init=False)
14 | 
15 |         self.config = config_handler.get_config()
16 |         self.cpr = ConditionalPrint(self.config.PRINT_DICTIONARY_HANDLER, self.config.PRINT_EXCEPTION_LEVEL,
17 |                                     self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
18 | 
19 |         self.cpr.print("init dictionary handler")
20 |         self.data_functs = None  # storage for json object
21 |         self.data_titles = None  # storage for json object
22 |         self.texts_functs = None
23 |         self.texts_titles = None
24 |         if self.config.USE_DICTIONARIES_FOR_PERSON_PARSING:
25 |             self.load_dictionaries()
26 |             # get the rows as sorted list of texts, longest first
27 |             if self.data_functs is not None:
28 |                 check_tf = self.sort_rows(self.get_rows(self.data_functs))
29 |                 self.texts_functs = check_tf
30 |             if self.data_titles is not None:
31 |                 check_tt = self.sort_rows(self.get_rows(self.data_titles))
32 |                 self.texts_titles = check_tt
33 | 
34 |     def diff_name_title(self, text_to_check):
35 | 
36 |         len_text_to_check = len(text_to_check)
37 |         name_found = text_to_check
38 |         title_found = ""
39 | 
40 |         for entry_index, entry in enumerate(self.texts_titles):
41 |             title, tlen = entry
42 |             # accelerate the process by skipping comparisons with longer texts
43 |             if tlen > len_text_to_check:
44 |                 continue
45 |             # compare the texts
46 |             if title in text_to_check:
47 |                 name_found = text_to_check.replace(title, "", 1).strip()
48 |                 title_found = title
49 |                 break
50 | 
51 | 
52 |         return name_found, title_found
53 | 
54 |     def load_dictionaries(self):
55 |         base_dict_path = self.get_dict_path()
56 | 
57 |         filepath_titles_dict = os.path.join(base_dict_path, "dict_titles.json")
58 |         filepath_functs_dict = os.path.join(base_dict_path, "dict_functs.json")
59 | 
60 |         # load titles
61 |         if os.path.exists(filepath_titles_dict):
62 |             with open(filepath_titles_dict) as f:
63 |                 self.data_titles = json.load(f)
64 |         else:
65 |             self.cpr.printex("dictionary dict_titles.json missing at specified path", filepath_titles_dict)
66 | 
67 |         # load functs
68 |         if os.path.exists(filepath_functs_dict):
69 |             with open(filepath_functs_dict) as f:
70 |                 self.data_functs = json.load(f)
71 |         else:
72 |             self.cpr.printex("dictionary dict_functs.json missing at specified path", filepath_functs_dict)
73 | 
74 | 
75 |     def get_rows(self, dict_data):
76 |         rows = dict_data['rows']
77 |         final_rows = []
78 |         for entry in rows:
79 |             text = entry[0]
80 |             final_rows.append((text, len(text)))
81 |         return final_rows
82 | 
83 |     def sort_rows(self, rows):
84 |         # itemgetter(1),
85 |         rows.sort(key=lambda t: len(t[0]), reverse=True)  # longest text first, so longer titles match before their substrings in diff_name_title
86 |         return rows
87 | 
88 |     def path(self):
89 |         return os.getcwd()
90 | 
91 |     def get_dict_path(self):
92 |         complete = os.path.join(self.path(), "additionals", "dictionaries")
93 |         return complete
--------------------------------------------------------------------------------
/lib/segment.py:
--------------------------------------------------------------------------------
1 | import abc
2 | 
3 | 
4 | 
5 | class Segment(object):
6 |     """
7 |     Root segment class for the classification segments;
8 |     child specialized segments are stored in the SegmentHolder
9 |     class.
10 |     """
11 |     __metaclass__ = abc.ABCMeta
12 | 
13 |     def __init__(self, segment_tag):
14 |         self.start_was_segmented = False
15 |         self.stop_was_segmented = False
16 |         self.start_error_number = 0
17 |         self.stop_error_number = 0
18 | 
19 |         self.enabled = True
20 |         self.only = False
21 |         self.start_line_index = -1
22 |         self.stop_line_index = -1
23 |         self.key_tag_cindex_start = -1  # character index of keytag: 'Vorstand: Name' ---> 0
24 |         self.key_tag_cindex_stop = -1  # character index of keytag: 'Vorstand: Name' ---> 9
25 |         self.restcontent_in_start_line = -1
26 |         self.segment_tag = segment_tag
27 |         self.snippet = None
28 |         self.info_handler = None
29 | 
30 |     def disable(self):
31 |         self.enabled = False
32 | 
33 |     def set_only(self):
34 |         self.only = True
35 | 
36 |     def set_start_error_number(self, start_error_number):
37 |         self.start_error_number = start_error_number
38 | 
39 |     def get_start_error_number(self):
40 |         return self.start_error_number
41 | 
42 |     def set_stop_error_number(self, stop_error_number):
43 |         self.stop_error_number = stop_error_number
44 | 
45 |     def get_stop_error_number(self):
46 |         return self.stop_error_number
47 | 
48 |     def get_start_line_index(self):
49 |         return self.start_line_index
50 | 
51 |     def get_stop_line_index(self):
52 |         return self.stop_line_index
53 | 
54 |     def get_segment_tag(self):
55 |         return self.segment_tag
56 | 
57 |     def do_match_work(self, start_or_stop, match, line_index, match_errors):
58 |         if start_or_stop is True:  # it's a start match
59 |             self.set_keytag_indices(match)  # this separates keytag from rest of line
60 |             self.start_line_index = line_index
61 |             self.start_was_segmented = True
62 |             self.set_start_error_number(match_errors)
63 |         else:
64 |             self.stop_line_index = line_index
65 |             self.stop_was_segmented = True
66 |             self.set_stop_error_number(match_errors)
67 | 
68 |     @abc.abstractmethod
69 |     def match_start_condition(self, line, line_text, line_index, features, num_lines, prev_line, combined_texts):
70 |         return  # 0 # return number 0 for indicating undefined, don't return this in overwritten conditions
71 | 
72 |     @abc.abstractmethod
73 |     def match_stop_condition(self, line, line_text, line_index, features, num_lines, prev_line, combined_texts):
74 |         # by default don't assign any stop condition, leave at initial value
75 |         # self.stop_line_index = self.start_line_index
76 |         return  # 0 # return number 0 for indicating undefined, don't return this in overwritten conditions
77 | 
78 |     def start_or_stop_segmented(self):
79 |         if self.start_was_segmented or self.stop_was_segmented:
80 |             return True
81 |         else:
82 |             return False
83 | 
84 |     def is_start_segmented(self):
85 |         return self.start_was_segmented
86 | 
87 |     def is_stop_segmented(self):
88 |         return self.stop_was_segmented
89 | 
90 |     def set_stop_segmented(self, stop_index):
91 |         self.stop_line_index = stop_index
92 |         self.stop_was_segmented = True
93 | 
94 |     def set_start_segmented(self, start_index):
95 |         self.start_line_index = start_index
96 |         self.start_was_segmented = True
97 | 
98 |     def set_keytag_indices(self, match):
99 |         """
100 |         From regex match set the keytag indices, takes 1st occurrence,
101 |         also checks if there is restcontent besides the match in the
102 |         line to check
103 |         :param match: regex match
104 |         :return:
105 |         """
106 |         start_m = match.regs[0][0]
107 |         stop_m = match.regs[0][1]
108 | 
109 |         self.key_tag_cindex_start = start_m
110 |         self.key_tag_cindex_stop = stop_m
111 |         len_match = stop_m - start_m
112 |         len_rest = len(match.string) - len_match
113 |         if len_rest > 0:
114 |             self.restcontent_in_start_line = len_rest
--------------------------------------------------------------------------------
/additionals/dictionaries/dictionary_income.json:
--------------------------------------------------------------------------------
1 | {
2 |   "Zusatz":
3 |   {
4 |     "darunter": "",
5 |     "Sonstiges": "",
6 |     "Sonst.": "",
7 |     "Langfristige": "",
8 |     "Langfr.": "",
9 |     "Durchlaufende": "",
10 |     "dauernde": ""
11 |   },
12 |   "Hauptpunkte":
13 |   {
14 |     "rechnungen Löhne und Gehälter": "Löhne und Gehälter",
15 |     "Löhne und Gehälter": "Löhne und Gehälter",
16 |     "Abschreibungen u. Werberichtigung": "Abschreibungen und Werberichtigung",
17 |     "Abschreibungen": "Abschreibungen",
18 |     "Jahresrohertrag": "Jahresrohertrag",
19 |     "Jahresertrag": "Jahresertrag",
20 |     "Steuern": "Steuern",
21 |     "Beteiligungserträge": "Beteiligungserträge",
22 |     "Gehälter": "Gehälter",
23 |     "Ausweispfl. Steuern": "Ausweispfl. Steuern",
24 |     "Personalaufwendungen": "Personalaufwendungen",
25 |     "Abschreibungen auf Anlagen": "Abschreibungen auf Anlagen",
26 |     "auf Anlagen": "Abschreibungen auf Anlagen",
27 |     "Steuern u. ähnl. Abgaben": "Steuern und ähnl. Abgaben",
28 |     "Steuern und ähnl. Abgaben": "Steuern und ähnl. Abgaben",
29 |     "Dividenden aus Beteiligungen": "Dividenden aus Beteiligungen",
30 |     "Zinsen u. Diskonterträge": "Zinsen u. Diskonterträge",
31 |     "Provisionen, Gebühren u.ähnl. Erträge": "Provisionen, Gebühren u.ähnl. Erträge",
32 |     "Zins und Diskonterträge": "Zins und Diskonterträge",
33 |     "Zinsaufwendungen": "Zinsaufwendungen",
34 |     "Zins und Provisionserträge": "Zins und Provisionserträge",
35 |     "Betriebsergebnisse": "Betriebsergebnisse",
36 |     "Verwaltungskosten": "Verwaltungskosten",
37 |     "Steuern und öffentliche Abgaben": "Steuern und öffentliche Abgaben",
38 |     "Zahlungen für Versicherungsfälle": "Zahlungen für Versicherungsfälle",
39 |     "Beitragseinnahmen": "Beitragseinnahmen",
40 |     "Erträge aus Beteiligungen": "Erträge aus Beteiligungen",
41 |     "Steuern u.öffentliche Abgaben": "Steuern und öffentliche Abgaben",
42 |     "Beteiligungsertrag": "Beteiligungsertrag",
43 |     "Rückversicherungsbeiträge": "Rückversicherungsbeiträge",
44 |     "Verschiedene Unkosten": "Verschiedene Unkosten",
45 |     "Allgemeine Unkosten": "Allgemeine Unkosten",
46 |     "Zinserträge": "Zinserträge",
47 |     "Ertragssteuern": "Ertragssteuern",
48 |     "Abschreibungen und Wertberichtigungen": "Abschreibungen und Wertberichtigungen",
49 |     "Wertberichtigung": "Wertberichtigung",
50 |     "Gebühren und ähnl. Erträge": "Gebühren und ähnl. Erträge",
51 |     "Zinsen und Diskonterträge": "Zinsen und Diskonterträge",
52 |     "Allg. Unkosten, Gehälter u. Steuern": "Allg. Unkosten, Gehälter u. Steuern",
53 |     "Provisionen und sonst. Erträge": "Provisionen und sonst. Erträge",
54 |     "Verschiedene Einnahmen": "Verschiedene Einnahmen",
55 |     "Verwaltungsunkosten": "Verwaltungsunkosten",
56 |     "Warenkonto": "Warenkonto",
57 |     "ähnl. einmalige Erträge": "ähnl. einmalige Erträge",
58 |     "Gehälter u. Pensionen": "Gehälter u. Pensionen",
59 |     "Zins u. Prov. Einn.": "Zins u. Prov. Einn.",
60 |     "Provisionen, Gebühren und ähnl.Erträge": "Provisionen, Gebühren und ähnl.Erträge",
61 |     "Anlagevermögen und anderes": "Anlagevermögen und anderes",
62 |     "Steuern und soziale Aufwendungen": "Steuern und soziale Aufwendungen",
63 |     "sehr.u.aufgenommene Darlehen": "sehr.u.aufgenommene Darlehen",
64 |     "Darlehensprovisionen etc.": "Darlehensprovisionen etc.",
65 |     "Erstattung der Liquidationskosten": "Erstattung der Liquidationskosten",
66 |     "Steuern und Umlagen": "Steuern und Umlagen",
67 |     "Provisionen u. sonstige Erträge": "Provisionen und sonst. Erträge",
68 |     "Zins u. ähnliche Erträge": "Zins u. ähnliche Erträge",
69 |     "Jahreseinnahmen": "Jahreseinnahmen",
70 |     "Gehälter und Pensionen": "Gehälter und Pensionen",
71 |     "Rohüberschuss": "Rohüberschuss",
72 |     "Miet und Pachterträge": "Miet und Pachterträge",
73 |     "Nettoverkaufserlös": "Nettoverkaufserlös",
74 |     "EEV-Steuern u. LAG": "EEV-Steuern u. LAG",
75 |     "Jahresüberschuß": "Jahresüberschuß",
76 |     "Jahresfehlbetrag": "Jahresfehlbetrag",
77 |     "Herstellungsaufwand": "Herstellungsaufwand",
78 |     "Sachaufwand": "Sachaufwand",
79 |     "Einmalige Aufwendungen": "Einmalige Aufwendungen",
80 |     "Gehälter, Löhne u. Sozialabgaben": "Gehälter, Löhne u. Sozialabgaben",
81 |     "Überschuß des Geschäftsjahres": "Überschuß des Geschäftsjahres",
82 |     "EEV-Steuern": "EEV-Steuern",
83 |     "Besitzsteuern": "Besitzsteuern",
84 |     "Gewinnabführung": "Gewinnabführung",
85 |     "Materialaufwand, Fremdleistung": "Materialaufwand und Fremdleistung",
86 |     "Materialaufwand": "Materialaufwand",
87 |     "Verkaufs und allgemeine Unkosten": "Verkaufs und allgemeine Unkosten",
88 |     "Vermögensfreigaben im Ausland": "Vermögensfreigaben im Ausland",
89 |     "Personalaufwand": "Personalaufwand",
90 |     "Betriebsgewinnanteil": "Betriebsgewinnanteil",
91 |     "Betriebsausgaben": "Betriebsausgaben",
92 |     "Leistungen für Versicherungsfälle": "Leistungen für Versicherungsfälle",
93 |     "Erträge des eigenen Verkehrsbetriebes": "Erträge des eigenen Verkehrsbetriebes",
94 |     "Produktions u. Verwaltungskosten": "Produktions u. Verwaltungskosten",
95 |     "Wertberichtigungen u. Rückstellungen": "Wertberichtigungen u. Rückstellungen",
96 |     "Personalaufwendungen ohne Sozialleistungen": "Personalaufwendungen ohne Sozialleistungen",
97 |     "Gebührenrohüberschuss": "Gebührenrohüberschuss"
98 |   },
99 |   "Unterpunkte": {}
100 | }
--------------------------------------------------------------------------------
/lib/additional_info_handler.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | 
4 | 
5 | import json
6 | import glob
7 | import pandas as pd
8 | from os import path
9 | 
10 | 
11 | 
12 | class AdditionalInfoHandler(object):
13 | 
14 |     def __init__(self):
15 |         config_handler = ConfigurationHandler(first_init=False)
16 | 
17 |         self.config = config_handler.get_config()
18 |         self.cpr = ConditionalPrint(self.config.PRINT_ADDITIONAL_INFO_HANDLER, self.config.PRINT_EXCEPTION_LEVEL,
19 |                                     self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
20 |         self.cpr.print("init additional info handler")
21 | 
22 | 
23 |     def write_excel_to_json(self, fileinfo, filepath, filetype, idxcol=None, parse_cols=None, page=0):
24 |         """
25 |         At the moment a little helper script for the Aktienführer project.
26 |         Feel free to modify as you wish.
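
        A rough usage sketch (illustrative only; it assumes a fileinfo object
        exposing .dbname, as used in the body below):

            handler = AdditionalInfoHandler()
            handler.write_excel_to_json(fileinfo, filepath="./add_info", filetype="xlsx")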
27 | """ 28 | #if isinstance(parse_cols, list): parse_cols = [parse_cols], 29 | additional_filepath = path.normpath(f"{filepath}/**/*{fileinfo.dbname}.{filetype}") 30 | file = glob.glob(additional_filepath,recursive=True) 31 | if len(file)!= 1: return None 32 | if filetype in ["xlsx","xls"]: 33 | df = pd.read_excel(file[0]).set_index("ProfileID") 34 | jsondata = {fileinfo.dbname:{"Year":fileinfo.dbname}} 35 | jsondf = df.to_dict(orient="index") 36 | jsondata.update(jsondf) 37 | with open(file[0].replace("xlsx","json"),"w") as output: 38 | json.dump(jsondata, output,indent=4) 39 | return None 40 | 41 | def fetch_additional_information_simple(self, file): 42 | """ 43 | Same as fetch additional information, but config related info is already included in given 44 | parameters 45 | :return: additional info 46 | """ 47 | if self.config.ADDITIONAL_INFORMATION: 48 | additional_info = self.fetch_additional_information(file, self.config.INPUT_ADDINFOPATH, 49 | idxcol= self.config.IDXCOL,parse_cols=self.config.PARSE_COLS, 50 | filetype =self.config.INPUT_ADDINFOFILETPYE) 51 | return additional_info 52 | 53 | return None 54 | 55 | def fetch_additional_information(self, fileinfo, filepath, filetype, idxcol=None, parse_cols=None, page=0): 56 | """ 57 | Reads an additional file with information 58 | It searches the file where the index_name matches tablename or dbname 59 | :param file: 60 | :param index_name: 61 | :return: additional info 62 | """ 63 | #if isinstance(parse_cols, list): parse_cols = [parse_cols] 64 | additional_filepath = path.normpath(f"{filepath}/**/*{fileinfo.dbname}.{filetype}") 65 | file = glob.glob(additional_filepath,recursive=True) 66 | 67 | len_files = len(file) 68 | if len_files > 1: 69 | self.cpr.printex("More than one additional information file was found!") 70 | return None 71 | if len_files == 0: 72 | self.cpr.printex("No additional information file was found!") 73 | return None 74 | 75 | file = file[0] 76 | current_db_and_table = {"db": fileinfo.dbname, "table": fileinfo.tablename} 77 | if filetype in ["xlsx","xls"]: 78 | infos = {} 79 | info_df = pd.read_excel(file)#.set_index("ProfileID") 80 | parse_cols.remove(idxcol) 81 | for db_and_table_id, current_db_and_tablename in current_db_and_table.items(): 82 | infos[db_and_table_id] = {} 83 | for line, rubric_content in info_df.loc[info_df[idxcol]==current_db_and_tablename][parse_cols].to_dict(orient="index").items(): 84 | for rubric, content in rubric_content.items(): 85 | if rubric != idxcol: 86 | if infos[db_and_table_id].get(rubric,None) is None: 87 | infos[db_and_table_id][rubric] = content 88 | elif infos[db_and_table_id].get(rubric,None) != content: 89 | if not isinstance(infos[db_and_table_id][rubric], list): infos[db_and_table_id][rubric] = [infos[db_and_table_id][rubric]] 90 | infos[db_and_table_id][rubric].append(content) 91 | elif filetype == "json": 92 | with open(file, "r") as add_info_file: 93 | infos = json.load(add_info_file) 94 | 95 | for possible_db_or_tablenames in reversed(list(infos.keys())): 96 | possible_db_or_tablenames_orig = possible_db_or_tablenames # unchanged name 97 | 98 | if self.config.ADD_INFO_SIMPLIFIED_NAME_COMPARISON: 99 | psplit = possible_db_or_tablenames.split("-") 100 | possible_db_or_tablenames = psplit[0] 101 | 102 | if possible_db_or_tablenames not in current_db_and_table['table']: 103 | del infos[possible_db_or_tablenames_orig] 104 | else: 105 | for db_and_table_id, current_db_and_tablename in current_db_and_table.items(): 106 | if possible_db_or_tablenames == 
current_db_and_tablename: 107 | infos[db_and_table_id] = infos[possible_db_or_tablenames_orig] 108 | del infos[possible_db_or_tablenames_orig] 109 | else: 110 | return None 111 | return infos -------------------------------------------------------------------------------- /lib/snippet_ocr.py: -------------------------------------------------------------------------------- 1 | from os import path, makedirs 2 | import numpy as np 3 | from PIL import Image 4 | from tesserocr import PyTessBaseAPI, RIL, iterate_level 5 | 6 | class Snippet(object): 7 | """ This library works with bbox on the original image - 8 | - Snip the bbox out of the image 9 | - OCR the snippet with tesseract gives text and bbox per word and confidences per char 10 | - Store the snippet """ 11 | 12 | def __init__(self): 13 | self.bbox = None 14 | self.imgpath = None 15 | self.imgname = None 16 | self.ftype = None 17 | self.fname = None 18 | self.img = None 19 | self.shape = None 20 | self.snippet = None 21 | self.result = None 22 | self.__ocr_settings = {"lang":"akf3","psm":6,"oem":3} 23 | 24 | 25 | def imread(self, imgpath): 26 | """Loads the image with PIL-Lib""" 27 | try: 28 | self.imgpath = imgpath 29 | self.imgname = path.basename(imgpath) 30 | self.ftype = self.imgname.split(".")[-1] 31 | if self.ftype.lower() not in ["jpg", "png", "bmp", "gif", "tiff"]: 32 | raise NameError 33 | self.img = Image.open(f"{self.imgpath}") 34 | self.snippet = self.img 35 | self.shape = list(self.img.tile[0][1]) #[:2]+self.img.tile[0][1][4:1:-1]) 36 | self.bbox = self.shape 37 | except IOError: 38 | print(f"cannot open {self.imgpath}") 39 | except NameError: 40 | print(f"The image filetype {self.ftype} is not supported!") 41 | return True 42 | 43 | def save(self, snippetpath:str): 44 | """Saves the snippet""" 45 | try: 46 | if self.imgname is None: 47 | raise NameError 48 | if not path.exists(snippetpath): 49 | makedirs(snippetpath) 50 | bboxstr = "_".join(str(bboxval) for bboxval in self.bbox) 51 | self.fname = snippetpath + self.imgname.split(".")[0] + "_bbox_" + bboxstr + "." + ".".join(self.imgname.split(".")[1:]) 52 | self.snippet.save(self.fname) 53 | except NameError: 54 | print("Please load an image first.") 55 | except Exception as E: 56 | print(f"{self.fname} could not be stored:{E}") 57 | return True 58 | 59 | def crop(self, bbox:list): 60 | """Snip the bboxarea out of the image""" 61 | try: 62 | if self.img is None: 63 | raise NameError 64 | if any(np.less(bbox[:2],self.shape[:2])) or any(np.greater(bbox[2:4],self.shape[2:4])): 65 | raise ValueError 66 | if not isinstance(bbox,list) or len(bbox) != 4: 67 | raise TypeError 68 | if bbox != self.bbox: 69 | self.bbox = bbox[:] 70 | self.snippet = self.img.crop(self.bbox) 71 | except TypeError: 72 | print("The bbox has not the right type or format.") 73 | except NameError: 74 | print("Please load an image first.") 75 | except ValueError as E: 76 | print(f"The bbox shape doesnt match the image shape. 
{E}") 77 | except Exception as E: 78 | print(E) 79 | else: 80 | return True 81 | return False 82 | 83 | @property 84 | def ocr_settings(self): 85 | return self.__ocr_settings 86 | 87 | @ocr_settings.setter 88 | def ocr_settings(self, lang=None,psm=None,oem=None): 89 | """Set the parameter from tesseracts""" 90 | if lang is not None: 91 | self.__ocr_settings["lang"] = lang 92 | if psm is not None: 93 | self.__ocr_settings["psm"] = psm 94 | if oem is not None: 95 | self.__ocr_settings["oem"] = oem 96 | return 97 | 98 | def to_text(self): 99 | """Performs tesseract on the snippet""" 100 | try: 101 | if self.bbox is None: 102 | raise ValueError 103 | with PyTessBaseAPI(**self.ocr_settings) as api: 104 | api.SetImage(self.snippet) 105 | api.Recognize() 106 | ri = api.GetIterator() 107 | conf = [] 108 | line = -1 109 | self.result=[] 110 | for r in iterate_level(ri, RIL.SYMBOL): 111 | if r.Empty(RIL.TEXTLINE):continue 112 | if r.IsAtBeginningOf(RIL.TEXTLINE): 113 | line += 1 114 | self.result.append({"text":"","words":[],"charconf":[],"bbox":[]}) 115 | self.result[line]["text"] = r.GetUTF8Text(RIL.TEXTLINE) 116 | #print(r.GetUTF8Text(RIL.TEXTLINE)) 117 | if r.IsAtFinalElement(RIL.WORD,RIL.SYMBOL): 118 | self.result[line]["words"].append(r.GetUTF8Text(RIL.WORD)) 119 | self.result[line]["bbox"].append(r.BoundingBoxInternal(RIL.WORD)) 120 | self.result[line]["charconf"].append(conf) 121 | conf = [] 122 | conf.append(r.Confidence(RIL.SYMBOL)) 123 | if conf: 124 | self.result[line]["charconf"].append(conf) 125 | except ValueError: 126 | print("Please first set the bbox value with snip_bbox.") 127 | return True 128 | 129 | @property 130 | def text(self): 131 | if self.result: 132 | text = "" 133 | for line in self.result: 134 | text += line["text"] 135 | return text 136 | else: 137 | return "" 138 | 139 | 140 | 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /configuration/config_parse_hocr_jk.conf: -------------------------------------------------------------------------------- 1 | 2 | INPUT_FILETYPES = [hocr, untype] 3 | #INPUT_FILEGLOB = ./AKFII_ocromore_results_local/msa_best/**/*. # local test folder 4 | # INPUT_FILEGLOB = /media/johannes/AKFII/AKF/AKFII_ocromore_results/msa_best/**/*. # this is the hocr-output of ocromore 5 | # INPUT_FILEGLOB = /media/sf_Transfer/testfiles_hocr/**/*. 6 | # INPUT_FILEGLOB = laptopdata/testfiles_hocr/**/*. 7 | INPUT_FILEGLOB = /media/sf_ShareVB/msa_best/all_years/**/*. 
# jk this is the hocr-output of ocromore 8 | 9 | USE_SNIPPET = True # Use the snippet tool for reocring snippets of the orig image 10 | IMAGE_PATH = /media/sf_ShareVB/ # Storing path 11 | DRAW_SEPARATOR = False # Save tablecutouts with separator drawn 12 | SAVE_SNIPPET= False # Use Toolbbox methods (you have to installed tesseract, tesserocr) 13 | IMGPATH = ./img/ # ./ -> relative to inputpath 14 | OPATH = ./img/snippets/ # ./ -> relative to inputpath 15 | 16 | INPUT_TABLE_DICTIONARY = ./additionals/dictionaries/ # Path to dictionaries 17 | USE_TABLE_DICTIONARY = True # Use to dictionaries to correct, split and find order level 18 | 19 | STORE_OCCURENCES = True # Storing occruencies of itemnames (tables) 20 | OCCURENCES_TABLETYPE = all # Tabletype to store [datatable_income,datatable_balance, all] 21 | 22 | 23 | [Additional informations settings] 24 | ADDITIONAL_INFORMATION = True 25 | INPUT_ADDINFOPATH = /media/sf_ShareVB/many_years_firmprofiles/additional_information/ #Additional information files 26 | #INPUT_ADDINFOPATH = /media/sf_Transfer/additional_information/ #Additional information files 27 | INPUT_ADDINFOFILETPYE = json 28 | IDXCOL = ProfileID # Column name which is matched with the tablenamen 29 | PARSE_COLS = [LABEL,ProfileID] # Columns which should be parsed to the add info 30 | 31 | TABLENAME_POS = 1 # in example '0585_...hocr' 32 | OCR_PROFILE_POS = 3 # in example: 'default' 33 | OCR_POS = 4 # in example: 'tess' 34 | DBPATH_POS = 2 # in example: '1969' 35 | 36 | 37 | OUTPUT_ROOT_PATH = ./output/ 38 | 39 | [Segmentation settings] 40 | ADD_INFO_SIMPLIFIED_NAME_COMPARISON = True # in the additional info handler, simplify the table name comparison 41 | REMATCH_START_CONDITION_UNTIL_ZERO_ERROR = True 42 | MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION = True # do the index matching until the next start tag, or,- if defined, to the next explicitly recognized stop tag, if False only Start Tags are set to Index field 43 | FILTER_UNCATEGORIES_OVERALL = True # filter the tags which are in known_uncategories in the accumulated segmenation report 44 | #todo add multimatch output in logging 45 | [Parsing settings] 46 | ADD_FULLTEXT_ENTRY = True # adds an entry at the start of json which contains the complete text to parse for verification 47 | ADD_ADDITIONAL_INFO = True # adds the additional information to the output file 48 | ADD_INFO_ENTRY_TO_OUTPUT = True # add entries to output, which contain general information about the parsed segment 49 | REMOVE_TAGS_IN_ORIG_DIFF = True # try to remove leading tags from rest in parsed output to original difference 50 | REMOVE_SPACES_IN_ORIGIN_DIFF = True # removes all spaces from rest and comparison values because spaces are often a problem in subtracting the rests 51 | USE_DICTIONARIES_FOR_PERSON_PARSING = True # uses dictionaries for function and title for the parsing and better recognition of persons 52 | 53 | 54 | [Analysis Settings] 55 | LOG_PARSED_SEGMENTED_OUTPUT = True # logs the parsed results in a file for each segmentation tag 56 | LOG_SIMPLE = False # Just simple and fast logging (without tablerecognition) 57 | LOG_PARSED_TO_ORIG_DIFF_PER_CATEGORY = True # logs the difference of parsed result and original segmented output for specific category 58 | LOG_PARSED_TO_ORIG_ADD_OUTPUT_JSON = False # in above logging add the output-json to the diff files 59 | LOG_PARSED_TO_ORIG_DIFF_ACCUMULATED = True # creates an accumulated report for differences from parsed to segmented output for each folder/akf-year 60 | LOG_SEGMENTED_TO_ORIG_DIFF_PER_FILE = 
True # (needs ADD_FULLTEXT_ENTRY enabled) logs the difference of segmented result and original segmented output for specific file/akf-table 61 | LOG_SEGMENTED_TO_ORIG_ADD_OUTPUT_JSON = True # in above logging add the output-json to the diff files 62 | LOG_SEGMENTED_TO_ORIG_DIFF_ACCUMULATED = True # creates an accumulated report for differences from segmented to original output for each folder/akf-year 63 | JOIN_SEGMENTED_TEXTS_IN_ORIG_DIFF_PER_CATEGORY = True # the segmented texts get joined by algorithm which removes dashes and so on 64 | 65 | 66 | [Print and logging settings] 67 | PRINT_WARNING_LEVEL = True # print warnings except activation in class print settings 68 | PRINT_EXCEPTION_LEVEL = True # print exceptions except activation in class print settings 69 | 70 | PRINT_MAIN = True 71 | PRINT_FEATURE_EXTRACTOR = False 72 | PRINT_ADDITIONAL_INFO_HANDLER = True 73 | PRINT_SEGMENT_CLASSIFIER = True 74 | PRINT_SEGMENT_PARSER = True 75 | PRINT_SEGMENT_PARSER_AKF_FN_ONE = False # print parsing functions related to AKF (File one) 76 | PRINT_SEGMENT_PARSER_AKF_FN_TWO = False # print parsing functions related to AKF (File two) 77 | PRINT_SEGMENT_PARSER_AKF_FN_THREE = True # print parsing functions related to AKF (File three) 78 | PRINT_SEGMENT_PARSER_AKF_FN_TABLES_ONE = True # print parsing functions related to AKF (Table specific one) 79 | PRINT_OUTPUT_ANALYSIS = False 80 | PRINT_DICTIONARY_HANDLER = True # print output related to dictionary handler -------------------------------------------------------------------------------- /configuration/config_parse_hocr_js.conf: -------------------------------------------------------------------------------- 1 | 2 | INPUT_FILETYPES = [hocr, untype] 3 | #INPUT_FILEGLOB = ./AKFII_ocromore_results_local/msa_best/**/*. # local test folder 4 | # INPUT_FILEGLOB = /media/johannes/AKFII/AKF/AKFII_ocromore_results/msa_best/**/*. # this is the hocr-output of ocromore 5 | INPUT_FILEGLOB = /media/sf_Transfer/AKFII_results/**/*. 6 | # INPUT_FILEGLOB = laptopdata/testfiles_hocr/**/*. 7 | # INPUT_FILEGLOB = /media/sf_ShareVB/many_years_firmprofiles_output/AKFII/long/**/*. 
# jk this is the hocr-output of ocromore 8 | 9 | USE_SNIPPET = True # Use the snippet tool for reocring snippets of the orig image 10 | IMAGE_PATH = /media/sf_ShareVB/ # Storing path 11 | DRAW_SEPARATOR = False # Save tablecutouts with separator drawn 12 | SAVE_SNIPPET= False # Use Toolbbox methods (you have to installed tesseract, tesserocr) 13 | IMGPATH = ./img/ # ./ -> relative to inputpath 14 | OPATH = ./img/snippets/ # ./ -> relative to inputpath 15 | 16 | INPUT_TABLE_DICTIONARY = ./additionals/dictionaries/ # Path to dictionaries 17 | USE_TABLE_DICTIONARY = True # Use to dictionaries to correct, split and find order level 18 | 19 | STORE_OCCURENCES = False # Storing occruencies of itemnames (tables) 20 | OCCURENCES_TABLETYPE = datatable_income # Tabletype to store [datatable_income,datatable_balance] 21 | 22 | [Additional informations settings] 23 | ADDITIONAL_INFORMATION = True 24 | # INPUT_ADDINFOPATH = /media/sf_ShareVB/many_years_firmprofiles/additional_information/ #Additional information files 25 | INPUT_ADDINFOPATH = /media/sf_Transfer/additional_information/ #Additional information files 26 | INPUT_ADDINFOFILETPYE = json 27 | IDXCOL = ProfileID # Column name which is matched with the tablenamen 28 | PARSE_COLS = [LABEL,ProfileID] # Columns which should be parsed to the add info 29 | 30 | TABLENAME_POS = 1 # in example '0585_...hocr' 31 | OCR_PROFILE_POS = 3 # in example: 'default' 32 | OCR_POS = 4 # in example: 'tess' 33 | DBPATH_POS = 2 # in example: '1969' 34 | 35 | 36 | OUTPUT_ROOT_PATH = ./output/ 37 | 38 | [Segmentation settings] 39 | ADD_INFO_SIMPLIFIED_NAME_COMPARISON = True # in the additional info handler, simplify the table name comparison 40 | REMATCH_START_CONDITION_UNTIL_ZERO_ERROR = True 41 | MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION = True # do the index matching until the next start tag, or,- if defined, to the next explicitly recognized stop tag, if False only Start Tags are set to Index field 42 | FILTER_UNCATEGORIES_OVERALL = True # filter the tags which are in known_uncategories in the accumulated segmenation report 43 | #todo add multimatch output in logging 44 | [Parsing settings] 45 | ADD_FULLTEXT_ENTRY = True # adds an entry at the start of json which contains the complete text to parse for verification 46 | ADD_ADDITIONAL_INFO = True # adds the additional information to the output file 47 | ADD_INFO_ENTRY_TO_OUTPUT = True # add entries to output, which contain general information about the parsed segment 48 | REMOVE_TAGS_IN_ORIG_DIFF = True # try to remove leading tags from rest in parsed output to original difference 49 | REMOVE_SPACES_IN_ORIGIN_DIFF = True # removes all spaces from rest and comparison values because spaces are often a problem in subtracting the rests 50 | USE_DICTIONARIES_FOR_PERSON_PARSING = True # uses dictionaries for function and title for the parsing and better recognition of persons 51 | 52 | 53 | [Analysis Settings] 54 | LOG_PARSED_SEGMENTED_OUTPUT = True # logs the parsed results in a file for each segmentation tag 55 | LOG_SIMPLE = True # Just simple and fast logging (without tablerecognition) 56 | LOG_PARSED_TO_ORIG_DIFF_PER_CATEGORY = True # logs the difference of parsed result and original segmented output for specific category 57 | LOG_PARSED_TO_ORIG_ADD_OUTPUT_JSON = False # in above logging add the output-json to the diff files 58 | LOG_PARSED_TO_ORIG_DIFF_ACCUMULATED = True # creates an accumulated report for differences from parsed to segmented output for each folder/akf-year 59 | 
LOG_SEGMENTED_TO_ORIG_DIFF_PER_FILE = True # (needs ADD_FULLTEXT_ENTRY enabled) logs the difference of segmented result and original segmented output for specific file/akf-table 60 | LOG_SEGMENTED_TO_ORIG_ADD_OUTPUT_JSON = True # in above logging add the output-json to the diff files 61 | LOG_SEGMENTED_TO_ORIG_DIFF_ACCUMULATED = True # creates an accumulated report for differences from segmented to original output for each folder/akf-year 62 | JOIN_SEGMENTED_TEXTS_IN_ORIG_DIFF_PER_CATEGORY = True # the segmented texts get joined by algorithm which removes dashes and so on 63 | 64 | [Print and logging settings] 65 | PRINT_WARNING_LEVEL = True # print warnings except activation in class print settings 66 | PRINT_EXCEPTION_LEVEL = True # print exceptions except activation in class print settings 67 | 68 | PRINT_MAIN = True 69 | PRINT_FEATURE_EXTRACTOR = False 70 | PRINT_ADDITIONAL_INFO_HANDLER = True 71 | PRINT_SEGMENT_CLASSIFIER = True 72 | PRINT_SEGMENT_PARSER = True 73 | PRINT_SEGMENT_PARSER_AKF_FN_ONE = False # print parsing functions related to AKF (File one) 74 | PRINT_SEGMENT_PARSER_AKF_FN_TWO = False # print parsing functions related to AKF (File two) 75 | PRINT_SEGMENT_PARSER_AKF_FN_THREE = True # print parsing functions related to AKF (File three) 76 | PRINT_SEGMENT_PARSER_AKF_FN_TABLES_ONE = True # print parsing functions related to AKF (Table specific one) 77 | PRINT_OUTPUT_ANALYSIS = False 78 | PRINT_DICTIONARY_HANDLER = True # print output related to dictionary handler -------------------------------------------------------------------------------- /additionals/dictionaries/dictionary_balance.json: -------------------------------------------------------------------------------- 1 | { 2 | "Zusatz": 3 | { 4 | "Aktiva":"", 5 | "Passiva": "", 6 | "darunter": "", 7 | "Sonstige": "", 8 | "Sonstiges": "", 9 | "Sonst.": "", 10 | "Langfristige": "", 11 | "Langfr." 
: "", 12 | "Kurzfristige":"", 13 | "Durchlaufende": "", 14 | "dauernde": "" 15 | }, 16 | "Hauptpunkte":{ 17 | "Eigenkapital": "Eigenkapital", 18 | "Fremdkapital": "Fremdkapital", 19 | "Gewinn nach Vortrag": "Gewinn nach Vortrag", 20 | "Anlagevermögen": "Anlagevermögen", 21 | "Umlaufvermögen": "Umlaufvermögen", 22 | "Verlust ohne Vortrag": "Verlust ohne Vortrag", 23 | "Verlust nach Vortrag": "Verlust nach Vortrag", 24 | "Passiva Einlagen": "Passiva Einlagen", 25 | "Aufgenommene Gelder": "Aufgenommene Gelder", 26 | "Barreserve":"Barreserve", 27 | "Nostroguthaben":"Nostroguthaben", 28 | "Betriebserträge":"Betriebserträge", 29 | "Uraltguthaben": "Uraltguthaben", 30 | "Wertpapiere": "Wertpapiere", 31 | "Konsortialbeteiligungen": "Konsortialbeteiligungen", 32 | "Debitoren": "Debitoren", 33 | "Deckungsforderungen": "Deckungsforderungen", 34 | "Sonstige Aktiva": "Sonstige Aktiva", 35 | "Beteiligungen": "Beteiligungen", 36 | "Ausgleichsforderungen": "Ausgleichsforderungen", 37 | "Ausleihungen": "Ausleihungen", 38 | "Schuldverschreibungen": "Schuldverschreibungen", 39 | "Zinsen hierauf": "Zinsen hierauf", 40 | "Ausstehende Einlagen auf A.-K.": "Ausstehende Einlagen auf A.-K.", 41 | "Rückstellungen": "Rückstellungen", 42 | "Schuldner": "Schuldner", 43 | "Anzahlungen": "Anzahlungen", 44 | "Anlagen": "Anlagen", 45 | "Vorräte": "Vorräte", 46 | "Gläubiger": "Gläubiger", 47 | "Verbindlichkeiten": "Verbindlichkeiten", 48 | "Forderungen": "Forderungen", 49 | "Gewinn ohne Vortrag": "Gewinn ohne Vortrag", 50 | "Kapitalentwertungskonto": "Kapitalentwertungskonto", 51 | "Vermögen": "Vermögen", 52 | "Konzernunternehmen": "Konzernunternehmen", 53 | "Einlagen": "Einlagen", 54 | "Eigene Akzepte und Solawechsel": "Eigene Akzepte und Solawechsel", 55 | "Kapitalausgleichskonto": "Kapitalausgleichskonto", 56 | "Löhne und Gehälter": "Löhne und Gehälter", 57 | "Abschreibungen": "Abschreibungen", 58 | "Steuern": "Steuern", 59 | "Jahresertrag": "Jahresertrag", 60 | "Beteiligungserträge": "Beteiligungserträge", 61 | "Abwicklungsvermögen": "Abwicklungsvermögen", 62 | "Zinsen": "Zinsen", 63 | "Schuldverschreibungen im Umlauf": "Schuldverschreibungen im Umlauf", 64 | "Zinsen v. Ausleihungen": "Zinsen v. Ausleihungen", 65 | "Kapitalverlustkonto gemn. DMBG": "Kapitalverlustkonto gemn. DMBG", 66 | "ao. Kapitalentwertungskonto": "Kapitalentwertungsskonto", 67 | "Pensions-Rückstellungen": "Pensions-Rückstellungen", 68 | "Anleihen im Umlauf": "Anleihen im Umlauf", 69 | "Grundstücke und Gebäude": "Grundstücke und Gebäude", 70 | "Hypotheken u. Kommunaldarlehen": "Hypotheken u. Kommunaldarlehen", 71 | "Aufgenommene Darlehen": "Aufgenommene Darlehen", 72 | "Anlagewerte": "Anlagewerte", 73 | "Rechnungsabgrenzung": "Rechnungsabgrenzung", 74 | "Liquidationskapital": "Liquidationskapital", 75 | "Ao.Kap.-Entw.Konto": "Ao.Kap.-Entw.Konto", 76 | "Hypotheken und Darlehen": "Hypotheken und Darlehen", 77 | "Kommunaldarlehen": "Kommunaldarlehen", 78 | "Aufgenommene langfr.Darlehen": "Aufgenommen langfr. Darlehen", 79 | "Grundkapital u. ges.Rücklage": "Grundkapital u. ges. 
Rücklage", 80 | "Bilanzsumme": "Bilanzsumme", 81 | "Guthaben bei Kreditinstituten": "Guthaben bei Kreditinstituten", 82 | "Kredite": "Kredite", 83 | "langfristige Darlehen u.Anleihen": "langfristige Darlehen u.Anleihen", 84 | "lagen auf das Grundkapital": "Ausstehende Einlagen auf das Grundkapital", 85 | "Abschreibungen auf Anlagen":"Abschreibungen auf Anlagen", 86 | "Technische Rückstellungen":"Technische Rückstellungen", 87 | "Allgemeine Rückstellungen":"Allgemeine Rückstellungen", 88 | "Kapital": "Kapital", 89 | "Rücklagen": "Rücklagen", 90 | "Reingewinn": "Reingewinn", 91 | "Eigenmittel": "Eigenmittel", 92 | "Bilanzgewinn": "Bilanzgewinn", 93 | "Hauptpunkte":"Hauptpunkte", 94 | "Restliche Passiva":"Restliche Passiva", 95 | "Spareinlagen":"Spareinlagen", 96 | "Bilanzverlust":"Bilanzverlust", 97 | "Forderungen an Kreditinstitute":"Forderungen an Kreditinstitute", 98 | "Wertberichtigungen":"Wertberichtigungen", 99 | "Ausstehende Einlagen auf das Grundkapital":"Ausstehende Einlagen auf das Grundkapital", 100 | "Rückstellung für LAG-Vermögensabgabe": "Rückstellungen", 101 | "Rückstellung für Pensionsverpfl.": "Rückstellungen", 102 | "Gewinn einschl. Vortrag": "Gewinn nach Vortrag", 103 | "Entwertungskonto": "Entwertungskonto", 104 | "Kapitalverlustkonto": "Kapitalverlustkonto", 105 | "Bankguthaben": "Bankguthaben", 106 | "Vermögensunterdeckung":"Vermögensunterdeckung", 107 | "Versicherungstechnische Rückstellungen":"Versicherungstechnische Rückstellungen", 108 | "Wertpapieranlagen": "Wertpapieranlagen", 109 | "Nichtversicherungstechnische Rückstellungen":"Nichtversicherungstechnische Rückstellungen", 110 | "Grunkapital":"Grundkapital", 111 | "Einbehaltene Gewinne":"Einbehaltene Gewinne", 112 | "ohne Vortrag": "ohne Vortrag" 113 | }, 114 | "Unterpunkte":{ 115 | "davon A.-K.": "davon A.-K.", 116 | "davon AK": "davon A.-K.", 117 | "Vorräte": "Vorräte", 118 | "Lieferforderungen": "Lieferforderungen", 119 | "Barmittel": "Barmittel", 120 | "Barmittel einschl. 
Wertpapiere": "Barmittel", 121 | "Flüssige Mittel": "Flüssige Mittel", 122 | "Beteiligungen": "Beteiligungen", 123 | "Grundkapital": "Grundkapital", 124 | "Aktien und Beteiligungen": "Aktien und Beteiligungen", 125 | "Betriebsstoffe und Waren": "Betriebsstoffe und Waren", 126 | "Forderungen aus Mieten": "Forderungen aus Mieten" 127 | } 128 | } -------------------------------------------------------------------------------- /main_start.py: -------------------------------------------------------------------------------- 1 | # custom imports 2 | from akf_corelib.configuration_handler import ConfigurationHandler 3 | from akf_corelib.database_handler import DatabaseHandler 4 | from akf_corelib.conditional_print import ConditionalPrint 5 | from lib.dictionary_handler import DictionaryHandler 6 | from lib.feature_extractor import FeatureExtractor 7 | from lib.segment_classifier import SegmentClassifier 8 | from lib.segment_parser import SegmentParser 9 | from lib.output_analysis import OutputAnalysis 10 | from lib.additional_info_handler import AdditionalInfoHandler 11 | 12 | # load configuration and printer 13 | CODED_CONFIGURATION_PATH = './configuration/config_parse_hocr_js.conf' 14 | config_handler = ConfigurationHandler(first_init=True, fill_unkown_args=True, \ 15 | coded_configuration_paths=[CODED_CONFIGURATION_PATH]) 16 | config = config_handler.get_config() 17 | cpr = ConditionalPrint(config.PRINT_MAIN, config.PRINT_EXCEPTION_LEVEL, 18 | config.PRINT_WARNING_LEVEL, leading_tag="main_start") 19 | 20 | # Basic steps: 21 | feature_extractor = FeatureExtractor() 22 | add_info_handler = AdditionalInfoHandler() 23 | dictionary_handler = DictionaryHandler() 24 | segment_classifier = SegmentClassifier() 25 | output_analyzer = OutputAnalysis() 26 | segment_parser = SegmentParser(output_analyzer, dictionary_handler) 27 | 28 | 29 | dh = DatabaseHandler(dbdir="") 30 | dh.set_dirpos(tablename_pos=config.TABLENAME_POS,ocr_profile_pos=config.OCR_PROFILE_POS,\ 31 | ocr_pos=config.OCR_POS, dbname_pos=config.DBPATH_POS) 32 | 33 | dh.fetch_files(config.INPUT_FILEGLOB, config.INPUT_FILETYPES) 34 | # get files-list 35 | hocr_files = dh.get_files() 36 | 37 | accumulated_tags = {} 38 | 39 | # main iteration loop 40 | for key in hocr_files: 41 | #if "1956" not in key: 42 | # continue 43 | int_key = int(key) 44 | if int_key < 1973 or int_key > 1973: # start from 1971 45 | continue 46 | 47 | accumulated_diff_info = output_analyzer.AccumulatedInfo() 48 | accumulated_diff_info_categories = {} 49 | accumulated_diff_info_orig_to_segment = {} 50 | 51 | ocromore_data = None 52 | ctr_test = 1 53 | 54 | my_list = hocr_files[key] 55 | for file in my_list: 56 | #if "msa_best" not in file.ocr_profile: 57 | # continue 58 | 59 | # only check files which are relevant (comment out if not used) 60 | # Sitz ok: 72, 207,671, 731, 733 61 | # Sitz faulty: 270,454 62 | if ctr_test not in [151]: 63 | ctr_test += 1 64 | continue 65 | 66 | #split = file.name.split("_") 67 | #if int(split[1]) < 1968: 68 | # continue 69 | #if int(split[0])<300: 70 | # continue 71 | #if not "_1956" in file.name: 72 | # continue 73 | # fetch additional information for current file (if toggled in info) 74 | additional_info = add_info_handler.fetch_additional_information_simple(file) 75 | 76 | # fetch basic data for current file 77 | ocromore_data = dh.fetch_ocromore_data(file, additional_info=additional_info) 78 | output_analyzer.set_current_data(ocromore_data) # prepare output analyzer 79 | 80 | cpr.print("Checking file:", ocromore_data['file_info'].path) 81 
82 |         # extract features from basic data
83 |         ocromore_data = feature_extractor.extract_file_features(ocromore_data)
84 |         # line segmentation
85 |         ocromore_data = segment_classifier.classify_file_segments(ocromore_data)
86 |         # segment parsing
87 |         ocromore_data = segment_parser.parse_segments(ocromore_data)
88 |         # output file synthesis
89 |         segment_parser.write_result_to_output(True, ocromore_data)
90 |         # todo
91 |         # output analysis steps
92 |         output_analyzer.log_segmentation_simple(ocromore_data)  # log the recognized segmentation
93 |         output_analyzer.log_parsed_output(ocromore_data)  # log the parsed segments into tag-based files
94 |         diff_info_orig_to_segment = output_analyzer.log_original_to_segment_diff(ocromore_data, use_delimiters=False)  # log the difference of segmented data to original data
95 |         diff_info_categories = output_analyzer.log_segmentation_diff_orig_to_parsed_output(ocromore_data)  # log the segmentation
96 |         diff_info = output_analyzer.log_unsegmentated(ocromore_data)
97 |         accumulated_diff_info_categories = \
98 |             output_analyzer.accumulate_diff_info_output_to_orig(diff_info_categories, accumulated_diff_info_categories)
99 |         accumulated_diff_info_orig_to_segment = \
100 |             output_analyzer.accumulate_diff_info_orig_to_segmentation(diff_info_orig_to_segment, accumulated_diff_info_orig_to_segment)
101 |
102 |         accumulated_diff_info = output_analyzer.accumulate_diff_info(ocromore_data, diff_info, accumulated_diff_info)
103 |         accumulated_tags = output_analyzer.log_tags(ocromore_data, accumulated_tags)
104 |         ctr_test += 1
105 |
106 |         if ctr_test >= 30:
107 |             break
108 |
109 |         # clear the current result in segment_parser cache to parse the next one
110 |         segment_parser.clear_result(output_analyzer, dictionary_handler)
111 |
112 |     # output analysis:
113 |     # print diff info for this year (accumulated over all tables/year)
114 |     output_analyzer.log_accumulated_unsegmentated(accumulated_diff_info, ocromore_data)
115 |     # print the amount of chars which is left for each category after parsing for this year
116 |     output_analyzer.log_accumulated_orig_to_parsed_output(accumulated_diff_info_categories, ocromore_data)
117 |     # print diff info for this year between original and segmentation
118 |     output_analyzer.log_accumulated_orig_to_segment(accumulated_diff_info_orig_to_segment, ocromore_data)
119 |
120 |
121 | output_analyzer.log_accumulated_tags(accumulated_tags)
122 |
--------------------------------------------------------------------------------
/parser.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Rafa Haro '
2 |
3 | from abc import ABCMeta, abstractmethod
4 | from bs4 import BeautifulSoup
5 | import re
6 |
7 |
8 | class HOCRElement(metaclass=ABCMeta):
9 |
10 |     # abstract base class; subclasses must override ocr_text
11 |
12 |     COORDINATES_PATTERN = re.compile(r"bbox\s(-?[0-9]+)\s(-?[0-9]+)\s(-?[0-9]+)\s(-?[0-9]+)")
13 |
14 |     def __init__(self, hocr_html, parent, next_tag, next_attribute, next_class):
15 |         self.__coordinates = (0, 0, 0, 0)
16 |         self._hocr_html = hocr_html
17 |         self._id = None
18 |         self._parent = parent
19 |         self._elements = self._parse(next_tag, next_attribute, next_class)
20 |
21 |     def _parse(self, next_tag, next_attribute, next_class):
22 |
23 |         try:
24 |             self._id = self._hocr_html['id']
25 |         except KeyError:
26 |             self._id = None
27 |
28 |         try:
29 |             title = self._hocr_html['title']
30 |             match = HOCRElement.COORDINATES_PATTERN.search(title)
31 |             if match:
32 |                 self.__coordinates = (int(match.group(1)),
33 |                                       int(match.group(2)),
34 |                                       int(match.group(3)),
35 |                                       int(match.group(4)))
36 |             else:
37 |                 raise ValueError("The HOCR element doesn't contain a valid title property")
38 |         except KeyError:
39 |             self.__coordinates = (0, 0, 0, 0)
40 |
41 |         elements = []
42 |         if next_tag is not None and next_class is not None:
43 |             for html_element in self._hocr_html.find_all(next_tag, {'class': next_attribute}):
44 |                 elements.append(next_class(self, html_element))
45 |         return elements
46 |
47 |     @property
48 |     def coordinates(self):
49 |         return self.__coordinates
50 |
51 |     @property
52 |     def html(self):
53 |         return self._hocr_html.prettify()
54 |
55 |     @property
56 |     def id(self):
57 |         return self._id
58 |
59 |     @property
60 |     def parent(self):
61 |         return self._parent
62 |
63 |     def __hash__(self):
64 |         return hash(self._id)
65 |
66 |     def __eq__(self, other):
67 |         if not isinstance(other, HOCRElement):
68 |             return False
69 |         else:
70 |             return self._id == other._id
71 |
72 |     @property
73 |     @abstractmethod
74 |     def ocr_text(self):
75 |         pass
76 |
77 | class HOCRDocument(HOCRElement):
78 |
79 |     def __init__(self, source, is_path=False):
80 |
81 |         if not is_path:
82 |             hocr_html = BeautifulSoup(source, 'html.parser')
83 |         else:
84 |             hocr_html = BeautifulSoup(open(source, 'r', encoding="utf-8").read(), 'html.parser')
85 |
86 |         super(HOCRDocument, self).__init__(hocr_html, None, 'div', Page.HOCR_PAGE_TAG, Page)
87 |
88 |     @property
89 |     def ocr_text(self):
90 |         output = ""
91 |         for element in self._elements[:-1]:
92 |             output += element.ocr_text
93 |             output += "\n\n"
94 |         output += self._elements[-1].ocr_text
95 |         return output
96 |
97 |     @property
98 |     def pages(self):
99 |         return self._elements
100 |
101 |     @property
102 |     def npages(self):
103 |         return len(self._elements)
104 |
105 |     @property
106 |     def ocr(self):
107 |         for tag in self._hocr_html.find_all("meta"):
108 |             if "esseract" in tag.get("content", ""):  # "" default avoids 'in None' errors for meta tags without content
109 |                 return "Tess"
110 |             if "cropy" in tag.get("content", ""):
111 |                 return "Ocro"
112 |             if "ABBYY" in tag.get("content", ""):
113 |                 return "Abbyy"
114 |         return "Abbyy"
115 |
116 | class Page(HOCRElement):
117 |
118 |     HOCR_PAGE_TAG = "ocr_page"
119 |
120 |     def __init__(self, parent, hocr_html):
121 |         super(Page, self).__init__(hocr_html, parent, 'div', Area.HOCR_AREA_TAG, Area)
122 |
123 |     @property
124 |     def ocr_text(self):
125 |         output = ""
126 |         for element in self._elements[:-1]:
127 |             output += element.ocr_text
128 |             output += "\n\n"
129 |         output += self._elements[-1].ocr_text
130 |         return output
131 |
132 |     @property
133 |     def areas(self):
134 |         return self._elements
135 |
136 |     @property
137 |     def nareas(self):
138 |         return len(self._elements)
139 |
140 | class Area(HOCRElement):
141 |
142 |     HOCR_AREA_TAG = "ocr_carea"
143 |
144 |     def __init__(self, parent, hocr_html):
145 |         super(Area, self).__init__(hocr_html, parent, 'p', Paragraph.HOCR_PAR_TAG, Paragraph)
146 |
147 |     @property
148 |     def paragraphs(self):
149 |         return self._elements
150 |
151 |     @property
152 |     def nparagraphs(self):
153 |         return len(self._elements)
154 |
155 |     @property
156 |     def ocr_text(self):
157 |         output = ""
158 |         for element in self._elements[:-1]:
159 |             output += element.ocr_text
160 |             output += "\n"
161 |         output += self._elements[-1].ocr_text
162 |         return output
163 |
164 | class Paragraph(HOCRElement):
165 |
166 |     HOCR_PAR_TAG = "ocr_par"
167 |
168 |     def __init__(self, parent, hocr_html):
169 |         super(Paragraph, self).__init__(hocr_html, parent, 'span', Line.HOCR_LINE_TAG, Line)
170 |
171 |     @property
172 |     def lines(self):
173 |         return self._elements
174 |
175 |     @property
176 |     def nlines(self):
177 |         return len(self._elements)
178 |
179 |     @property
180 |     def ocr_text(self):
181 |         output = ""
182 |         for element in self._elements[:-1]:
183 |             output += element.ocr_text
184 |             output += "\n"
185 |         output += self._elements[-1].ocr_text
186 |         return output
187 |
188 | class Line(HOCRElement):
189 |
190 |     HOCR_LINE_TAG = "ocr_line"
191 |
192 |     def __init__(self, parent, hocr_html):
193 |         super(Line, self).__init__(hocr_html, parent, 'span', Word.HOCR_WORD_TAG, Word)
194 |         self._ocr_text_normalized = None  # custom property, None if not assigned
195 |
196 |
197 |     @property
198 |     def words(self):
199 |         return self._elements
200 |
201 |     @property
202 |     def nwords(self):
203 |         return len(self._elements)
204 |
205 |     @property
206 |     def ocr_text(self):
207 |         output = ""
208 |         for element in self._elements[:-1]:
209 |             output += element.ocr_text
210 |             output += " "
211 |         output += self._elements[-1].ocr_text
212 |         return output
213 |
214 |     @property
215 |     def ocr_text_normalized(self):
216 |         return self._ocr_text_normalized
217 |
218 |     @ocr_text_normalized.setter
219 |     def ocr_text_normalized(self, new_text):
220 |         self._ocr_text_normalized = new_text
221 |
222 | class Word(HOCRElement):
223 |
224 |     HOCR_WORD_TAG = "ocrx_word"
225 |     _xwconf = None
226 |     _xconfs = None
227 |
228 |     def __init__(self, parent, hocr_html):
229 |         super(Word, self).__init__(hocr_html, parent, None, None, None)
230 |         title = hocr_html.attrs['title']
231 |         titlesplit = title.split(';')
232 |         for element in titlesplit:
233 |             if 'x_wconf' in element:
234 |                 self._xwconf = element.strip().split(' ')[1]
235 |             if "x_confs" in element:
236 |                 self._xconfs = element.strip().split(' ')[1:]
237 |                 break
238 |
239 |
240 |     @property
241 |     def ocr_text(self):
242 |         word = self._hocr_html.string
243 |         if word is not None:
244 |             return word
245 |         else:
246 |             return ""
--------------------------------------------------------------------------------
/lib/akf_parsing_functions_jk.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from .akf_parsing_functions_common import AKFCommonParsingFunctions as cf
4 | from lib.table_parser import Datatable, Sharetable, Dividendtable
5 | import time
6 |
7 | def timeit(method):
8 |     def timed(*args, **kw):
9 |         ts = time.time()
10 |         result = method(*args, **kw)
11 |         te = time.time()
12 |
13 |         if 'log_time' in kw:
14 |             name = kw.get('log_name', method.__name__.upper())
15 |             kw['log_time'][name] = int((te - ts) * 1000)
16 |         else:
17 |             print('%r %2.2f ms' % \
18 |                   (method.__name__, (te - ts) * 1000))
19 |         return result
20 |
21 |     return timed
22 |
23 | class AkfParsingFunctionsJK(object):
24 |
25 |     def __init__(self, endobject_factory, output_analyzer, dictionary_handler, ocromore_data=None):
26 |         config_handler = ConfigurationHandler(first_init=False)
27 |
28 |         self.config = config_handler.get_config()
29 |         self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_THREE, self.config.PRINT_EXCEPTION_LEVEL,
30 |                                     self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
31 |
32 |         self.cpr.print("init akf parsing functions jk")
33 |
34 |         self.ef = endobject_factory
35 |         self.output_analyzer = output_analyzer
36 |         self.ocromore_data = ocromore_data
37 |         self.dictionary_handler = dictionary_handler
38 |
39 |     def parse_bilanzen(self, real_start_tag, content_texts, content_lines, feature_lines,
segmentation_class): 40 | # get basic data 41 | element_counter = 0 42 | origpost, origpost_red, element_counter, content_texts = \ 43 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter) 44 | 45 | # logme 46 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag) 47 | 48 | # init 49 | only_add_if_string = True 50 | if self.config.LOG_SIMPLE: 51 | geschaeftslage = origpost_red.replace("- ", "") 52 | 53 | #parsing 54 | self.ef.add_to_my_obj("balances", geschaeftslage, object_number=element_counter,only_filled=only_add_if_string) 55 | return True 56 | #parsing 57 | table = Datatable(snippet=segmentation_class.snippet) 58 | table.analyse_structure(content_lines,feature_lines, template="datatable_balance") 59 | table.extract_content(content_lines, feature_lines, template="datatable_balance") 60 | 61 | # Write information for income table parsing 62 | segmentation_class.info_handler["income"] = {} 63 | segmentation_class.info_handler["income"]["amount"] = table.info.amount 64 | segmentation_class.info_handler["income"]["col"] = table.info.col 65 | segmentation_class.info_handler["income"]["separator"] = table.info.separator 66 | 67 | # Parsing the tables based on whitespace and number of numbers of each group 68 | # This should be the last option to parse (error-prone) 69 | self.ef.add_to_my_obj("balances", table.content, object_number=element_counter,only_filled=only_add_if_string) 70 | 71 | def parse_gewinn_und_verlust(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class): 72 | # get basic data 73 | element_counter = 0 74 | origpost, origpost_red, element_counter, content_texts = \ 75 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter) 76 | 77 | # logme 78 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag) 79 | 80 | # init 81 | only_add_if_string = True 82 | if self.config.LOG_SIMPLE: 83 | geschaeftslage = origpost_red.replace("- ", "") 84 | 85 | #parsing 86 | self.ef.add_to_my_obj("income", geschaeftslage, object_number=element_counter,only_filled=only_add_if_string) 87 | return True 88 | 89 | # parsing 90 | table = Datatable(snippet=segmentation_class.snippet) 91 | table.analyse_structure(content_lines, feature_lines, template="datatable_income") 92 | if segmentation_class.info_handler and "income" in set(segmentation_class.info_handler.keys()): 93 | table.info.col = segmentation_class.info_handler["income"]["col"] 94 | table.info.amount = segmentation_class.info_handler["income"]["amount"] 95 | table.info.separator = segmentation_class.info_handler["income"]["separator"] 96 | 97 | table.extract_content(content_lines, feature_lines, template="datatable_income") 98 | 99 | 100 | #parsing 101 | self.ef.add_to_my_obj("income", table.content, object_number=element_counter, 102 | only_filled=only_add_if_string) 103 | 104 | def parse_aktienkurse(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class): 105 | # get basic data 106 | element_counter = 0 107 | origpost, origpost_red, element_counter, content_texts = \ 108 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter) 109 | 110 | # logme 111 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag) 112 | 113 | # init 114 | only_add_if_string = True 115 | #self.config.LOG_SIMPLE = False 116 | 
if self.config.LOG_SIMPLE: 117 | # self.config.LOG_SIMPLE = False 118 | skip = origpost_red.replace("- ", "") 119 | 120 | # parsing 121 | self.ef.add_to_my_obj("shares", skip, object_number=element_counter, 122 | only_filled=only_add_if_string) 123 | return True 124 | 125 | # parsing 126 | table = Sharetable(snippet=segmentation_class.snippet) 127 | table.analyse_structure(content_lines, feature_lines) 128 | table.extract_content(content_lines, feature_lines) 129 | #from timeit import timeit 130 | #print(timeit(test)) 131 | # parsing 132 | self.ef.add_to_my_obj("shares", table.content, object_number=element_counter, 133 | only_filled=only_add_if_string) 134 | 135 | def parse_dividend(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class): 136 | # get basic data 137 | element_counter = 0 138 | origpost, origpost_red, element_counter, content_texts = \ 139 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter) 140 | 141 | # logme 142 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag) 143 | 144 | # init 145 | only_add_if_string = True 146 | # self.config.LOG_SIMPLE = True 147 | if self.config.LOG_SIMPLE: 148 | # self.config.LOG_SIMPLE = False 149 | skip = origpost_red.replace("- ", "") 150 | 151 | # parsing 152 | self.ef.add_to_my_obj("dividende", skip, object_number=element_counter, 153 | only_filled=only_add_if_string) 154 | return True 155 | 156 | # parsing 157 | table = Dividendtable(snippet=segmentation_class.snippet) 158 | table.analyse_structure(content_lines, feature_lines) 159 | table.extract_content(content_lines, feature_lines) 160 | # from timeit import timeit 161 | # print(timeit(test)) 162 | # parsing 163 | self.ef.add_to_my_obj("dividende", table.content, object_number=element_counter, 164 | only_filled=only_add_if_string) 165 | -------------------------------------------------------------------------------- /lib/feature_extractor.py: -------------------------------------------------------------------------------- 1 | from akf_corelib.conditional_print import ConditionalPrint 2 | from akf_corelib.configuration_handler import ConfigurationHandler 3 | from akf_corelib.random import Random 4 | 5 | import numpy as np 6 | 7 | 8 | class LineFeatures(): 9 | counter_special_chars = -1 10 | counter_alphanumerical_chars = -1 11 | counter_numbers = -1 12 | counter_chars = -1 13 | counter_alphabetical = -1 14 | counter_words = -1 15 | counter_spaces = -1 16 | counters_alphabetical_ratios = [] 17 | counters_wordlengths = [] 18 | counters_numbers = [] 19 | special_chars_ratio = -1 20 | alphanumerical_chars_ratio = -1 21 | alphabetical_ratio = -1 22 | spaces_ratio = -1 23 | numbers_ratio = -1 24 | 25 | x_box_sizes = [] 26 | x_gaps = [] 27 | 28 | maximum_x_gap = None 29 | mean_x_gap = None 30 | median_x_gap = None 31 | 32 | many_numbers_in_first_word = False 33 | many_alphabetical_in_middle_words = False 34 | many_alphabetical_in_last_word = False 35 | 36 | def __init__(self, cpr): 37 | self.cpr = cpr 38 | 39 | def print_me(self): 40 | self.cpr.print("alle cntr:", self.counter_chars) 41 | self.cpr.print("spec cntr:", self.counter_special_chars, "ratio", self.special_chars_ratio) 42 | self.cpr.print("alnr cntr:", self.counter_alphanumerical_chars, "ratio", self.alphanumerical_chars_ratio) 43 | self.cpr.print("albt cntr:", self.counter_alphabetical, "ratio", self.alphabetical_ratio) 44 | self.cpr.print("spce cntr:", self.counter_spaces, "ratio", self.spaces_ratio) 
45 |         self.cpr.print("nmbr cntr:", self.counter_numbers, "ratio", self.numbers_ratio)
46 |         self.cpr.print("x_box_sizes", self.x_box_sizes)
47 |         self.cpr.print("x_gaps", self.x_gaps)
48 |         self.cpr.print("x_gap_max_size", self.maximum_x_gap)
49 |         self.cpr.print("x_gaps_mean", self.mean_x_gap)
50 |         self.cpr.print("x_gaps_median", self.median_x_gap)
51 |
52 | class FeatureExtractor():
53 |
54 |     def __init__(self):
55 |         config_handler = ConfigurationHandler(first_init=False)
56 |
57 |         self.config = config_handler.get_config()
58 |         self.cpr = ConditionalPrint(self.config.PRINT_FEATURE_EXTRACTOR, self.config.PRINT_EXCEPTION_LEVEL,
59 |                                     self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
60 |
61 |         self.filter_start_words = ["Fernruf:", "Vorstand:", "Fernschreiber:",
62 |                                    "von", "Gründung:", "Ordnungsnr.", "Ordnungsnr",
63 |                                    "Grundkapital:", "Umstellung"]
64 |
65 |
66 |     def extract_file_features(self, ocromore_data):
67 |         all_line_features = []
68 |         for line in ocromore_data['lines']:
69 |             current_line_features = self.extract_line_features(line)
70 |             all_line_features.append(current_line_features)
71 |
72 |         ocromore_data['line_features'] = all_line_features
73 |
74 |         return ocromore_data
75 |
76 |
77 |     def extract_line_features(self, line):
78 |
79 |         final_line_features = {}
80 |
81 |         whole_text = line['text']
82 |
83 |         self.cpr.print("recognizing text:", whole_text)
84 |
85 |         # counters
86 |         counter_special_chars = 0
87 |         counter_alphanumerical_chars = 0
88 |         counter_numbers = 0
89 |         counter_chars = len(whole_text)
90 |         counter_alphabetical = 0
91 |         counter_words = 0
92 |         counters_alphabetical_ratios = []
93 |         counters_wordlengths = []
94 |         counters_numbers = []
95 |
96 |         character_index = 0
97 |         # special conditions
98 |         ultimo_is_first_word = False
99 |         first_word_no_table_indicator = False
100 |         starts_with_parenthesis = False
101 |         ends_with_parenthesis = False
102 |
103 |         last_xstop = 0
104 |         x_box_sizes = []
105 |         x_gaps = []
106 |         for word_obj in line['words']:
107 |             word_index = word_obj['word_index']
108 |             word_text = word_obj['text']
109 |             hocr_coordinates = word_obj['hocr_coordinates']
110 |
111 |             word_xstart = hocr_coordinates[0]
112 |             word_xstop = hocr_coordinates[2]
113 |             word_box_size = word_xstop - word_xstart
114 |             x_box_sizes.append(word_box_size)
115 |
116 |             if word_index >= 1:
117 |                 x_gap = word_xstop - last_xstop  # measured from the previous word's right edge to this word's right edge
118 |                 x_gaps.append(x_gap)
119 |
120 |             #line.data['word_x0']
121 |             if word_text is None or word_text == "":
122 |                 continue
123 |
124 |             if word_index == 0:
125 |                 if word_text in self.filter_start_words:
126 |                     first_word_no_table_indicator = True
127 |                 if word_text.lower() == "ultimo":
128 |                     ultimo_is_first_word = True
129 |                 if word_text[0] == "(":
130 |                     starts_with_parenthesis = True
131 |
132 |
133 |             if word_index == len(line['words']) - 1:  # last word in the line (was compared against the character count before)
134 |                 if word_text[-1] == ")":
135 |                     ends_with_parenthesis = True
136 |
137 |
138 |
139 |             counter_alphabetical_chars_word = 0
140 |             counter_alphanumerical_chars_word = 0
141 |             counter_numbers_word = 0
142 |
143 |
144 |             counter_words += 1
145 |
146 |             word_list = list(word_text)
147 |             for char in word_list:
148 |                 if Random.is_special_character(char):
149 |                     counter_special_chars += 1
150 |                 elif Random.is_alphanumerical_character(char):
151 |                     counter_alphanumerical_chars += 1
152 |                     counter_alphanumerical_chars_word += 1
153 |                     if char.isdigit():
154 |                         counter_numbers += 1
155 |                         counter_numbers_word += 1
156 |
157 |             counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
158 |
ratio_alphabetical_word = np.round(counter_alphabetical_word/len(word_text), 2) 159 | counters_alphabetical_ratios.append(ratio_alphabetical_word) 160 | counters_wordlengths.append(len(word_text)) 161 | counters_numbers.append(counter_numbers_word) 162 | character_index += len(word_text) 163 | last_xstop = word_xstop 164 | 165 | 166 | # get number of spaces 167 | len_whole_unspace = len(whole_text.replace(" ", "")) 168 | counter_spaces = counter_chars - len_whole_unspace 169 | # set alphabetical counter 170 | counter_alphabetical = counter_alphanumerical_chars - counter_numbers 171 | 172 | 173 | if counter_chars == 0: 174 | self.cpr.printw("no chars in line:", str(line['line_index']),"no features here") 175 | return False 176 | 177 | special_chars_ratio = counter_special_chars/ counter_chars 178 | alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars 179 | alphabetical_ratio = counter_alphabetical / counter_chars 180 | spaces_ratio = counter_spaces/ counter_chars 181 | numbers_ratio = counter_numbers / counter_chars 182 | 183 | 184 | maximum_x_gap = None 185 | mean_x_gap = None 186 | median_x_gap = None 187 | 188 | if len(x_gaps) >= 1: 189 | maximum_x_gap = max(x_gaps) 190 | mean_x_gap = np.mean(x_gaps) 191 | median_x_gap = np.median(x_gaps) 192 | 193 | many_numbers_in_first_word = False 194 | many_alphabetical_in_middle_words = False 195 | many_alphabetical_in_last_word = False 196 | 197 | # check some middle and last word conditions 198 | for counter_index, counter in enumerate(counters_wordlengths): 199 | if counter_index == 0: 200 | ctr_numbers = counters_numbers[counter_index] 201 | numbers_ratio_word = np.round(ctr_numbers/counter,2) 202 | if numbers_ratio_word > 0.8: 203 | many_numbers_in_first_word = True 204 | elif counter_index == len(counters_wordlengths)-1: 205 | if counter >= 4: 206 | alphabetical_ratio_word = counters_alphabetical_ratios[counter_index] 207 | if alphabetical_ratio_word >= 0.75: 208 | many_alphabetical_in_last_word = True 209 | 210 | else: 211 | if counter >= 4: 212 | alphabetical_ratio_word = counters_alphabetical_ratios[counter_index] 213 | if alphabetical_ratio_word >= 0.75: 214 | many_alphabetical_in_middle_words = True 215 | 216 | 217 | 218 | 219 | 220 | final_line_features = LineFeatures(cpr=self.cpr) 221 | final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word 222 | 223 | final_line_features.counter_special_chars = counter_special_chars 224 | final_line_features.counter_chars = counter_chars 225 | final_line_features.counter_spaces = counter_spaces 226 | final_line_features.counter_numbers = counter_numbers 227 | final_line_features.counter_alphabetical = counter_alphabetical 228 | final_line_features.counter_alphanumerical_chars = counter_alphanumerical_chars 229 | final_line_features.counter_words = counter_words 230 | 231 | final_line_features.counters_numbers = counters_numbers 232 | final_line_features.counters_wordlengths = counters_wordlengths 233 | final_line_features.counters_alphabetical_ratios = counters_alphabetical_ratios 234 | 235 | final_line_features.numbers_ratio = numbers_ratio 236 | final_line_features.alphabetical_ratio = alphabetical_ratio 237 | final_line_features.alphanumerical_chars_ratio = alphanumerical_chars_ratio 238 | final_line_features.special_chars_ratio = special_chars_ratio 239 | final_line_features.spaces_ratio = spaces_ratio 240 | 241 | final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word 242 | 
final_line_features.many_alphabetical_in_middle_words = many_alphabetical_in_middle_words 243 | final_line_features.many_numbers_in_first_word = many_numbers_in_first_word 244 | final_line_features.x_box_sizes = x_box_sizes 245 | final_line_features.x_gaps = x_gaps 246 | 247 | final_line_features.maximum_x_gap = maximum_x_gap 248 | final_line_features.mean_x_gap = mean_x_gap 249 | final_line_features.median_x_gap = median_x_gap 250 | 251 | 252 | 253 | return final_line_features -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 Universitätsbibliothek Mannheim 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
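The file that follows, lib/segment_parser.py, dispatches every classified segment tag to a parsing function through a plain dict of bound methods (see FunctionMapAKF below). A minimal sketch of that dispatch pattern, with hypothetical tags and handler bodies:

```
# sketch of the tag -> bound-method dispatch used by FunctionMapAKF;
# the tags and handler bodies here are hypothetical stand-ins
class DemoParsingFunctions:
    def parse_sitz(self, content_texts):
        return {"type": "Sitz", "raw": content_texts}

    def parse_vorstand(self, content_texts):
        return {"type": "Vorstand", "raw": content_texts}


demo = DemoParsingFunctions()
function_map = {
    "Sitz": demo.parse_sitz,
    "Vorstand": demo.parse_vorstand,
}

segment_tag = "Sitz"
if segment_tag in function_map:  # unknown tags are simply skipped
    print(function_map[segment_tag](["Sitz: Mannheim"]))
```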
202 | -------------------------------------------------------------------------------- /lib/segment_parser.py: -------------------------------------------------------------------------------- 1 | from akf_corelib.conditional_print import ConditionalPrint 2 | from akf_corelib.configuration_handler import ConfigurationHandler 3 | from .akf_parsing_functions_one import AkfParsingFunctionsOne 4 | from .akf_parsing_functions_two import AkfParsingFunctionsTwo 5 | from .akf_parsing_functions_three import AkfParsingFunctionsThree 6 | from .akf_parsing_functions_jk import AkfParsingFunctionsJK 7 | 8 | from .akf_parsing_functions_tables_one import AkfParsingFunctionsTablesOne 9 | 10 | from .data_helper import DataHelper 11 | from .segment_parser_endobject_factory import EndobjectFactory 12 | from lib.data_helper import DataHelper as dh 13 | from lib.snippet_ocr import Snippet 14 | import glob 15 | import os 16 | 17 | 18 | class FunctionMapAKF(object): 19 | """ 20 | This is a holder class which maps segment 21 | tags to parsing functions (here for AKF-Projekt) 22 | can be swapped for other projects 23 | """ 24 | 25 | def __init__(self, endobject_factory, output_analyzer, dictionary_handler): 26 | self.ef = endobject_factory 27 | self.akf_one = AkfParsingFunctionsOne(endobject_factory, output_analyzer, dictionary_handler) 28 | self.akf_two = AkfParsingFunctionsTwo(endobject_factory, output_analyzer, dictionary_handler) 29 | self.akf_three = AkfParsingFunctionsThree(endobject_factory, output_analyzer, dictionary_handler) 30 | self.akf_jk = AkfParsingFunctionsJK(endobject_factory, output_analyzer, dictionary_handler) 31 | 32 | self.akf_tables_one = AkfParsingFunctionsTablesOne(endobject_factory, output_analyzer, dictionary_handler) 33 | 34 | # for the keys use the keys from 'akf_segment_holder' or similar 35 | 36 | self.function_map = { 37 | "Firmenname": self.akf_one.parse_firmenname, 38 | "Sitz": self.akf_one.parse_sitz, 39 | "Verwaltung": self.akf_one.parse_verwaltung, 40 | "Telefon/Fernruf": self.akf_one.parse_telefon_fernruf, 41 | "Vorstand": self.akf_one.parse_vorstand, 42 | "Aufsichtsrat": self.akf_one.parse_aufsichtsrat, 43 | "Gründung": self.akf_one.parse_gruendung, 44 | "Arbeitnehmervertreter": self.akf_one.parse_arbeitnehmervertreter, 45 | "Tätigkeitsgebiet": self.akf_one.parse_taetigkeitsgebiet, 46 | "Zahlstellen": self.akf_two.parse_zahlstellen, 47 | "Grundkapital": self.akf_two.parse_grundkapital, 48 | "OrdnungsNrAktien": self.akf_two.parse_ordnungsnrdaktien, 49 | "Großaktionär": self.akf_two.parse_grossaktionaer, 50 | "Geschäftsjahr": self.akf_two.parse_geschaeftsjahr, 51 | "StimmrechtAktien": self.akf_two.parse_stimmrechtaktien, 52 | "Börsennotiz": self.akf_two.parse_boersennotiz, 53 | "Stückelung": self.akf_two.parse_stueckelung, 54 | "Aktienkurse": self.akf_jk.parse_aktienkurse, 55 | "Dividenden": self.akf_jk.parse_dividend, # is table 56 | "DividendenAufXYaktien": self.akf_jk.parse_dividend, # is table 57 | "BeratendeMitglieder": self.akf_three.parse_beratende_mitglieder, 58 | "Gesellschafter": self.akf_three.parse_gesellschafter,# not in first 500 files 1956?? 59 | "Sekretäre": self.akf_three.parse_sekretaere, # not in first 500 files 1956?? 60 | "Geschäftsleitung": self.akf_three.parse_geschaeftsleitung, # not in first 500 files 1956?? 61 | "Generaldirektion": self.akf_three.parse_generaldirektion, # not in first 500 files 1956?? 62 | "Direktionskomitee": self.akf_three.parse_direktionskomitee, # not in first 500 files 1956?? 
63 | "Vizegeneraldirektoren": self.akf_three.parse_vizegeneraldirektoren, # not in first 500 files 1956?? 64 | "Fernschreiber": self.akf_three.parse_fernschreiber, 65 | "Filialen": self.akf_three.parse_filialen, # not a category in 1956 -> #todo maybe use later 66 | "Auslandsvertretungen": self.akf_three.parse_auslandsvertretungen, # not a category in 1956 -> #todo maybe use later 67 | "KommanditeUndBank": self.akf_three.parse_kommandite_und_bank, # not a category in 1956 -> #todo maybe use later 68 | "Niederlassungen": self.akf_three.parse_niederlassungen, 69 | "Erzeugnisse": self.akf_three.parse_erzeugnisse, 70 | "Haupterzeugnisse": self.akf_three.parse_haupterzeugnisse, 71 | "Spezialitäten": self.akf_three.parse_spezialitaeten, 72 | "Anlagen": self.akf_three.parse_anlagen, 73 | "Zweigniederlassungen": self.akf_three.parse_zweigniederlassungen, 74 | "Werke/Betriebsstätten": self.akf_three.parse_werke_betriebsstaetten, 75 | "Betriebsanlagen": self.akf_three.parse_betriebsanlagen, 76 | "Beteiligungsgesellschaften": self.akf_three.parse_beteiligungsgesellschaften, # not a category in 1956 -> #todo maybe use later 77 | "Beteiligungen": self.akf_three.parse_beteiligungen, 78 | "Tochtergesellschaften": self.akf_three.parse_tochtergesellschaften, 79 | "Wertpapier-Kenn-Nr": self.akf_three.parse_wertpapier_kenn_nr, # not a category in 1956 -> #todo maybe use later 80 | "RechteVorzugsaktien": self.akf_three.parse_rechte_und_vorzugsaktien, 81 | "Aktionäre": self.akf_three.parse_aktionaere, 82 | "Anleihen": self.akf_three.parse_anleihen, 83 | "KursVonZuteilungsrechten": self.akf_three.parse_kurse_v_zuteilungsrechten, 84 | "Emissionsbetrag": self.akf_three.parse_emissionsbetrag, 85 | "AusDenKonsolidiertenBilanzen": self.akf_jk.parse_bilanzen, # table 86 | "AusDenBilanzen": self.akf_jk.parse_bilanzen, # table 87 | "Konsolid.Gewinn-u.Verlustrechnungen": self.akf_jk.parse_gewinn_und_verlust, # table 88 | "AusGewinnVerlustrechnungen": self.akf_jk.parse_gewinn_und_verlust, # @jk last element works now 89 | "Bezugsrechte": self.akf_three.parse_bezugsrechte, 90 | "ZurGeschäftslage": self.akf_three.parse_geschaeftslage 91 | } 92 | 93 | def get_function_map(self): 94 | return self.function_map 95 | 96 | 97 | 98 | 99 | class SegmentParser(object): 100 | """ 101 | Parse the classified segments segment by segment, 102 | each segment defined code the parser points to. 
103 | """ 104 | 105 | def __init__(self, output_analyzer, dictionary_handler, ocromore_data=None): 106 | 107 | self.ef = EndobjectFactory() 108 | self.dictionary_handler = dictionary_handler 109 | 110 | # map which maps tags to functions for parsing -> change constuctor for other project 111 | fmap = FunctionMapAKF(self.ef, output_analyzer, dictionary_handler) 112 | 113 | config_handler = ConfigurationHandler(first_init=False) 114 | 115 | self.config = config_handler.get_config() 116 | self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER, self.config.PRINT_EXCEPTION_LEVEL, 117 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__) 118 | 119 | self.function_map = fmap.get_function_map() 120 | self.result_root = self.config.OUTPUT_ROOT_PATH + "/results/" 121 | 122 | def clear_result(self, output_analyzer, dictionary_handler, ocromore_data=None): 123 | # create a new end object factory, new content 124 | self.ef = EndobjectFactory() 125 | # map to the new ef object which has been recreated 126 | fmap = FunctionMapAKF(self.ef, output_analyzer, dictionary_handler) 127 | self.function_map = fmap.get_function_map() 128 | 129 | 130 | def parse_segments(self, ocromore_data): 131 | self.ocromore_data = ocromore_data 132 | segmentation = ocromore_data['segmentation'] 133 | segmentation_classes = segmentation.my_classes 134 | 135 | # add all text from original file if activated (i.e. for debugging purposes) 136 | if self.config.ADD_FULLTEXT_ENTRY: 137 | all_texts = self.get_all_text(ocromore_data) 138 | self.ef.set_current_main_list("overall_info") 139 | self.ef.add_to_my_obj("fulltexts",all_texts) 140 | # add additional info to result 141 | if self.config.ADDITIONAL_INFORMATION and self.config.ADD_ADDITIONAL_INFO: 142 | if not self.config.ADD_FULLTEXT_ENTRY: 143 | self.ef.set_current_main_list("Information") 144 | self.ef.add_to_my_obj("additionals", ocromore_data["additional_info"]) 145 | # add a duplicate of the original text from which in the below analysis case the files get subtracted 146 | if self.config.LOG_SEGMENTED_TO_ORIG_DIFF_PER_FILE: 147 | if self.config.ADD_FULLTEXT_ENTRY: 148 | ocromore_data['analysis_to_orig'] = {} 149 | original_rest, complete_text = self.get_all_text(ocromore_data, join_separated_lines=True) 150 | ocromore_data['analysis_to_orig']['original_rest'] = original_rest 151 | ocromore_data['analysis_to_orig']['original_length_initial'] = len(complete_text) 152 | else: 153 | self.cpr.printw("activated segment to orig diff, but no saving of origin activate ADD_FULLTEXT_ENTRY " 154 | "in config for this functionality") 155 | 156 | 157 | 158 | #Init toolbbox 159 | snippet = None 160 | if self.config.USE_SNIPPET: 161 | if "./" in self.config.IMGPATH: 162 | ipath = os.path.dirname(ocromore_data["file_info"].path)+self.config.IMGPATH[1:] 163 | else: 164 | ipath = os.path.normcase(self.config.IMGPATH) 165 | results = glob.glob(ipath+ocromore_data["file_info"].name.split(".")[0].replace("_msa_best","")+"*",recursive=True) 166 | if results: 167 | snippet = Snippet() 168 | snippet.imread(results[0]) 169 | else: 170 | self.config.USE_TOOLBBOX = False 171 | info_handler = {} 172 | # start parsing for each successfully segmented area 173 | for segmentation_class in segmentation_classes: 174 | 175 | # if the class segment was recognized ... 
176 | if segmentation_class.is_start_segmented(): 177 | # get the unique identifier for this class 178 | segment_tag = segmentation_class.get_segment_tag() 179 | segmentation_class.snippet = snippet 180 | segmentation_class.info_handler = info_handler 181 | self.trigger_mapped_function(segment_tag, segmentation_class, ocromore_data) 182 | 183 | 184 | # add and return result 185 | ocromore_data['results'] = self.ef 186 | return ocromore_data 187 | 188 | def trigger_mapped_function(self, segment_tag, segmentation_class, ocromore_data): 189 | 190 | if segment_tag not in self.function_map.keys(): 191 | return 192 | #todo: fileinfo -> parsing 193 | real_start_tag, content_texts, content_lines, feature_lines = self.prepare_parsing_info(segmentation_class, ocromore_data) 194 | 195 | # switch the object to save context 196 | segment_tag = segmentation_class.segment_tag 197 | self.ef.set_current_main_list(segment_tag) 198 | 199 | # call the mapped function, which fills the end-factory 200 | self.function_map[segment_tag].__call__(real_start_tag, content_texts, content_lines, feature_lines, segmentation_class) 201 | 202 | def prepare_parsing_info(self, segmentation_class, ocromore_data): 203 | lines = ocromore_data['lines'] 204 | line_features = ocromore_data['line_features'] 205 | real_start_tag, content_texts, content_lines, feature_lines = \ 206 | DataHelper.get_content(lines,line_features, segmentation_class) 207 | 208 | return real_start_tag, content_texts, content_lines, feature_lines 209 | 210 | def get_all_text(self, ocromore_data, join_separated_lines=False): 211 | """ 212 | Gets all text lines in ocromore_data as 213 | array and as joined string 214 | :param ocromore_data: data from which the text is extracted 215 | :return: texts list, complete text 216 | """ 217 | all_texts = [] 218 | complete_text = "" 219 | for line in ocromore_data['lines']: 220 | text = line['text'] 221 | all_texts.append(text) 222 | complete_text += text 223 | 224 | if join_separated_lines: 225 | complete_text = "" 226 | all_texts = dh.join_separated_lines(all_texts) 227 | for text in all_texts: 228 | complete_text += text 229 | 230 | return all_texts, complete_text 231 | 232 | def write_result_to_output(self, as_json, ocromore_data): 233 | if as_json is True: 234 | my_json = self.ef.export_as_json() 235 | my_json_lines = my_json.split("\n") 236 | dh.write_array_to_root("result_json/", my_json_lines, ocromore_data, self.result_root) -------------------------------------------------------------------------------- /lib/segment_parser_endobject_factory.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pprint 3 | from akf_corelib.conditional_print import ConditionalPrint 4 | from akf_corelib.configuration_handler import ConfigurationHandler 5 | from lib.akf_known_uncategories import KnownUncategories 6 | 7 | class EndobjectFactory(object): 8 | """ 9 | Creates an object with the following structure and provides exporting methods: 10 | 11 | segment_tag_1: [ ---> this level is created by set_current_main_list 12 | { 13 | type: "Sitz" ---> add this level entries with add_to_my_object object_number=0 14 | city: "Neustadt" 15 | }, 16 | { 17 | type: "Sitz" ---> add this level entries with add_to_my_object object_number=0 18 | city: "Neustadt" 19 | } 20 | 21 | ], 22 | segment_tag_2: [ 23 | { 24 | ... 25 | } 26 | ... 
27 | ] 28 | """ 29 | def __init__(self): 30 | self.my_object = {} 31 | self.current_main_list = None 32 | self.pp = pprint.PrettyPrinter(indent=5) 33 | 34 | config_handler = ConfigurationHandler(first_init=False) 35 | 36 | self.config = config_handler.get_config() 37 | self.cpr = ConditionalPrint(self.config.PRINT_OUTPUT_ANALYSIS, self.config.PRINT_EXCEPTION_LEVEL, 38 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__) 39 | 40 | if self.config.REMOVE_TAGS_IN_ORIG_DIFF: 41 | self.known_uc = KnownUncategories() 42 | 43 | def set_current_main_list(self, segment_tag): 44 | if segment_tag not in self.my_object.keys(): 45 | self.my_object[segment_tag] = [] # create the main list (all subsequent entries are stored here) 46 | 47 | self.current_main_list = self.my_object[segment_tag] # create a short link on the main list 48 | 49 | def add_to_my_obj(self, key, value, object_number=0, only_filled=False): 50 | 51 | if only_filled is True and (value == None or value == "" or value == [] or value == {}): 52 | return False 53 | 54 | # fill main list if object index not in 55 | len_list = len(self.current_main_list) 56 | if len_list < object_number+1: 57 | for index in range(len_list,object_number+1): 58 | self.current_main_list.append({}) 59 | 60 | self.cpr.print("Adding value to List,- ObjectNr.:", object_number,"Key:", key, "Value:", value) 61 | # add or insert to the main_list 62 | self.current_main_list[object_number][key] = value 63 | return True 64 | 65 | def print_me_and_return(self): 66 | print("my_object is:") 67 | self.pp.pprint(self.my_object) 68 | return self.my_object 69 | 70 | def print_current_main(self): 71 | print("current_main:") 72 | self.pp.pprint(self.current_main_list) 73 | 74 | def export_as_json(self): 75 | my_obj_json = json.dumps(self.my_object, indent=5, ensure_ascii=False) 76 | return my_obj_json 77 | 78 | def export_as_json_at_key(self, key, remove_first_object=False): 79 | 80 | if key not in self.my_object.keys(): 81 | return None 82 | 83 | my_obj = self.my_object[key] 84 | if remove_first_object: 85 | if len(my_obj) >= 1: 86 | my_obj = my_obj[1:] # remove the first object which usally contains generic info 87 | 88 | my_obj_json = json.dumps(my_obj, indent=5, ensure_ascii=False) 89 | return my_obj_json 90 | 91 | @staticmethod 92 | def fetch_subentries_recursive_check(entry): 93 | """ 94 | Fetches all subentries (values) from an entry and writes them to a list of texts 95 | This get's called recursively within the function until all subentries 96 | are found 97 | :param entry: entry to fetch the subentries from 98 | :return: list of subentries 99 | """ 100 | final_texts = [] 101 | 102 | for item in entry: 103 | if isinstance(entry, list): 104 | value = item 105 | else: 106 | # item is a key 107 | value = entry[item] 108 | if isinstance(value, str): 109 | final_texts.append(value) 110 | elif isinstance(value, int): 111 | final_texts.append(str(value)) 112 | elif isinstance(value, object): 113 | obj_size = len(value) 114 | if obj_size > 0: 115 | recursive_texts = EndobjectFactory.fetch_subentries_recursive_check(value) 116 | final_texts.extend(recursive_texts) 117 | 118 | return final_texts 119 | 120 | @staticmethod 121 | def fetch_keys_recusive_check(entry, final_keys, create_multiple=True): 122 | """ 123 | Fetches all keys in an object and it's sub-objects 124 | calls itself recursively until all keys are found 125 | writes final keys to final_keys array and returns this 126 | :param entry: object to fetch the sub-keys from 127 | :param final_keys: list 
of final keys (initial state) 128 | :param create_multiple: if the same key occurs multiple times it still gets added 129 | :return: final_keys with added keys from object 130 | """ 131 | 132 | if isinstance(entry, list): 133 | for item in entry: 134 | final_keys = EndobjectFactory.fetch_keys_recusive_check(item, final_keys, create_multiple) 135 | return final_keys 136 | elif not isinstance(entry, dict): 137 | # just return if there are no keys (cause no dictionary) 138 | return final_keys 139 | 140 | for key in entry: 141 | value = entry[key] 142 | if create_multiple or key not in final_keys: 143 | if isinstance(key, int): 144 | continue 145 | final_keys.append(key) 146 | final_keys = EndobjectFactory.fetch_keys_recusive_check(value, final_keys) 147 | return final_keys 148 | 149 | def diff_seg_to_orig_at_key(self, key): 150 | """ 151 | def fetch_subentries_recursive(entry): 152 | final_texts = [] 153 | 154 | for item in entry: 155 | if isinstance(entry, list): 156 | value = item 157 | else: 158 | # item is a key 159 | value = entry[item] 160 | if isinstance(value, str): 161 | final_texts.append(value) 162 | elif isinstance(value, int): 163 | final_texts.append(str(value)) 164 | elif isinstance(value, object): 165 | obj_size = len(value) 166 | if obj_size > 0: 167 | recursive_texts = fetch_subentries_recursive(value) 168 | final_texts.extend(recursive_texts) 169 | 170 | return final_texts 171 | """ 172 | if key not in self.my_object.keys(): 173 | return None 174 | 175 | my_data = self.my_object[key] 176 | 177 | # check if the orig-post property can exist warn if not 178 | if not self.config.ADD_INFO_ENTRY_TO_OUTPUT: 179 | self.cpr.printw("trying to fetch original data, original data is not added to results") 180 | self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True") 181 | if len(my_data) <= 0: 182 | self.cpr.printw("no data to do returning") 183 | return 184 | 185 | return # todo this seems to be wrong 186 | # copy orig string 187 | original_text = my_data[0]['origpost'] 188 | rest_text = original_text 189 | 190 | # fetch parsed entries for diff 191 | all_final_entries = [] # array of final entries 192 | for index in range(1, len(my_data)): 193 | entry = my_data[index] 194 | final_entries = fetch_subentries_recursive(entry) 195 | all_final_entries.extend(final_entries) 196 | 197 | # order diff data after length 198 | all_final_entries.sort(key=lambda x: len(x)) 199 | all_final_entries.reverse() 200 | 201 | # subtract 202 | for text in all_final_entries: 203 | rest_text = rest_text.replace(text, "") 204 | 205 | rest_text = rest_text.strip() 206 | 207 | return rest_text, original_text 208 | 209 | def diff_parsed_to_orig_at_key(self, key): 210 | """ 211 | def fetch_subentries_recursive(entry): 212 | final_texts = [] 213 | 214 | for item in entry: 215 | if isinstance(entry, list): 216 | value = item 217 | else: 218 | # item is a key 219 | value = entry[item] 220 | if isinstance(value, str): 221 | final_texts.append(value) 222 | elif isinstance(value, int): 223 | final_texts.append(str(value)) 224 | elif isinstance(value, object): 225 | obj_size = len(value) 226 | if obj_size > 0: 227 | recursive_texts = fetch_subentries_recursive(value) 228 | final_texts.extend(recursive_texts) 229 | 230 | return final_texts 231 | 232 | def fetch_keys_recusive(entry, final_keys, create_multiple=True): 233 | # just return if there are no keys (cause no dictionary) 234 | if not isinstance(entry, dict): 235 | return final_keys 236 | 237 | for key in entry: 238 | value = entry[key] 239 | if 
create_multiple or key not in final_keys: 240 | if isinstance(key, int): 241 | continue 242 | final_keys.append(key) 243 | final_keys = fetch_keys_recusive(value, final_keys) 244 | return final_keys 245 | """ 246 | if key not in self.my_object.keys(): 247 | return None 248 | 249 | #if key == "KursVonZuteilungsrechten": 250 | # print("todo remove debug") 251 | 252 | my_data = self.my_object[key] 253 | 254 | # check if the orig-post property can exist warn if not 255 | if not self.config.ADD_INFO_ENTRY_TO_OUTPUT: 256 | self.cpr.printw("trying to fetch original data, original data is not added to results") 257 | self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True") 258 | if len(my_data) <= 0: 259 | self.cpr.printw("no data to do returning") 260 | return 261 | # copy orig string 262 | original_text = my_data[0]['origpost'] 263 | rest_text = original_text 264 | 265 | # fetch parsed entries for diff 266 | pool_entries = [] # array of final entries 267 | for index in range(1, len(my_data)): 268 | entry = my_data[index] 269 | final_entries = EndobjectFactory.fetch_subentries_recursive_check(entry) 270 | pool_entries.extend(final_entries) 271 | 272 | if self.config.REMOVE_SPACES_IN_ORIGIN_DIFF is True: 273 | # removes all spaces from rest and comparison values because spaces are often 274 | # a problem in subtracting the rests 275 | rest_text = rest_text.replace(" ", "") 276 | for index in range(0,len(pool_entries)): 277 | pool_entries[index] = pool_entries[index].replace(" ", "") 278 | 279 | all_final_entries = [] 280 | 281 | # add the entries to the complete subtraction and tag them with '1' 282 | for pentry in pool_entries: 283 | all_final_entries.append((pentry, 1)) 284 | 285 | # if keys shall be subracted also add them also 286 | if self.config.REMOVE_TAGS_IN_ORIG_DIFF: 287 | pool_keys = [] # gets multiple of the same key for later 1 by 1 subtraction 288 | for index in range(1, len(my_data)): 289 | pool_keys = EndobjectFactory.fetch_keys_recusive_check(my_data[index], pool_keys, create_multiple=True) 290 | 291 | # also remove spaces in keys 292 | if self.config.REMOVE_SPACES_IN_ORIGIN_DIFF is True: 293 | for index in range(0, len(pool_keys)): 294 | pool_keys[index] = pool_keys[index].replace(" ", "") 295 | 296 | final_keys = [] 297 | for pkey in pool_keys: 298 | final_keys.append((pkey, 2)) 299 | 300 | all_final_entries.extend(final_keys) 301 | 302 | # order diff data after length 303 | all_final_entries.sort(key=lambda x: len(x[0])) 304 | all_final_entries.reverse() 305 | 306 | # subtract 307 | for entry in all_final_entries: 308 | text = entry[0] 309 | text_or_key = entry[1] 310 | if text_or_key == 2: 311 | if text in self.known_uc.unkeys: 312 | continue 313 | text_stripped = text.strip() # remove spaces so texts better fit in 314 | rest_text = rest_text.replace(text_stripped, "", 1) 315 | rest_text = rest_text.strip() 316 | 317 | return rest_text, original_text 318 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![docxstruct](docs/img/docxstruct_logo.png "docxstruct") 2 | ============ 3 | ![license](https://img.shields.io/badge/license-Apache%20License%202.0-blue.svg) 4 | 5 | Docxstruct parses .hocr-output of [ocromore][ocromore-link] to get a content-classified .json output 6 | for further database export. It is part of the [Aktienführer-Datenarchiv work process][akf-link], 7 | but can also be used independently. 
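The overall processing chain is wired up in main_start.py: per-line feature extraction, segment classification and segment parsing, followed by JSON export. A condensed sketch of that flow (configuration, database handling and the analysis logging are omitted here; see main_start.py for the full setup):

```
# condensed from main_start.py; output_analyzer, dictionary_handler and the
# fetched 'ocromore_data' come from the setup code that is omitted here
from lib.feature_extractor import FeatureExtractor
from lib.segment_classifier import SegmentClassifier
from lib.segment_parser import SegmentParser

feature_extractor = FeatureExtractor()
segment_classifier = SegmentClassifier()
segment_parser = SegmentParser(output_analyzer, dictionary_handler)

ocromore_data = feature_extractor.extract_file_features(ocromore_data)    # per-line features
ocromore_data = segment_classifier.classify_file_segments(ocromore_data)  # line segmentation
ocromore_data = segment_parser.parse_segments(ocromore_data)              # segment parsing
segment_parser.write_result_to_output(True, ocromore_data)                # json output
```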
8 | 
9 | # Installation
10 | 
11 | To initialize the git submodules (git version ~2.7.4):
12 | 
13 | `
14 | git submodule update --init --recursive
15 | `
16 | 
17 | For development, the PyCharm IDE 2017.3 Community Edition was used.
18 | 
19 | 
20 | If you use the PyCharm IDE to look at the accumulated segmentation analysis files, adapt the IDE settings to get a proper view.
21 | This is done in the idea.properties file, which can be opened via Help->Edit Custom Properties in PyCharm:
22 | 
23 | 
24 | `
25 | editor.soft.wrap.force.limit=10000
26 | `
27 | 
28 | 
29 | # Handling Code
30 | `Docxstruct` is made to be adapted for parsing other kinds of content
31 | than *Aktienführer data*. It can be used as a generic text-content recognizer and classifier
32 | and therefore provides lots of analysis helpers and structure for that.
33 | 
34 | Usually all akf-specific content is stored in files called *akf_XXX*;
35 | these are the parts where you might want to put your custom functionality
36 | (see the sketch at the end of the example section below).
37 | 
38 | Ways to do that are described in the following documentation parts.
39 | 
40 | # Input/Output Example
41 | The following example with Aktienführer data explains input and output. This is the basic input,
42 | which usually comes from a hocr file:
43 | ```
44 | [hOCR page markup not recoverable in this snapshot - the example page, titled "OCR Results", contains the OCR'd words:]
45 | Überlandwerk
46 | Unterfranken
47 | Aktiengesellschatt
48 | ```
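49 | 
50 | The output of the parsing step is a content-classified .json object per segment key.
51 | The original output example is not recoverable here; as a rough sketch of the format,
52 | these are the fields the 'Sitz' parser documents in lib/akf_parsing_functions_one.py
53 | (values illustrative):
54 | ```
55 | "Sitz": [
56 |     {
57 |         "origpost": "Mergenthalerallee 79-81, 65760 Eschborn ...",
58 |         "type": "Sitz",
59 |         "street": "Mergenthalerallee",
60 |         "street_number": "79-81",
61 |         "zip": "65760",
62 |         "city": "Eschborn"
63 |     }
64 | ],
65 | ```
66 | 
67 | # Adding a custom parsing function (sketch)
68 | A minimal sketch of a custom parsing function, assuming the call conventions of the existing
69 | akf parsing functions (compare lib/akf_parsing_functions_one.py). The function name, the
70 | result key "my_value" and the wiring into the segment holder are hypothetical and have to
71 | match your own segment definition:
72 | ```
73 | # sketch of a method inside a parsing class like AkfParsingFunctionsOne;
74 | # 'cf' and 'dh' are that file's imports (AKFCommonParsingFunctions, DataHelper)
75 | def parse_my_segment(self, real_start_tag, content_texts, content_lines,
76 |                      feature_lines, segmentation_class):
77 |     # get basic data: logs the element and strips the recognized start tag
78 |     element_counter = 0
79 |     origpost, origpost_red, element_counter, content_texts = \
80 |         cf.add_check_element(self, content_texts, real_start_tag,
81 |                              segmentation_class, element_counter)
82 | 
83 |     # minimal content handling: store the cleaned rest text under one key
84 |     value = dh.strip_if_not_none(origpost_red, ".,() ")
85 |     self.ef.add_to_my_obj("my_value", value, object_number=element_counter,
86 |                           only_filled=True)
87 |     return True
88 | ```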
57 | = 1 and text[len_text-1] == ":": 31 | text = text[0:len_text-1] 32 | 33 | return text 34 | 35 | 36 | @staticmethod 37 | def get_rest_content_start_line(segmentation_class, start_line, trim=True): 38 | text = start_line['text'] 39 | stop = segmentation_class.key_tag_cindex_stop 40 | rest_start = text[stop:] 41 | if trim: 42 | rest_start = rest_start.strip() 43 | return rest_start 44 | 45 | @staticmethod 46 | def remove_multiple_outbound_chars(text): 47 | """ 48 | Strips the left and the right side of special characters in a string 49 | and returns the stripped version then: 50 | example ".;my text is;,,," returns "my text is" 51 | :param text: input text 52 | :return: filtered text 53 | """ 54 | # print("input:", text) 55 | 56 | text_to_change = text 57 | 58 | # filter left side 59 | match_l = regex.search(r"^[^\w\s]*(?.*)", text_to_change) 60 | if match_l: 61 | rest = match_l.group("tag") 62 | text_to_change = rest 63 | 64 | if text_to_change == "": 65 | return text_to_change 66 | 67 | # filter right side 68 | match_r2 = regex.search(r"(?P[^\w\s]*)$", text_to_change) 69 | 70 | if match_r2: 71 | rest = match_r2.group("right_rest") 72 | text_to_change = DataHelper.rreplace(text_to_change, rest) 73 | 74 | # print("output:", text_to_change) 75 | return text_to_change 76 | 77 | @staticmethod 78 | def rreplace(text, replace_text): 79 | """ 80 | Replace text from the right hand side of a string 81 | by reversing the strings 82 | :param text: input text 83 | :return: filtered text 84 | """ 85 | reverse_text = text[::-1] 86 | reverse_replace_text = replace_text[::-1] 87 | new_reverse_text = reverse_text.replace(reverse_replace_text, "") 88 | new_text = new_reverse_text[::-1].strip() 89 | 90 | return new_text 91 | 92 | 93 | @staticmethod 94 | def get_content(segment_lines, feature_lines, segmentation_class): 95 | start_index = segmentation_class.get_start_line_index() 96 | stop_index = segmentation_class.get_stop_line_index() 97 | selected_start_line = segment_lines[start_index] 98 | feature_start_line = feature_lines[start_index] 99 | real_tag = DataHelper.get_real_tag_from_segment(segmentation_class, selected_start_line) 100 | rest_content_start_line = DataHelper.get_rest_content_start_line(segmentation_class, selected_start_line) 101 | 102 | # if there are no further line, return obtained content 103 | if start_index == stop_index: 104 | return real_tag, [rest_content_start_line], [selected_start_line], [feature_start_line] 105 | 106 | # otherwise fetch the rest of the content 107 | other_rest_content_texts = [] 108 | other_rest_content_lines = [] 109 | other_rest_feature_lines = [] 110 | 111 | other_rest_content_texts.append(rest_content_start_line) 112 | other_rest_content_lines.append(selected_start_line) 113 | other_rest_feature_lines.append(feature_start_line) 114 | 115 | for current_index in range(start_index+1, stop_index+1): 116 | current_line = segment_lines[current_index] 117 | current_feature_lines = feature_lines[current_index] 118 | other_rest_content_texts.append(current_line['text']) 119 | other_rest_content_lines.append(current_line) 120 | other_rest_feature_lines.append(current_feature_lines) 121 | 122 | return real_tag, other_rest_content_texts, other_rest_content_lines, other_rest_feature_lines 123 | 124 | 125 | @staticmethod 126 | def write_array_to_root_simple(base_path, tag, text_lines, analysis_root, append_mode=False): 127 | full_dir = analysis_root + base_path + "/" 128 | full_path = full_dir + tag + ".txt" 129 | 130 | fh.create_directory_tree(full_dir) 131 | # write 
append or normal 132 | if append_mode is True: 133 | my_file = io.open(full_path, 'a', encoding='utf8') 134 | else: 135 | my_file = io.open(full_path, 'w', encoding='utf8') 136 | 137 | for text_line in text_lines: 138 | my_file.write(text_line+"\n") 139 | 140 | my_file.close() 141 | 142 | @staticmethod 143 | def write_array_to_root(base_path, text_lines, ocromore_data, analysis_root, accumulated=False): 144 | """ 145 | Writes a line-array to the base path in root path with ocromore data file and db name 146 | :param base_path: 147 | :param text_lines: 148 | :param ocromore_data: 149 | :param analysis_root: root path in base directory 150 | :param accumulated: file is accumulated file naming different 151 | :return: 152 | """ 153 | 154 | dbpath = ocromore_data['file_info'].dbpath 155 | tablename = ocromore_data['file_info'].tablename 156 | 157 | full_dir = analysis_root + base_path + dbpath+"/" 158 | if accumulated is False: 159 | full_path = full_dir + tablename + ".txt" 160 | else: 161 | full_path = full_dir +"accumulated_report"+".txt" 162 | 163 | fh.create_directory_tree(full_dir) 164 | 165 | my_file = io.open(full_path, 'w', encoding='utf8') 166 | 167 | for text_line in text_lines: 168 | my_file.write(text_line+"\n") 169 | 170 | my_file.close() 171 | 172 | @staticmethod 173 | def create_stringified_linearray(array_of_texts): 174 | final_string = "" 175 | for line_text in array_of_texts: 176 | final_string += line_text+"\n" 177 | 178 | final_string = final_string.strip() 179 | return final_string, final_string.replace("\n", " ") 180 | 181 | @staticmethod 182 | def strip_if_not_none(text, strip_pattern): 183 | if text is None: 184 | return text 185 | else: 186 | if strip_pattern != "": 187 | return text.strip(strip_pattern) 188 | else: 189 | return text.strip() 190 | 191 | @staticmethod 192 | def join_joined_lines(joined_texts, add_spaces=True): 193 | """ 194 | Takes the output from 'join_separated_lines' and joins the lines to one 195 | string 196 | :param joined_texts: array of texts 197 | :param add_spaces: add a space between joined texts 198 | :return: joined string 199 | """ 200 | return_text = "" 201 | 202 | for text in joined_texts: 203 | if add_spaces is True: 204 | return_text += " "+text 205 | else: 206 | return_text += text 207 | 208 | return_text = return_text.strip() 209 | 210 | return return_text 211 | 212 | 213 | @staticmethod 214 | def join_separated_lines(content_texts): 215 | """ 216 | Joins dash separated lines in the text list (reduces the number of entries, if 217 | there are such lines) 218 | :param content_texts: text list to join 219 | :return: text array where all dash separated lines are joined 220 | """ 221 | 222 | # final array with joined texts 223 | joined_texts = [] 224 | # intermediate array for storing tagged lines (normal line:0 or separator_line:1) 225 | NORMAL_LINE = 0 226 | SEPARATOR_LINE = 1 227 | LAST_LINE = 2 228 | 229 | tagged_texts = [] 230 | 231 | len_content_texts = len(content_texts) 232 | 233 | #if len_content_texts == 42: 234 | # print("asd") 235 | 236 | # iterate the given texts 237 | for text_index, text in enumerate(content_texts): 238 | if text is None: 239 | continue 240 | #if "Kommanditeinlagen" in text: 241 | # print("asd") 242 | 243 | # if there is one, get the follow up text 244 | next_text = None 245 | if text_index < len_content_texts - 1: 246 | next_text = content_texts[text_index + 1].strip() 247 | 248 | # detect line with separator 249 | if (len(text) >= 2 and "-" in text[-1]): 250 | line_ends_with_amount = False 251 | 252 | # 
this is a line which ends with a amount indicator like '6 500 000. -' 253 | # and therefore no separator 254 | if len(text) >= 3 and "-" in text[-1] and " " in text[-2] and "." in text[-3]: 255 | line_ends_with_amount = True 256 | elif len(text) >= 2 and "-" in text[-1] and "." in text[-2]: 257 | line_ends_with_amount = True 258 | elif len(text) >= 2 and "-" in text[-1] and text[-2].isdigit(): 259 | line_ends_with_amount = True # no amount, but similar case it's a timespan '1996-\n1997' or similar 260 | 261 | if not line_ends_with_amount and next_text is not None and len(next_text) >= 1: 262 | 263 | # if the next starting letter is uppercase don't do the joining (assuming it's a '-' 264 | # separated Name like "Jan-Phillipp") 265 | if not next_text[0].isupper(): 266 | # fetch the next text in current and remove separator 267 | text = text[0:len(text) - 1] 268 | # store in tagged texts 269 | tagged_texts.append((text, SEPARATOR_LINE)) 270 | continue 271 | 272 | if text_index >= len_content_texts: 273 | tagged_texts.append((text, LAST_LINE)) 274 | break 275 | 276 | # append to tagged texts 277 | tagged_texts.append((text, NORMAL_LINE)) 278 | 279 | # join the tagged texts 280 | 281 | for current_index, ttext_info in enumerate(tagged_texts): 282 | if ttext_info == None: 283 | continue # line was already joined 284 | 285 | current_ttext, current_id = ttext_info 286 | if current_id == NORMAL_LINE: 287 | joined_texts.append(current_ttext) 288 | elif current_id == SEPARATOR_LINE: 289 | # check all follow up lines 290 | for follow_up_index in range(current_index+1, len(tagged_texts)): 291 | follow_ttext, follow_id = tagged_texts[follow_up_index] 292 | current_ttext = current_ttext + follow_ttext 293 | tagged_texts[follow_up_index] = None 294 | if follow_id == NORMAL_LINE or follow_id == LAST_LINE: 295 | # update my new array 296 | joined_texts.append(current_ttext) 297 | break # done escape the inner loop 298 | elif follow_id == SEPARATOR_LINE: 299 | continue # continue inner loop 300 | 301 | # return the modified list 302 | return joined_texts 303 | 304 | @staticmethod 305 | def join_separated_lines_parenthesis(content_texts): 306 | next_lines_is_ending_parenthesis = False # indicator - 307 | next_closing_ordinal = -1 # indicator - the n-th closing parenthesis closes the previous block 308 | change = False 309 | final_entries = [] 310 | 311 | len_content_texts = len(content_texts) 312 | for text_index, text in enumerate(content_texts): 313 | 314 | # if there was a case detect add this line to the previous one instead of appending as new line 315 | if next_lines_is_ending_parenthesis: 316 | 317 | text_split = text.split(')') 318 | text_to_add = "" 319 | rest_text = "" 320 | # define next closing ordinal, sometimes overflow todo this is not 100% accurate 321 | used_closing_ordinal = 0 322 | if next_closing_ordinal > 0: 323 | used_closing_ordinal = next_closing_ordinal 324 | for tf_index, text_fragment in enumerate(text_split): 325 | if tf_index <= used_closing_ordinal: 326 | text_to_add += " " + text_fragment+")" 327 | else: 328 | if text_fragment.strip != "": 329 | # only add delimiters if not at end of split 330 | if tf_index == len(text_split)-1: 331 | rest_text += " " + text_fragment 332 | else: 333 | rest_text += " " + text_fragment+")" 334 | 335 | final_entries[-1] += " " + text_to_add.strip() # add until parenthesis end then go on 336 | next_lines_is_ending_parenthesis = False 337 | change = True # change debugging indicator 338 | # change current text to only rest 339 | text = 
rest_text.strip() 340 | #print(final_entries) 341 | if text == ")": 342 | continue 343 | 344 | # check if there is more opening parenthesis 345 | opening_parenthesis = text.count("(") 346 | closing_parenthesis = text.count(")") 347 | 348 | if opening_parenthesis <= closing_parenthesis: 349 | final_entries.append(text) 350 | continue 351 | 352 | # assign next text otherwise continue 353 | next_text = None 354 | if text_index+1 < len_content_texts: 355 | next_text = content_texts[text_index + 1] 356 | else: 357 | final_entries.append(text) 358 | continue 359 | 360 | next_opening_parentesis = next_text.count("(") 361 | next_closing_parenthesis = next_text.count(")") 362 | 363 | if next_closing_parenthesis == 0: 364 | final_entries.append(text) 365 | continue 366 | 367 | # if code ran until here the lines are a concat case 368 | final_entries.append(text) 369 | next_lines_is_ending_parenthesis = True 370 | next_closing_ordinal = opening_parenthesis-closing_parenthesis - next_closing_parenthesis 371 | 372 | #if change: 373 | # print("debug") 374 | 375 | return final_entries 376 | 377 | @staticmethod 378 | def filter_special_chars(text, remove_spaces=True): 379 | """ 380 | Remove special characters from input text 381 | :param text: input text 382 | :param remove_spaces: if true also removes spaces 383 | :return: filtered text 384 | """ 385 | 386 | if remove_spaces: 387 | text_filtered = re.sub('[^A-Za-z0-9]+', '', text) 388 | else: 389 | text_filtered = re.sub('[^A-Za-z0-9\s]+', '', text) 390 | 391 | return text_filtered -------------------------------------------------------------------------------- /lib/segment_classifier.py: -------------------------------------------------------------------------------- 1 | from akf_corelib.conditional_print import ConditionalPrint 2 | from akf_corelib.configuration_handler import ConfigurationHandler 3 | from lib.akf_segment_holder import SegmentHolder 4 | from lib.data_helper import DataHelper as dh 5 | import inspect 6 | 7 | class SegmentClassifier(object): 8 | """ 9 | This is the basic handler for classification 10 | which get's accessed from root/-outside classes. 
11 | """ 12 | 13 | def __init__(self): 14 | 15 | config_handler = ConfigurationHandler(first_init=False) 16 | 17 | self.config = config_handler.get_config() 18 | self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_CLASSIFIER, self.config.PRINT_EXCEPTION_LEVEL, 19 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__) 20 | self.cpr.print("init segment classifier") 21 | 22 | def classify_file_segments(self, ocromore_data): 23 | lines = ocromore_data['lines'] 24 | feats = ocromore_data['line_features'] 25 | file_info = ocromore_data['file_info'] 26 | all_file_segments = AllSegments(len(lines), self.cpr, self.config) 27 | 28 | prev_line = None 29 | prev_text = None 30 | for current_line_index, current_line in enumerate(lines): 31 | current_features = feats[current_line_index] 32 | current_text = current_line['text'] 33 | current_index = current_line['line_index'] 34 | # create a combined lined object with optimized (removed) separation 35 | combined_line = None 36 | if prev_line is not None: 37 | combined_lines = dh.join_separated_lines([prev_text, current_text]) 38 | combined_line = dh.join_joined_lines(combined_lines) 39 | else: 40 | combined_line = current_text 41 | # pass parameters to matching functions 42 | all_file_segments.match_my_segments(current_line, current_text, current_index, current_features, 43 | prev_line, combined_line) 44 | prev_line = current_line 45 | prev_text = current_text 46 | 47 | 48 | 49 | 50 | if self.config.MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION: 51 | self.adapt_non_explicit_indices(all_file_segments) 52 | else: 53 | all_file_segments.correct_overlaps_index_field(only_start_tags=True) 54 | 55 | self.adapt_stop_index_in_last_segment(all_file_segments) 56 | 57 | 58 | # does the last steps in segment matching 59 | all_file_segments.finish_segment_matching(lines, feats, file_info) 60 | 61 | # do again after final step 62 | if self.config.MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION: 63 | self.adapt_non_explicit_indices(all_file_segments) 64 | else: 65 | all_file_segments.correct_overlaps_index_field(only_start_tags=True) 66 | 67 | self.adapt_stop_index_in_last_segment(all_file_segments) 68 | 69 | 70 | 71 | 72 | ocromore_data['segmentation'] = all_file_segments 73 | 74 | return ocromore_data 75 | 76 | 77 | def adapt_stop_index_in_last_segment(self, all_file_segments): 78 | """ 79 | Sets the stop_index for the last recognized segment, which 80 | is a special case and is usually not filled beforehand, because 81 | there is no next start index 82 | :param all_file_segments: holder object for segment classes and other info 83 | :return: None 84 | """ 85 | 86 | # search for last segment 87 | saved_start_index = -1 88 | saved_last_segment = None 89 | for segment in all_file_segments.my_classes: 90 | # only count segmented segments 91 | if segment.start_was_segmented is False: 92 | continue 93 | 94 | if segment.start_line_index >= saved_start_index: 95 | saved_start_index = segment.start_line_index 96 | saved_last_segment = segment 97 | 98 | if saved_last_segment is None: 99 | return 100 | 101 | # adapt the last stop index of last segment 102 | saved_last_segment.stop_line_index = all_file_segments.number_of_lines-1 103 | saved_last_segment.stop_was_segmented = True # todo think about if this is necessary? 
104 | 
105 | 
106 | 
107 | 
108 | 
109 |     def adapt_non_explicit_indices(self, all_file_segments):
110 | 
111 |         # update start and explicit stop tags first
112 |         all_file_segments.correct_overlaps_index_field(only_start_tags=True)
113 | 
114 |         # fill undefined stop regions until the next start region
115 |         all_file_segments.fill_start_index_until_next_stop()
116 | 
117 | 
118 | class AllSegments(object):
119 |     """
120 |     Accessor class for the segmentation of a file
121 |     """
122 | 
123 |     def __init__(self, number_of_lines, cpr, config):
124 |         # init all internal-classification classes
125 |         self.index_field = []
126 |         self.my_classes = []
127 |         self.my_only_indices = []
128 |         self.instantiate_classification_classes()
129 |         self.number_of_lines = number_of_lines
130 |         self.initialize_index_field(number_of_lines)
131 |         self.cpr = cpr
132 |         self.config = config
133 |         self.get_only_classes()
134 | 
135 |     def get_only_classes(self):
136 |         """
137 |         Gets all classes which are tagged by the 'only' flag
138 |         :return:
139 |         """
140 |         for segment_index, segment_class in enumerate(self.my_classes):
141 |             if segment_class.only is True:
142 |                 self.my_only_indices.append(segment_index)
143 | 
144 |         if len(self.my_only_indices) >= 1:
145 |             self.cpr.print("using only indices, since there is at least one class set to only")
146 | 
147 |     def initialize_index_field(self, number_of_lines):
148 |         self.index_field = []
149 | 
150 |         for ctr in range(0, number_of_lines):
151 |             self.index_field.append(False)
152 | 
153 |     def correct_overlaps_index_field(self, only_start_tags=False):
154 |         """
155 |         Debugging function to correct areas where a stop tag overlaps the next start tag
156 |         Attention: This reinitializes (overwrites) the existing index field
157 |         :return:
158 |         """
159 | 
160 |         # reinitialize index field
161 |         self.initialize_index_field(self.number_of_lines)
162 | 
163 |         # iterate the classes - the 'only' filter is not applied here, because this is meant for bigger sets of classes
164 |         for segment_class_index, segment_class in enumerate(self.my_classes):
165 |             if not segment_class.enabled:
166 |                 continue
167 |             # todo check here ok ?
168 |             self.update_index_field(segment_class, only_start_tags=True)
169 | 
170 |         if only_start_tags is True:
171 |             return self
172 | 
173 |         # iterate again and update the stop tags in a manner that they are only updated until the next start tag
174 |         for segment_class_index, segment_class in enumerate(self.my_classes):
175 |             if not segment_class.enabled:
176 |                 continue
177 |             if not segment_class.is_start_segmented():
178 |                 continue
179 | 
180 |             self.update_stop_tags(segment_class)
181 | 
182 | 
183 |         return self
184 | 
185 |     def fill_start_index_until_next_stop(self):
186 |         """
187 |         Fills each segment's start up to the next segment's stop, if it has no explicitly defined stop tag
188 |         Adapts the index field and the segment stop properties
189 |         :return:
190 |         """
191 |         for segment_class_index, segment_class in enumerate(self.my_classes):
192 |             if not segment_class.enabled:
193 |                 continue
194 |             if segment_class.is_start_segmented() is False:
195 |                 # the segment wasn't found at all, so no filling needed
196 |                 continue
197 |             if segment_class.is_stop_segmented() is True:
198 |                 # class already has a stop and therefore doesn't need to be filled
199 |                 continue
200 | 
201 |             # search until the next found tag
202 |             for index in range(segment_class.start_line_index+1, len(self.index_field)):
203 |                 current_field_item = self.index_field[index]
204 |                 if current_field_item is not False:
205 |                     # the next item begins, done with filling
206 |                     segment_class.set_stop_segmented(index-1)  # toggles stop_segmented, sets index
207 |                     break
208 |                 else:
209 |                     # field item is False, fill it with the current segment tag
210 |                     self.index_field[index] = segment_class.segment_tag
211 | 
212 | 
213 |     def update_index_field(self, segmentation_class, only_start_tags=False):
214 |         segment_tag = segmentation_class.segment_tag
215 |         start_line_index = segmentation_class.start_line_index
216 |         stop_line_index = segmentation_class.stop_line_index
217 | 
218 |         # if no start condition is set - no update
219 |         if start_line_index == -1:
220 |             return
221 | 
222 |         # if there is a start condition but no end condition, just update the first line
223 |         if stop_line_index == -1:
224 |             stop_line_index = start_line_index + 1
225 | 
226 |         # fix some index glitches
227 |         if start_line_index > stop_line_index:
228 |             stop_line_index = start_line_index
229 | 
230 |         if start_line_index == stop_line_index:
231 |             stop_line_index = start_line_index + 1
232 | 
233 |         # special option for debugging purposes
234 |         if only_start_tags is True:
235 |             stop_line_index = start_line_index
236 | 
237 |         for index in range(start_line_index, stop_line_index+1):
238 |             self.index_field[index] = segment_tag
239 | 
240 |     def update_stop_tags(self, segmentation_class):
241 |         segment_tag = segmentation_class.segment_tag
242 |         start_line_index = segmentation_class.start_line_index
243 |         stop_line_index = segmentation_class.stop_line_index
244 |         index_field_len = len(self.index_field)
245 | 
246 | 
247 | 
248 |         for index in range(start_line_index+1, index_field_len):
249 | 
250 |             # update until the next defined field appears
251 |             if self.index_field[index] is not False:
252 |                 break
253 | 
254 |             self.index_field[index] = segment_tag
255 | 
256 |     def instantiate_classification_classes(self):
257 |         dict_test = SegmentHolder.__dict__.items()
258 | 
259 |         for key, value in dict_test:
260 |             if inspect.isclass(value):
261 |                 my_instance = value()
262 |                 self.my_classes.append(my_instance)
263 | 
264 |     def finish_segment_matching(self, lines, feats, file_info):
265 |         """
266 |         Final step in segmentation; covers special segmentation cases which can,
267 |         for example, only be done after everything else is segmented.
268 |         :param lines: text line objects of the file
269 |         :param feats: line feature objects
270 |         :param file_info: file info object of the current file
271 |         :return:
272 |         """
273 | 
274 |         # special case: match firmenname at the end
275 |         for segment_class_index, segment_class in enumerate(self.my_classes):
276 |             if not isinstance(segment_class, SegmentHolder.SegmentFirmenname):
277 |                 continue  # only firmenname is handled here, it is matched at the very end
278 | 
279 |             start_updated = segment_class.match_start_condition(lines, lines, self.index_field, feats, len(lines), file_info, None)
280 | 
281 |             start_updated = False  # note: overrides the match result above, the final firmenname update is disabled here
282 |             if start_updated:
283 |                 # there was a change -> update the indices fields
284 |                 self.update_index_field(segment_class)
285 | 
286 |             break  # this only occurs once
287 | 
288 | 
289 |     # overall function that iterates over all matches
290 |     def match_my_segments(self, line, line_text, line_index, features, prev_line, combined_line):
291 | 
292 |         # 'only'-tagged class usage
293 |         using_only_classes = False
294 |         if len(self.my_only_indices) >= 1:
295 |             using_only_classes = True
296 | 
297 |         # iterate classes
298 |         for segment_class_index, segment_class in enumerate(self.my_classes):
299 |             if not segment_class.enabled:
300 |                 continue
301 | 
302 |             if using_only_classes:
303 |                 # if at least one class was tagged 'only', skip all classes that are not tagged 'only'
304 |                 if segment_class_index not in self.my_only_indices:
305 |                     continue
306 | 
307 | 
308 |             if isinstance(segment_class, SegmentHolder.SegmentFirmenname):
309 |                 continue  # skip firmenname at firsthand, this will be matched in the end
310 | 
311 | 
312 |             start_updated = False
313 |             stop_updated = False
314 | 
315 | 
316 |             if self.config.REMATCH_START_CONDITION_UNTIL_ZERO_ERROR is True:
317 |                 # do segmenting until an error rate of zero is reached
318 |                 start_error_number_before_match = segment_class.get_start_error_number()
319 |                 if not segment_class.is_start_segmented() or segment_class.get_start_error_number() >= 1:
320 |                     start_updated = segment_class.match_start_condition(line, line_text, line_index, features,
321 |                                                                         self.number_of_lines, prev_line, combined_line)
322 |                     start_error_number_after_match = segment_class.get_start_error_number()
323 |                     if start_error_number_before_match <= start_error_number_after_match:
324 |                         # only update if the recognized error number is lower
325 |                         start_updated = False
326 | 
327 |                 stop_error_number_before_match = segment_class.get_stop_error_number()
328 |                 if not segment_class.is_stop_segmented() or segment_class.get_stop_error_number() >= 1:
329 |                     stop_updated = segment_class.match_stop_condition(line, line_text, line_index, features,
330 |                                                                       self.number_of_lines, prev_line, combined_line)
331 |                     stop_error_number_after_match = segment_class.get_stop_error_number()
332 |                     if stop_error_number_before_match <= stop_error_number_after_match:
333 |                         # only update if the recognized error number is lower
334 |                         stop_updated = False
335 | 
336 |             else:
337 |                 # just hit the first match and stop matching then -> standard mode
338 |                 if not segment_class.is_start_segmented():
339 |                     start_updated = segment_class.match_start_condition(line, line_text, line_index, features,
340 |                                                                         self.number_of_lines, prev_line, combined_line)
341 |                 if not segment_class.is_stop_segmented():
342 |                     stop_updated = segment_class.match_stop_condition(line, line_text, line_index, features,
343 |                                                                       self.number_of_lines, prev_line, combined_line)
344 | 
345 |             if start_updated or stop_updated:
346 | 
347 |                 if stop_updated:
348 |                     start_line_index = segment_class.start_line_index
349 |                     stop_line_index = segment_class.stop_line_index
350 |                     for segment in self.my_classes:
351 |                         if type(segment) == type(segment_class):
352 |                             continue
353 |                         current_start_line_index = segment.start_line_index
354 |                         current_stop_line_index = segment.stop_line_index
355 | 
356 |                         if current_start_line_index != -1 and (current_start_line_index >= start_line_index and current_start_line_index <= stop_line_index):
357 |                             segment.set_start_segmented(-1)
358 |                             segment.start_was_segmented = False
359 |                         if current_stop_line_index != -1 and (current_stop_line_index >= start_line_index and current_stop_line_index <= stop_line_index):
360 |                             segment.set_stop_segmented(-1)
361 |                             segment.stop_was_segmented = False
362 | 
363 | 
364 |                 # there was a change -> update the indices fields
365 |                 self.update_index_field(segment_class)
366 | 
367 | 
368 | 
369 | 
--------------------------------------------------------------------------------
/lib/akf_parsing_functions_one.py:
--------------------------------------------------------------------------------

1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from .data_helper import DataHelper as dh
4 | from .akf_parsing_functions_common import AKFCommonParsingFunctions as cf
5 | 
6 | import regex
7 | 
8 | 
9 | class AkfParsingFunctionsOne(object):
10 | 
11 |     def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
12 |         config_handler = ConfigurationHandler(first_init=False)
13 | 
14 |         self.config = config_handler.get_config()
15 |         self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_ONE, self.config.PRINT_EXCEPTION_LEVEL,
16 |                                     self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
17 | 
18 |         self.cpr.print("init akf parsing functions one")
19 | 
20 |         self.ef = endobject_factory
21 |         self.output_analyzer = output_analyzer
22 |         self.dictionary_handler = dictionary_handler
23 | 
24 | 
25 |     def parse_firmenname(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
26 |         # get basic data
27 |         element_counter = 0
28 | 
29 |         origpost, origpost_red, element_counter, content_texts = \
30 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
31 | 
32 |         # get relevant info
33 |         accumulated_text = ""
34 |         for text in content_texts:
35 |             accumulated_text += " " + text
36 | 
37 |         only_add_if_value = False
38 |         accumulated_text = accumulated_text.strip()
39 |         self.ef.add_to_my_obj("Firmenname", accumulated_text, object_number=element_counter, only_filled=only_add_if_value)
40 | 
41 | 
42 |     def parse_sitz(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
43 |         """
44 |         "Sitz": [
45 |             {
46 |                 "origpost": "Mergenthalerallee 79-81, 65760 Eschborn Telefon:(069) 7 50 06-0 Telefax:(069) 7 50 06-111 e-mail:info@3u.net Internetseite:http://www.3u.net ",
47 |                 "type": "Sitz",
48 |                 "street": "Mergenthalerallee",
49 |                 "street_number": "79-81",
50 |                 "zip": "65760",
51 |                 "city": "Eschborn",
52 |                 "phone": "(069) 7 50 06-0",
53 |                 "fax": "(069) 7 50 06-111",
54 |                 "email": [
55 |                     "info@3u.net"
56 |                 ],
57 |                 "www": [
58 |                     "http://www.3u.net"
59 |                 ]
60 |             }
61 |         ],
62 |         """
63 |         # get basic data
64 |         element_counter = 0
65 | 
66 |         origpost, origpost_red, element_counter, content_texts = \
67 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
68 | 
69 |         # get relevant info
70 |         num_id, city, street, street_number, additional_info = cf.parse_id_location(origpost_red)
71 | 
72 |         # add stuff to ef
73 |         only_add_if_value = True
74 |         self.ef.add_to_my_obj("numID", num_id, object_number=element_counter, only_filled=only_add_if_value)
75 |         self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_value)
76 |         self.ef.add_to_my_obj("street", street, object_number=element_counter, only_filled=only_add_if_value)
77 |         self.ef.add_to_my_obj("street_number", street_number, object_number=element_counter, only_filled=only_add_if_value)
78 |         self.ef.add_to_my_obj("additional_info", additional_info, object_number=element_counter, only_filled=only_add_if_value)
79 | 
80 |         return True
81 | 
82 |     def parse_verwaltung(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
83 | 
84 |         # get basic data
85 |         element_counter = 0
86 |         origpost, origpost_red, element_counter, content_texts = \
87 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
88 | 
89 |         # logme
90 |         # self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
91 | 
92 |         if "srat" in real_start_tag:
93 |             # Verwaltungsrat ..
94 |             persons_final = cf.parse_persons(origpost_red, self.dictionary_handler,
95 |                                              self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
96 |             only_add_if_filed = True
97 |             for entry in persons_final:
98 |                 name, first_name, last_name, city, title, funct, rest_info = entry
99 |                 self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled=only_add_if_filed)
100 |                 self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter,
101 |                                       only_filled=only_add_if_filed)
102 |                 self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter,
103 |                                       only_filled=only_add_if_filed)
104 | 
105 |                 self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_filed)
106 |                 self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_filed)
107 |                 self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled=only_add_if_filed)
108 |                 self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
109 | 
110 |                 element_counter += 1
111 |             return True
112 |         elif "Verw." in real_start_tag:
113 |             # Verw.
114 |             num_id, city, street, street_number, additional_info = cf.parse_id_location(origpost_red)
115 | 
116 |             # add stuff to ef
117 |             only_add_if_value = True
118 |             self.ef.add_to_my_obj("numID", num_id, object_number=element_counter, only_filled=only_add_if_value)
119 |             self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_value)
120 |             self.ef.add_to_my_obj("street", street, object_number=element_counter, only_filled=only_add_if_value)
121 |             self.ef.add_to_my_obj("street_number", street_number, object_number=element_counter,
122 |                                   only_filled=only_add_if_value)
123 |             self.ef.add_to_my_obj("additional_info", additional_info, object_number=element_counter,
124 |                                   only_filled=only_add_if_value)
125 | 
126 |             return True
127 |         else:
128 |             # Verwaltung
129 |             final_items = cf.parse_general_and_keys(content_texts,
130 |                                                     join_separated_lines=False,
131 |                                                     current_key_initial_value="General_Info")
132 |             for key in final_items.keys():
133 |                 value = final_items[key]
134 |                 if value is None or value == "":
135 |                     continue
136 |                 self.ef.add_to_my_obj(key, value, object_number=element_counter, only_filled=True)
137 |                 element_counter += 1
138 |             return True
139 | 
140 |     def parse_telefon_fernruf(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
141 | 
142 |         # get basic data
143 |         origpost, origpost_red, element_counter, content_texts = cf.add_check_element(self, content_texts,
144 |                                                                                       real_start_tag, segmentation_class, 0)
145 |         # do special match: Verwaltung und Betriebshof
146 |         split_post = []
147 | 
148 |         match_special = regex.match(r"(?P<Verw>Verwaltung.*)"
149 |                                     r"(?P<Betr>Betriebshof.*)"
150 |                                     , origpost_red)
151 |         if match_special:
152 |             betriebshof = match_special.group("Betr")
153 |             verwaltung = match_special.group("Verw")
154 |             origpost_red = origpost_red.replace(betriebshof, "")
155 |             origpost_red = origpost_red.replace(verwaltung, "")
156 |             split_post.append(betriebshof)
157 |             split_post.append(verwaltung)
158 |         # do special match: Ortsgespräche and Ferngespräche
159 | 
160 |         match_special2 = regex.match(r"(?P<og>Ortsgespräche.*)"
161 |                                      r"(?P<fg>Ferngespräche.*)"
162 |                                      , origpost_red)
163 |         if match_special2:
164 |             ortsgespr = match_special2.group("og")
165 |             ferngespr = match_special2.group("fg")
166 |             origpost_red = origpost_red.replace(ortsgespr, "")
167 |             origpost_red = origpost_red.replace(ferngespr, "")
168 |             split_post.append(ortsgespr)
169 |             split_post.append(ferngespr)
170 | 
171 | 
172 | 
173 |         # do special match: Ortsverkehr and Fernverkehr
174 | 
175 |         match_special3 = regex.match(r"(?P<ov>Ortsverkehr.*)"
176 |                                      r"(?P<fv>Fernverkehr.*)"
177 |                                      , origpost_red)
178 |         if match_special3:
179 |             ortsverkehr = match_special3.group("ov")
180 |             fernverkehr = match_special3.group("fv")
181 |             origpost_red = origpost_red.replace(ortsverkehr, "")
182 |             origpost_red = origpost_red.replace(fernverkehr, "")
183 |             split_post.append(ortsverkehr)
184 |             split_post.append(fernverkehr)
185 | 
186 |         # do special match: check if only numbers
187 |         origpost_red_new = origpost_red
188 | 
189 |         test_split = regex.split(r"\su\.|\sund\s|,|;", origpost_red)
190 |         for number in test_split:
191 |             # additional parenthesis block
192 |             match_parenthesis = regex.search(r"\(.*\)", number)
193 |             parenthesis = None
194 |             if match_parenthesis:
195 |                 parenthesis = match_parenthesis.group()
196 |                 number = number.replace(parenthesis, "")  # remove the vorwahl part from the number string
197 |                 self.ef.add_to_my_obj("vorwahl", parenthesis, object_number=element_counter, only_filled=True)
198 | 
199 | 
200 |             match_word_num = regex.search(r"(?P<word>[^\d]*)(?P<num>[\d\s\-/]*)", number)
201 |             if match_word_num is None:
202 |                 continue
203 | 
204 |             word = match_word_num.group("word")
205 |             num = match_word_num.group("num")
206 |             if "Sa." in word and "Nr" in word:
207 |                 continue
208 |             number_stripped = num.strip(" ./").replace("/", "").replace("-", "").replace(" ", "")
209 |             if number_stripped.isdigit():
210 |                 origpost_red_new = origpost_red_new.replace(number, "")  # remove number
211 |                 origpost_red_new = origpost_red_new.replace(word, "")  # remove the word found
212 | 
213 |                 change1 = self.ef.add_to_my_obj("number_Sa.-Nr.", num.strip(), object_number=element_counter, only_filled=True)
214 |                 change2 = self.ef.add_to_my_obj("location", word.strip(), object_number=element_counter, only_filled=True)
215 |                 if change1 or change2:
216 |                     element_counter += 1
217 | 
218 | 
219 | 
220 | 
221 |         origpost_red = origpost_red_new
222 |         # substitute in a separator char to integrate delimiters in the next step
223 |         origpost_red = regex.sub(r"(\d\.)", r"\1~~~~", origpost_red)
224 | 
225 |         # do further matches (semicolon-separated)
226 |         split_post.extend(regex.split(r";|~~~~|\su\.", origpost_red))
227 | 
228 |         for index, entry in enumerate(split_post):
229 |             if entry is None:
230 |                 continue
231 |             entry_stripped = entry.strip()
232 |             if entry_stripped == "":
233 |                 continue
234 | 
235 |             # additional parenthesis block
236 |             match_parenthesis = regex.search(r"\(.*\)", entry_stripped)
237 |             parenthesis = None
238 |             if match_parenthesis:
239 |                 parenthesis = match_parenthesis.group()
240 |                 entry_stripped = entry_stripped.replace(parenthesis, "")  # remove the vorwahl part from the entry
241 |                 self.ef.add_to_my_obj("vorwahl", parenthesis, object_number=element_counter, only_filled=True)
242 | 
243 | 
244 | 
245 |             match_word = regex.match(r"(?P<Tag>\D*)"
246 |                                      r"(?P<Numbers>[\d\s\W]*)"
247 |                                      , entry_stripped)
248 |             if match_word is not None:
249 |                 # fetch match results
250 |                 tag_match = match_word.group("Tag")
251 |                 numbers_match = match_word.group("Numbers")
252 |                 rest_from_entry_str = entry_stripped.replace(tag_match, "", 1)
253 |                 rest_from_entry_str = rest_from_entry_str.replace(numbers_match, "", 1)
254 | 
255 |                 tag = dh.strip_if_not_none(tag_match, "")
256 |                 match_tag = regex.match(r"(?P<rest_bef>.*)(?P<sanr>Sa\.?\-Nr\.?)(?P<rest_end>.*)", tag)
257 |                 location = ""
258 |                 if match_tag is not None:
259 |                     rest_tag = match_tag.group('rest_bef')
260 |                     rest_tag_2 = match_tag.group('rest_end')
261 |                     # sanr = match_tag.group('sanr')  # this is the filtered group
262 |                     location = dh.strip_if_not_none(rest_tag + " " + rest_tag_2, ":., ")
263 |                 else:
264 |                     # if there are no real descriptors in tag, then tag is usually the location (like Düsseldorf 1 36 62.)
265 |                     location = tag
266 | 
267 |                 if "und" in location:
268 |                     location = regex.sub(r"[^\w]und[^\w]", "", location)
269 | 
270 |                 number = dh.strip_if_not_none(numbers_match, "., ")
271 |                 self.ef.add_to_my_obj("number_Sa.-Nr.", number.strip(), object_number=element_counter, only_filled=True)
272 |                 self.ef.add_to_my_obj("location", location.strip(), object_number=element_counter, only_filled=True)
273 |                 additional_info_entry_level = dh.strip_if_not_none(rest_from_entry_str, ",. ")
274 |                 self.ef.add_to_my_obj("additional_info", additional_info_entry_level.strip(),
275 |                                       object_number=element_counter, only_filled=True)
276 |                 element_counter += 1
277 | 
278 |                 origpost_red = origpost_red.replace(number, "", 1)
279 |                 origpost_red = origpost_red.replace(location, "", 1)
280 | 
281 |         origpost_red = origpost_red.replace("Sa.-Nr", "").replace("~~~~", "")
282 |         origpost_red_end = dh.remove_multiple_outbound_chars(origpost_red)
283 | 
284 |         if len(origpost_red_end) > 3:
285 |             self.ef.add_to_my_obj("additional_info_unparsed", origpost_red_end.strip(), object_number=element_counter)
286 | 
287 |     def parse_vorstand(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
288 | 
289 |         # get basic data
290 |         element_counter = 0
291 |         origpost, origpost_red, element_counter, content_texts = \
292 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
293 | 
294 |         persons_final = cf.parse_persons(origpost_red, self.dictionary_handler,
295 |                                          self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
296 | 
297 |         only_add_if_filed = True
298 |         for entry in persons_final:
299 |             name, first_name, last_name, city, title, funct, rest_info = entry
300 |             self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled=only_add_if_filed)
301 |             self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter,
302 |                                   only_filled=only_add_if_filed)
303 |             self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter, only_filled=only_add_if_filed)
304 |             self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_filed)
305 |             self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_filed)
306 |             self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled=only_add_if_filed)
307 |             self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
308 |             element_counter += 1
309 | 
310 |         # (a former ';'-separated parsing fallback was removed here; person
311 |         #  parsing is done via cf.parse_persons above)
312 | 
313 |         return True
314 | 
315 |     def parse_aufsichtsrat(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
316 | 
317 |         # get basic data
318 |         element_counter = 0
319 |         origpost, origpost_red, element_counter, content_texts = \
320 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
321 | 
322 |         # try to fix +) problems
323 |         origpost_red = origpost_red.replace("; +)", "+);").replace(";+)", "+);").replace("')", "").replace("*)", "")
324 | 
325 |         persons_final = cf.parse_persons(origpost_red, self.dictionary_handler, self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
326 | 
327 |         only_add_if_filed = True
328 |         for entry in persons_final:
329 |             name, first_name, last_name, city, title, funct, rest_info = entry
330 |             self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled=only_add_if_filed)
331 |             self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter, only_filled=only_add_if_filed)
332 |             self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter, only_filled=only_add_if_filed)
333 |             self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_filed)
334 |             self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_filed)
335 |             self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled=only_add_if_filed)
336 |             self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
337 |             element_counter += 1
338 | 
339 | 
340 |         return True
341 | 
342 |     def parse_arbeitnehmervertreter(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
343 |         # get basic data
344 |         element_counter = 0
345 |         origpost, origpost_red, element_counter, content_texts = \
346 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
347 | 
348 |         persons_final = cf.parse_persons(origpost_red, self.dictionary_handler, self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
349 |         only_add_if_filed = True
350 |         for entry in persons_final:
351 |             name, first_name, last_name, city, title, funct, rest_info = entry
352 |             self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled=only_add_if_filed)
353 |             self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter, only_filled=only_add_if_filed)
354 |             self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter, only_filled=only_add_if_filed)
355 |             self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_filed)
356 |             self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_filed)
357 |             self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled=only_add_if_filed)
358 |             self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
359 | 
360 |             element_counter += 1
361 | 
362 |         return True
363 | 
364 |     # Gruendung
365 |     def parse_gruendung(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
366 |         # get basic data
367 |         element_counter = 0
368 |         origpost, origpost_red, element_counter, content_texts = \
369 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
370 |         # note: '^\d+' instead of '^\d*', so the else branch is reachable when no leading year is found
371 |         match_year = regex.search(r"^\d+", origpost_red.strip())
372 |         if match_year:
373 |             result = match_year.group()
374 |             origpost_red_new = origpost_red.replace(result, "", 1)
375 |             year = dh.strip_if_not_none(result, ".,() ")
376 |             rest_info = dh.strip_if_not_none(origpost_red_new, ".,() ")
377 |             self.ef.add_to_my_obj("rest_info", rest_info, object_number=element_counter, only_filled=True)
378 |             self.ef.add_to_my_obj("year", year, object_number=element_counter, only_filled=True)
379 |         else:
380 |             rest_info = dh.strip_if_not_none(origpost_red, ".,() ")
381 |             self.ef.add_to_my_obj("rest_info", rest_info, object_number=element_counter, only_filled=True)
382 | 
383 |     # Tätigkeitsgebiet
384 |     def parse_taetigkeitsgebiet(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
385 |         # get basic data
386 |         element_counter = 0
387 |         origpost, origpost_red, element_counter, content_texts = \
388 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
389 | 
390 |         final_items = cf.parse_general_and_keys(content_texts,
391 |                                                 join_separated_lines=False,
392 |                                                 current_key_initial_value="General_Info")
393 | 
394 |         for key in final_items.keys():
395 |             value = final_items[key]
396 |             if value is None or len(value) == 0:
397 |                 continue
398 |             self.ef.add_to_my_obj(key, value, object_number=element_counter, only_filled=True)
399 |             element_counter += 1
--------------------------------------------------------------------------------