├── configuration
│   ├── __init__.py
│   ├── config_parse_hocr_jk.conf
│   └── config_parse_hocr_js.conf
├── logs
│   └── var_occurences.json
├── docs
│   └── img
│       └── docxstruct_logo.png
├── .idea
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   ├── akf-hocrparser.iml
│   ├── workspace.xml
│   ├── codeStyles
│   │   └── Project.xml
│   └── dbnavigator.xml
├── experiments
│   ├── experiments_loop.py
│   ├── experiments_strip.py
│   ├── experiments_number_sizes.py
│   └── experiment_fuzzy_regex.py
├── .gitmodules
├── tests
│   ├── regex_fuzzy_search.py
│   └── strip_if_not_none.py
├── .gitignore
├── lib
│   ├── akf_known_uncategories.py
│   ├── akf_parsing_functions_tables_one.py
│   ├── dictionary_handler.py
│   ├── segment.py
│   ├── additional_info_handler.py
│   ├── snippet_ocr.py
│   ├── akf_parsing_functions_jk.py
│   ├── feature_extractor.py
│   ├── segment_parser.py
│   ├── segment_parser_endobject_factory.py
│   ├── data_helper.py
│   ├── segment_classifier.py
│   └── akf_parsing_functions_one.py
├── additionals
│   └── dictionaries
│       ├── dictionary_income.json
│       └── dictionary_balance.json
├── main_start.py
├── parser.py
├── LICENSE
└── README.md

/configuration/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/logs/var_occurences.json:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/docs/img/docxstruct_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UB-Mannheim/docxstruct/master/docs/img/docxstruct_logo.png
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
--------------------------------------------------------------------------------
/experiments/experiments_loop.py:
--------------------------------------------------------------------------------
1 | all_texts = ["asd", "fgh"]
2 | 
3 | for text in all_texts:
4 |     print("text:", text)
5 |     all_texts.append(text)  # appending while iterating: the list keeps growing, so this loop never terminates
6 | 
7 | print("done", all_texts)  # never reached, the loop above does not terminate
--------------------------------------------------------------------------------
/experiments/experiments_strip.py:
--------------------------------------------------------------------------------
1 | test_text = "This is a test.,.,, "
2 | print("test_text", test_text)
3 | stripped_text = test_text.strip("., ")
4 | print("stripped_text", stripped_text)
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "akf_corelib"]
2 | 	path = akf_corelib
3 | 	url = https://github.com/UB-Mannheim/akf-corelib.git
4 | [submodule "hocr_parser"]
5 | 	path = hocr_parser
6 | 	url = https://github.com/UB-Mannheim/hocr_parser.git
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
--------------------------------------------------------------------------------
/tests/regex_fuzzy_search.py:
--------------------------------------------------------------------------------
1 | # TODO: add tests and checks for the regex fuzzy-search implementation (including err_number correctness)
2 | 
3 | from akf_corelib.regex_util import RegexUtil as regu
4 | 
5 | 
6 | text = "my test text"
7 | 
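# A first sanity check that could grow into the missing tests; this is only a
# sketch, and it assumes regu.fuzzy_search(pattern, text, err_number) returns
# a (match, error_count) pair, exactly as it is unpacked below:
#
#   exact, errs_exact = regu.fuzzy_search(r"^my test", text, err_number=0)
#   assert exact is not None and errs_exact == 0
#
#   fuzzy, errs_fuzzy = regu.fuzzy_search(r"^my tast", text, err_number=1)
#   assert fuzzy is not None and errs_fuzzy == 1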
8 | match, errs = regu.fuzzy_search(r"", text, err_number=0)  # note: an empty pattern matches trivially; real assertions still to be added (see TODO above)
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
--------------------------------------------------------------------------------
/.idea/akf-hocrparser.iml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
9 | 
10 | 
--------------------------------------------------------------------------------
/experiments/experiments_number_sizes.py:
--------------------------------------------------------------------------------
1 | import sys
2 | 
3 | 
4 | def show_sizeof(x, level=0):
5 | 
6 |     print("\t" * level, x.__class__, sys.getsizeof(x), x)
7 | 
8 |     if hasattr(x, '__iter__'):
9 |         if hasattr(x, 'items'):
10 |             for xx in x.items():
11 |                 show_sizeof(xx, level + 1)
12 |         else:
13 |             for xx in x:
14 |                 show_sizeof(xx, level + 1)
15 | 
16 | 
17 | show_sizeof(None)
18 | show_sizeof(3)
19 | show_sizeof(2**63)
20 | show_sizeof(102947298469128649161972364837164)
21 | show_sizeof(918659326943756134897561304875610348756384756193485761304875613948576297485698417)
22 | 
23 | print("One variable test")
24 | gets_bigger = 3
25 | show_sizeof(gets_bigger)
26 | 
27 | gets_bigger += 102947298469128649161972364837164
28 | show_sizeof(gets_bigger)
29 | 
30 | 
31 | 
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
11 | 
12 | 
17 | 
18 | 
19 | 
20 | 
21 | 
22 | 1532015875915
23 | 
27 | 
28 | 
29 | 
30 | 
31 | 
32 | 
33 | 
--------------------------------------------------------------------------------
/tests/strip_if_not_none.py:
--------------------------------------------------------------------------------
1 | from lib.data_helper import DataHelper as dh
2 | 
3 | 
4 | # single trailing special char ("sc")
5 | test_text_1 = "this is my text)"
6 | test_result_1 = dh.strip_if_not_none(test_text_1, ")., ")
7 | test_result_1s = test_text_1.strip(")., ")
8 | test_result_1r = dh.remove_multiple_outbound_chars(test_text_1)
9 | 
10 | # multi trail sc
11 | test_text_2 = "this is my text)..."
12 | test_result_2 = dh.strip_if_not_none(test_text_2, ")., ")
13 | test_result_2s = test_text_2.strip(")., ")
14 | test_result_2r = dh.remove_multiple_outbound_chars(test_text_2)
15 | 
16 | 
17 | # single start sc multi trail sc
18 | test_text_3 = ")this is my text)..."
19 | test_result_3 = dh.strip_if_not_none(test_text_3, ")., ")
20 | test_result_3s = test_text_3.strip(")., ")
21 | test_result_3r = dh.remove_multiple_outbound_chars(test_text_3)
22 | 
23 | # multi start sc multi trail sc
24 | test_text_4 = ")....this is my text)..."
25 | test_result_4 = dh.strip_if_not_none(test_text_4, ")., ")
26 | test_result_4s = test_text_4.strip(")., ")
27 | test_result_4r = dh.remove_multiple_outbound_chars(test_text_4)
28 | 
29 | 
30 | # with spaces
31 | test_text_5 = ").. ..this is my text). .."
32 | test_result_5 = dh.strip_if_not_none(test_text_5, ")., ")
33 | test_result_5s = test_text_5.strip(")., ")
34 | test_result_5r = dh.remove_multiple_outbound_chars(test_text_5)
35 | 
36 | 
37 | # non-pattern break
38 | test_text_6 = ").(...this is my text).(.."
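# "(" is not part of the strip set ")., ", so all three variants below should
# stop at the first "(" they reach from either end; this case presumably
# guards against over-stripping past foreign characters.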
39 | test_result_6 = dh.strip_if_not_none(test_text_6, ")., ")
40 | test_result_6s = test_text_6.strip(")., ")
41 | test_result_6r = dh.remove_multiple_outbound_chars(test_text_6)
42 | 
43 | print("done")
44 | 
45 | 
46 | 
47 | # strip for comparison
48 | 
49 | test_strip = " u."
50 | test_strip_1 = test_strip.strip(". ")
51 | 
52 | 
53 | print("done2")
54 | 
55 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | output/
2 | output_save/
3 | laptopdata/
4 | AKFII_ocromore_results_local/
5 | 
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 | 
11 | # C extensions
12 | *.so
13 | 
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 | 
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 | 
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 | 
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | .hypothesis/
52 | .pytest_cache/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | 
63 | # Flask stuff:
64 | instance/
65 | .webassets-cache
66 | 
67 | # Scrapy stuff:
68 | .scrapy
69 | 
70 | # Sphinx documentation
71 | docs/_build/
72 | 
73 | # PyBuilder
74 | target/
75 | 
76 | # Jupyter Notebook
77 | .ipynb_checkpoints
78 | 
79 | # pyenv
80 | .python-version
81 | 
82 | # celery beat schedule file
83 | celerybeat-schedule
84 | 
85 | # SageMath parsed files
86 | *.sage.py
87 | 
88 | # Environments
89 | .env
90 | .venv
91 | env/
92 | venv/
93 | ENV/
94 | env.bak/
95 | venv.bak/
96 | 
97 | # Spyder project settings
98 | .spyderproject
99 | .spyproject
100 | 
101 | # Rope project settings
102 | .ropeproject
103 | 
104 | # mkdocs documentation
105 | /site
106 | 
107 | # mypy
108 | .mypy_cache/
--------------------------------------------------------------------------------
/experiments/experiment_fuzzy_regex.py:
--------------------------------------------------------------------------------
1 | import re
2 | import regex  # backwards-compatible with 're', but with additional functionality
3 | # https://pypi.org/project/regex/ ---> 'fuzzy'-matches
4 | from akf_corelib.regex_util import RegexUtil as regu
5 | 
6 | test_texts = [
7 |     "Fernschreiber:",
8 |     "Fernschreiber :",
9 |     "F3rnschreiber:",
10 |     "F3pnschreiber:",
11 |     "ernschreiber:",
12 |     "ernschr3iber:",
13 |     "Fernschreiber!",
14 |     "asdwevc!"
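    # the two entries above probe the limits: "Fernschreiber!" is one
    # substitution away and should still match with e<=1, while "asdwevc!"
    # should fail even with fuzzy matching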
15 | ]
16 | 
17 | 
18 | example = regex.fullmatch(r"(?:cats|cat){e<=1}", "cat").fuzzy_counts
19 | print("Example is:", example)
20 | 
21 | def regexfuzzy_search(pattern, text, err_number=2):
22 |     compiled_wrapper = regex.compile(r"(?:" + pattern + "){e<=" + str(err_number) + "}")
23 |     result = compiled_wrapper.search(text)
24 |     return result
25 | 
26 | 
27 | # costs of insert, delete, substitute can be defined {2i+2d+1s<=4}: each insertion costs 2 etc.
28 | def test_1():
29 |     for text in test_texts:
30 |         compiled = regex.compile(r"(?:^Fernschreiber\s?:){e<=1}")
31 |         match_stop = compiled.search(text)
32 |         if match_stop is not None:
33 |             (substs, inserts, deletions) = match_stop.fuzzy_counts
34 |             accumulated_errs = substs + inserts + deletions
35 | 
36 |             print("Text is:", text, "Match is True", "Errors:", (substs, inserts, deletions))
37 |         else:
38 |             print("Text is:", text, "Match is False", "Errors: higher than limit")
39 | 
40 | 
41 | # search with dynamic wrapper function (better looking regex)
42 | for text in test_texts:
43 |     match_stop = regexfuzzy_search("^Fernschreiber\s:", text)
44 |     if match_stop is not None:
45 |         (substs, inserts, deletions) = match_stop.fuzzy_counts
46 |         accumulated_errs = substs + inserts + deletions
47 | 
48 |         print("Text is:", text, "Match is True", "Errors:", (substs, inserts, deletions))
49 |     else:
50 |         print("Text is:", text, "Match is False", "Errors: higher than limit")
51 | 
52 | 
53 | 
54 | match_shorter_text, errs = regu.fuzzy_search("^Texte", "Text", err_number=2)
55 | #if match_shorter_text:
56 | #    result = match_shorter_text.text
57 | 
58 | # jk example
59 | match_shorter_text2, errs2 = regu.fuzzy_search("^rückstellungen$", "rücksstellungen", err_number=2)
60 | if match_shorter_text2:
61 |     result = match_shorter_text2.text
--------------------------------------------------------------------------------
/lib/akf_known_uncategories.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | class KnownUncategories(object):
4 |     """
5 |     List of known entries in test_data which are not categories,
6 |     but are recognized as such
7 |     """
8 | 
9 |     def __init__(self):
10 | 
11 |         # un-category regex strings (care for commas)
12 |         self.uc = [
13 |             "Beteiligung",  # 1956: is part of Beteiligungen
14 |             "Ferngespräche",  # 1956: is part of Fernruf/Telefon
15 |             "Kapital",  # 1956: is part of multiple top-level items
16 |             "Umstellung \d\d?",  # 1956: is part of Grundkapital or other
17 |             "Dividenden ab \d{4}.*",  # 1956: is part of Dividenden or other (with year or yearspan)
18 |             "^Kurs.*",  # 1956: second level tag
19 |             "ab \d{4}(\/\d{2})?"  # 1956: i.e. "ab 1949/50"-part of other categories
20 |         ]
21 | 
22 |         # non-specific keys (which do not get removed from original-rest in analysis)
23 |         self.nkeys = [
24 |             "street",
25 |             "street_number",
26 |             "additional_info",
27 |             "city",
28 |             "name",
29 |             "title",
30 |             "rest",
31 |             "location",
32 |             "number_Sa.-Nr.",
33 |             "rest_info",
34 |             "bank",
35 |             "title",  # note: duplicate of the entry above
36 |             "amount",
37 |             "ord_number",
38 |             "organization",
39 | 
40 |         ]
41 | 
42 |         # create corresponding regexes
43 |         self.uc_regex = []
44 |         for item in self.uc:
45 |             regex_compiled = re.compile(item)
46 |             self.uc_regex.append(regex_compiled)
47 | 
48 |     @property
49 |     def uncategories(self):
50 |         return self.uc
51 | 
52 |     @property
53 |     def unkeys(self):
54 |         return self.nkeys
55 | 
56 |     def check_uncategories(self, text_to_check):
57 |         """
58 |         Allows to compare a tag against the existing uncategories
59 |         :param text_to_check: tag text
60 |         :return: True if un-category, False if not
61 |         """
62 |         for regex_to_check in self.uc_regex:
63 |             match_result = regex_to_check.search(text_to_check)
64 |             if match_result is not None:
65 |                 return True
66 | 
67 |         return False
--------------------------------------------------------------------------------
/.idea/codeStyles/Project.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
11 | 
12 | 
13 | 
14 | 
15 | 
21 | 
22 | 
26 | 
27 | 
28 | 
29 | 
35 | 
36 | 
37 | 
38 | 
39 | 
45 | 
46 | 
50 | 
51 | 
52 | 
--------------------------------------------------------------------------------
/lib/akf_parsing_functions_tables_one.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from .data_helper import DataHelper as dh
4 | from .akf_parsing_functions_common import AKFCommonParsingFunctions as cf
5 | from akf_corelib.regex_util import RegexUtil as regu
6 | 
7 | import regex
8 | 
9 | 
10 | class AkfParsingFunctionsTablesOne(object):
11 | 
12 |     def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
13 |         config_handler = ConfigurationHandler(first_init=False)
14 | 
15 |         self.config = config_handler.get_config()
16 |         self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_TABLES_ONE, self.config.PRINT_EXCEPTION_LEVEL,
17 |                                     self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
18 | 
19 |         self.cpr.print("init akf parsing functions tables one")
20 | 
21 |         self.ef = endobject_factory
22 |         self.output_analyzer = output_analyzer
23 |         self.dictionary_handler = dictionary_handler
24 | 
25 | 
26 |     def parse_aktienkurse(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
27 |         # get basic data
28 |         element_counter = 0
29 |         origpost, origpost_red, element_counter, content_texts = \
30 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
31 | 
32 |         # logme
33 |         self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
34 | 
35 | 
36 |     def parse_dividenden(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
37 |         # get basic data
38 |         element_counter = 0
39 |         origpost, origpost_red, element_counter, content_texts = \
40 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
41 | 
42 |         # logme
43 |         self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
44 | 
45 | 
46 |     def parse_dividenden_auf_xyaktien(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
47 |         # get basic data
48 |         element_counter = 0
49 |         origpost, origpost_red, element_counter, content_texts = \
50 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
51 | 
52 |         # logme
53 |         self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
54 | 
--------------------------------------------------------------------------------
/lib/dictionary_handler.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from .data_helper import DataHelper as dh
4 | 
5 | import regex
6 | import json
7 | import os
8 | 
9 | 
10 | class DictionaryHandler(object):
11 | 
12 |     def __init__(self):
13 |         config_handler = ConfigurationHandler(first_init=False)
14 | 
15 |         self.config = config_handler.get_config()
16 |         self.cpr = ConditionalPrint(self.config.PRINT_DICTIONARY_HANDLER, self.config.PRINT_EXCEPTION_LEVEL,
17 |                                     self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
18 | 
19 |         self.cpr.print("init dictionary handler")
20 |         self.data_functs = None  # storage for json object
21 |         self.data_titles = None  # storage for json object
22 |         self.texts_functs = None
23 |         self.texts_titles = None
24 |         if self.config.USE_DICTIONARIES_FOR_PERSON_PARSING:
25 |             self.load_dictionaries()
26 |             # get the rows as sorted list of texts, longest first
27 |             if self.data_functs is not None:
28 |                 check_tf = self.sort_rows(self.get_rows(self.data_functs))
29 |                 self.texts_functs = check_tf
30 |             if self.data_titles is not None:
31 |                 check_tt = self.sort_rows(self.get_rows(self.data_titles))
32 |                 self.texts_titles = check_tt
33 | 
34 |     def diff_name_title(self, text_to_check):
35 | 
36 |         len_text_to_check = len(text_to_check)
37 |         name_found = text_to_check
38 |         title_found = ""
39 | 
40 |         for entry_index, entry in enumerate(self.texts_titles):
41 |             title, tlen = entry
42 |             # accelerate the process by skipping comparisons with longer texts
43 |             if tlen > len_text_to_check:
44 |                 continue
45 |             # compare the texts
46 |             if title in text_to_check:
47 |                 name_found = text_to_check.replace(title, "", 1).strip()
48 |                 title_found = title
49 |                 break
50 | 
51 | 
52 |         return name_found, title_found
53 | 
54 |     def load_dictionaries(self):
55 |         base_dict_path = self.get_dict_path()
56 | 
57 |         filepath_titles_dict = os.path.join(base_dict_path, "dict_titles.json")
58 |         filepath_functs_dict = os.path.join(base_dict_path, "dict_functs.json")
59 | 
60 |         # load titles
61 |         if os.path.exists(filepath_titles_dict):
62 |             with open(filepath_titles_dict) as f:
63 |                 self.data_titles = json.load(f)
64 |         else:
65 |             self.cpr.printex("dictionary dict_titles.json missing at specified path", filepath_titles_dict)
66 | 
67 |         # load functs
68 |         if os.path.exists(filepath_functs_dict):
69 |             with open(filepath_functs_dict) as f:
70 |                 self.data_functs = json.load(f)
71 |         else:
72 |             self.cpr.printex("dictionary dict_functs.json missing at specified path", filepath_functs_dict)
73 | 
74 | 
75 |     def get_rows(self, dict_data):
76 |         rows = dict_data['rows']
77 |         final_rows = []
78 |         for entry in rows:
79 |             text = entry[0]
80 |             final_rows.append((text, len(text)))
81 |         return final_rows
82 | 
83 |     def sort_rows(self, rows):
84 |         # itemgetter(1),
85 |         rows.sort(key=lambda t: len(t[0]), reverse=True)  # longest text first, so longer titles match before their substrings in diff_name_title
86 |         return rows
87 | 
88 |     def path(self):
89 |         return os.getcwd()
90 | 
91 |     def get_dict_path(self):
92 |         complete = os.path.join(self.path(), "additionals", "dictionaries")
93 |         return complete
--------------------------------------------------------------------------------
/lib/segment.py:
--------------------------------------------------------------------------------
1 | import abc
2 | 
3 | 
4 | 
5 | class Segment(object):
6 |     """
7 |     Root segment class for the classification segments;
8 |     child specialized segments are stored in the SegmentHolder
9 |     class.
10 |     """
11 |     __metaclass__ = abc.ABCMeta
12 | 
13 |     def __init__(self, segment_tag):
14 |         self.start_was_segmented = False
15 |         self.stop_was_segmented = False
16 |         self.start_error_number = 0
17 |         self.stop_error_number = 0
18 | 
19 |         self.enabled = True
20 |         self.only = False
21 |         self.start_line_index = -1
22 |         self.stop_line_index = -1
23 |         self.key_tag_cindex_start = -1  # character index of keytag: 'Vorstand: Name' ---> 0
24 |         self.key_tag_cindex_stop = -1  # character index of keytag: 'Vorstand: Name' ---> 9
25 |         self.restcontent_in_start_line = -1
26 |         self.segment_tag = segment_tag
27 |         self.snippet = None
28 |         self.info_handler = None
29 | 
30 |     def disable(self):
31 |         self.enabled = False
32 | 
33 |     def set_only(self):
34 |         self.only = True
35 | 
36 |     def set_start_error_number(self, start_error_number):
37 |         self.start_error_number = start_error_number
38 | 
39 |     def get_start_error_number(self):
40 |         return self.start_error_number
41 | 
42 |     def set_stop_error_number(self, stop_error_number):
43 |         self.stop_error_number = stop_error_number
44 | 
45 |     def get_stop_error_number(self):
46 |         return self.stop_error_number
47 | 
48 |     def get_start_line_index(self):
49 |         return self.start_line_index
50 | 
51 |     def get_stop_line_index(self):
52 |         return self.stop_line_index
53 | 
54 |     def get_segment_tag(self):
55 |         return self.segment_tag
56 | 
57 |     def do_match_work(self, start_or_stop, match, line_index, match_errors):
58 |         if start_or_stop is True:  # it's a start match
59 |             self.set_keytag_indices(match)  # this separates keytag from rest of line
60 |             self.start_line_index = line_index
61 |             self.start_was_segmented = True
62 |             self.set_start_error_number(match_errors)
63 |         else:
64 |             self.stop_line_index = line_index
65 |             self.stop_was_segmented = True
66 |             self.set_stop_error_number(match_errors)
67 | 
68 |     @abc.abstractmethod
69 |     def match_start_condition(self, line, line_text, line_index, features, num_lines, prev_line, combined_texts):
70 |         return  # 0 # return number 0 for indicating undefined, don't return this in overwritten conditions
71 | 
72 |     @abc.abstractmethod
73 |     def match_stop_condition(self, line, line_text, line_index, features, num_lines, prev_line, combined_texts):
74 |         # by default don't assign any stop condition, leave at initial value
75 |         # self.stop_line_index = self.start_line_index
76 |         return  # 0 # return number 0 for indicating undefined, don't return this in overwritten conditions
77 | 
78 |     def start_or_stop_segmented(self):
79 |         if self.start_was_segmented or self.stop_was_segmented:
80 |             return True
81 |         else:
82 |             return False
83 | 
84 |     def is_start_segmented(self):
85 |         return self.start_was_segmented
86 | 
87 |     def is_stop_segmented(self):
88 |         return self.stop_was_segmented
89 | 
90 |     def set_stop_segmented(self, stop_index):
91 |         self.stop_line_index = stop_index
92 |         self.stop_was_segmented = True
93 | 
94 |     def set_start_segmented(self, start_index):
95 |         self.start_line_index = start_index
96 |         self.start_was_segmented = True
97 | 
98 |     def set_keytag_indices(self, match):
99 |         """
100 |         From regex match set the keytag indices, takes 1st occurrence,
101 |         also checks if there is restcontent besides the match in the
102 |         line to check
103 |         :param match: regex match
104 |         :return:
105 |         """
106 |         start_m = match.regs[0][0]
107 |         stop_m = match.regs[0][1]
108 | 
109 |         self.key_tag_cindex_start = start_m
110 |         self.key_tag_cindex_stop = stop_m
111 |         len_match = stop_m - start_m
112 |         len_rest = len(match.string) - len_match
113 |         if len_rest > 0:
114 |             self.restcontent_in_start_line = len_rest
--------------------------------------------------------------------------------
/additionals/dictionaries/dictionary_income.json:
--------------------------------------------------------------------------------
1 | {
2 |   "Zusatz":
3 |   {
4 |     "darunter": "",
5 |     "Sonstiges": "",
6 |     "Sonst.": "",
7 |     "Langfristige": "",
8 |     "Langfr.": "",
9 |     "Durchlaufende": "",
10 |     "dauernde": ""
11 |   },
12 |   "Hauptpunkte":
13 |   {
14 |     "rechnungen Löhne und Gehälter": "Löhne und Gehälter",
15 |     "Löhne und Gehälter": "Löhne und Gehälter",
16 |     "Abschreibungen u. Werberichtigung": "Abschreibungen und Werberichtigung",
17 |     "Abschreibungen": "Abschreibungen",
18 |     "Jahresrohertrag": "Jahresrohertrag",
19 |     "Jahresertrag": "Jahresertrag",
20 |     "Steuern": "Steuern",
21 |     "Beteiligungserträge": "Beteiligungserträge",
22 |     "Gehälter": "Gehälter",
23 |     "Ausweispfl. Steuern": "Ausweispfl. Steuern",
24 |     "Personalaufwendungen": "Personalaufwendungen",
25 |     "Abschreibungen auf Anlagen": "Abschreibungen auf Anlagen",
26 |     "auf Anlagen": "Abschreibungen auf Anlagen",
27 |     "Steuern u. ähnl. Abgaben": "Steuern und ähnl. Abgaben",
28 |     "Steuern und ähnl. Abgaben": "Steuern und ähnl. Abgaben",
29 |     "Dividenden aus Beteiligungen": "Dividenden aus Beteiligungen",
30 |     "Zinsen u. Diskonterträge": "Zinsen u. Diskonterträge",
31 |     "Provisionen, Gebühren u.ähnl. Erträge": "Provisionen, Gebühren u.ähnl. Erträge",
32 |     "Zins und Diskonterträge": "Zins und Diskonterträge",
33 |     "Zinsaufwendungen": "Zinsaufwendungen",
34 |     "Zins und Provisionserträge": "Zins und Provisionserträge",
35 |     "Betriebsergebnisse": "Betriebsergebnisse",
36 |     "Verwaltungskosten": "Verwaltungskosten",
37 |     "Steuern und öffentliche Abgaben": "Steuern und öffentliche Abgaben",
38 |     "Zahlungen für Versicherungsfälle": "Zahlungen für Versicherungsfälle",
39 |     "Beitragseinnahmen": "Beitragseinnahmen",
40 |     "Erträge aus Beteiligungen": "Erträge aus Beteiligungen",
41 |     "Steuern u.öffentliche Abgaben": "Steuern und öffentliche Abgaben",
42 |     "Beteiligungsertrag": "Beteiligungsertrag",
43 |     "Rückversicherungsbeiträge": "Rückversicherungsbeiträge",
44 |     "Verschiedene Unkosten": "Verschiedene Unkosten",
45 |     "Allgemeine Unkosten": "Allgemeine Unkosten",
46 |     "Zinserträge": "Zinserträge",
47 |     "Ertragssteuern": "Ertragssteuern",
48 |     "Abschreibungen und Wertberichtigungen": "Abschreibungen und Wertberichtigungen",
49 |     "Wertberichtigung": "Wertberichtigung",
50 |     "Gebühren und ähnl. Erträge": "Gebühren und ähnl. Erträge",
51 |     "Zinsen und Diskonterträge": "Zinsen und Diskonterträge",
52 |     "Allg. Unkosten, Gehälter u. Steuern": "Allg. Unkosten, Gehälter u. Steuern",
53 |     "Provisionen und sonst. Erträge": "Provisionen und sonst. Erträge",
54 |     "Verschiedene Einnahmen": "Verschiedene Einnahmen",
55 |     "Verwaltungsunkosten": "Verwaltungsunkosten",
56 |     "Warenkonto": "Warenkonto",
57 |     "ähnl. einmalige Erträge": "ähnl. einmalige Erträge",
58 |     "Gehälter u. Pensionen": "Gehälter u. Pensionen",
59 |     "Zins u. Prov. Einn.": "Zins u. Prov. Einn.",
60 |     "Provisionen, Gebühren und ähnl.Erträge": "Provisionen, Gebühren und ähnl.Erträge",
61 |     "Anlagevermögen und anderes": "Anlagevermögen und anderes",
62 |     "Steuern und soziale Aufwendungen": "Steuern und soziale Aufwendungen",
63 |     "sehr.u.aufgenommene Darlehen": "sehr.u.aufgenommene Darlehen",
64 |     "Darlehensprovisionen etc.": "Darlehensprovisionen etc.",
65 |     "Erstattung der Liquidationskosten": "Erstattung der Liquidationskosten",
66 |     "Steuern und Umlagen": "Steuern und Umlagen",
67 |     "Provisionen u. sonstige Erträge": "Provisionen und sonst. Erträge",
68 |     "Zins u. ähnliche Erträge": "Zins u. ähnliche Erträge",
69 |     "Jahreseinnahmen": "Jahreseinnahmen",
70 |     "Gehälter und Pensionen": "Gehälter und Pensionen",
71 |     "Rohüberschuss": "Rohüberschuss",
72 |     "Miet und Pachterträge": "Miet und Pachterträge",
73 |     "Nettoverkaufserlös": "Nettoverkaufserlös",
74 |     "EEV-Steuern u. LAG": "EEV-Steuern u. LAG",
75 |     "Jahresüberschuß": "Jahresüberschuß",
76 |     "Jahresfehlbetrag": "Jahresfehlbetrag",
77 |     "Herstellungsaufwand": "Herstellungsaufwand",
78 |     "Sachaufwand": "Sachaufwand",
79 |     "Einmalige Aufwendungen": "Einmalige Aufwendungen",
80 |     "Gehälter, Löhne u. Sozialabgaben": "Gehälter, Löhne u. Sozialabgaben",
81 |     "Überschuß des Geschäftsjahres": "Überschuß des Geschäftsjahres",
82 |     "EEV-Steuern": "EEV-Steuern",
83 |     "Besitzsteuern": "Besitzsteuern",
84 |     "Gewinnabführung": "Gewinnabführung",
85 |     "Materialaufwand, Fremdleistung": "Materialaufwand und Fremdleistung",
86 |     "Materialaufwand": "Materialaufwand",
87 |     "Verkaufs und allgemeine Unkosten": "Verkaufs und allgemeine Unkosten",
88 |     "Vermögensfreigaben im Ausland": "Vermögensfreigaben im Ausland",
89 |     "Personalaufwand": "Personalaufwand",
90 |     "Betriebsgewinnanteil": "Betriebsgewinnanteil",
91 |     "Betriebsausgaben": "Betriebsausgaben",
92 |     "Leistungen für Versicherungsfälle": "Leistungen für Versicherungsfälle",
93 |     "Erträge des eigenen Verkehrsbetriebes": "Erträge des eigenen Verkehrsbetriebes",
94 |     "Produktions u. Verwaltungskosten": "Produktions u. Verwaltungskosten",
95 |     "Wertberichtigungen u. Rückstellungen": "Wertberichtigungen u. Rückstellungen",
96 |     "Personalaufwendungen ohne Sozialleistungen": "Personalaufwendungen ohne Sozialleistungen",
97 |     "Gebührenrohüberschuss": "Gebührenrohüberschuss"
98 |   },
99 |   "Unterpunkte": {}
100 | }
--------------------------------------------------------------------------------
/lib/additional_info_handler.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | 
4 | 
5 | import json
6 | import glob
7 | import pandas as pd
8 | from os import path
9 | 
10 | 
11 | 
12 | class AdditionalInfoHandler(object):
13 | 
14 |     def __init__(self):
15 |         config_handler = ConfigurationHandler(first_init=False)
16 | 
17 |         self.config = config_handler.get_config()
18 |         self.cpr = ConditionalPrint(self.config.PRINT_ADDITIONAL_INFO_HANDLER, self.config.PRINT_EXCEPTION_LEVEL,
19 |                                     self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
20 |         self.cpr.print("init additional info handler")
21 | 
22 | 
23 |     def write_excel_to_json(self, fileinfo, filepath, filetype, idxcol=None, parse_cols=None, page=0):
24 |         """
25 |         At the moment a little helper script for the Aktienführer project.
26 |         Feel free to modify as you wish.
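
        A rough usage sketch (illustrative only; it assumes a fileinfo object
        exposing .dbname, as used in the body below):

            handler = AdditionalInfoHandler()
            handler.write_excel_to_json(fileinfo, filepath="./add_info", filetype="xlsx")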
27 | """ 28 | #if isinstance(parse_cols, list): parse_cols = [parse_cols], 29 | additional_filepath = path.normpath(f"{filepath}/**/*{fileinfo.dbname}.{filetype}") 30 | file = glob.glob(additional_filepath,recursive=True) 31 | if len(file)!= 1: return None 32 | if filetype in ["xlsx","xls"]: 33 | df = pd.read_excel(file[0]).set_index("ProfileID") 34 | jsondata = {fileinfo.dbname:{"Year":fileinfo.dbname}} 35 | jsondf = df.to_dict(orient="index") 36 | jsondata.update(jsondf) 37 | with open(file[0].replace("xlsx","json"),"w") as output: 38 | json.dump(jsondata, output,indent=4) 39 | return None 40 | 41 | def fetch_additional_information_simple(self, file): 42 | """ 43 | Same as fetch additional information, but config related info is already included in given 44 | parameters 45 | :return: additional info 46 | """ 47 | if self.config.ADDITIONAL_INFORMATION: 48 | additional_info = self.fetch_additional_information(file, self.config.INPUT_ADDINFOPATH, 49 | idxcol= self.config.IDXCOL,parse_cols=self.config.PARSE_COLS, 50 | filetype =self.config.INPUT_ADDINFOFILETPYE) 51 | return additional_info 52 | 53 | return None 54 | 55 | def fetch_additional_information(self, fileinfo, filepath, filetype, idxcol=None, parse_cols=None, page=0): 56 | """ 57 | Reads an additional file with information 58 | It searches the file where the index_name matches tablename or dbname 59 | :param file: 60 | :param index_name: 61 | :return: additional info 62 | """ 63 | #if isinstance(parse_cols, list): parse_cols = [parse_cols] 64 | additional_filepath = path.normpath(f"{filepath}/**/*{fileinfo.dbname}.{filetype}") 65 | file = glob.glob(additional_filepath,recursive=True) 66 | 67 | len_files = len(file) 68 | if len_files > 1: 69 | self.cpr.printex("More than one additional information file was found!") 70 | return None 71 | if len_files == 0: 72 | self.cpr.printex("No additional information file was found!") 73 | return None 74 | 75 | file = file[0] 76 | current_db_and_table = {"db": fileinfo.dbname, "table": fileinfo.tablename} 77 | if filetype in ["xlsx","xls"]: 78 | infos = {} 79 | info_df = pd.read_excel(file)#.set_index("ProfileID") 80 | parse_cols.remove(idxcol) 81 | for db_and_table_id, current_db_and_tablename in current_db_and_table.items(): 82 | infos[db_and_table_id] = {} 83 | for line, rubric_content in info_df.loc[info_df[idxcol]==current_db_and_tablename][parse_cols].to_dict(orient="index").items(): 84 | for rubric, content in rubric_content.items(): 85 | if rubric != idxcol: 86 | if infos[db_and_table_id].get(rubric,None) is None: 87 | infos[db_and_table_id][rubric] = content 88 | elif infos[db_and_table_id].get(rubric,None) != content: 89 | if not isinstance(infos[db_and_table_id][rubric], list): infos[db_and_table_id][rubric] = [infos[db_and_table_id][rubric]] 90 | infos[db_and_table_id][rubric].append(content) 91 | elif filetype == "json": 92 | with open(file, "r") as add_info_file: 93 | infos = json.load(add_info_file) 94 | 95 | for possible_db_or_tablenames in reversed(list(infos.keys())): 96 | possible_db_or_tablenames_orig = possible_db_or_tablenames # unchanged name 97 | 98 | if self.config.ADD_INFO_SIMPLIFIED_NAME_COMPARISON: 99 | psplit = possible_db_or_tablenames.split("-") 100 | possible_db_or_tablenames = psplit[0] 101 | 102 | if possible_db_or_tablenames not in current_db_and_table['table']: 103 | del infos[possible_db_or_tablenames_orig] 104 | else: 105 | for db_and_table_id, current_db_and_tablename in current_db_and_table.items(): 106 | if possible_db_or_tablenames == 
current_db_and_tablename: 107 | infos[db_and_table_id] = infos[possible_db_or_tablenames_orig] 108 | del infos[possible_db_or_tablenames_orig] 109 | else: 110 | return None 111 | return infos -------------------------------------------------------------------------------- /lib/snippet_ocr.py: -------------------------------------------------------------------------------- 1 | from os import path, makedirs 2 | import numpy as np 3 | from PIL import Image 4 | from tesserocr import PyTessBaseAPI, RIL, iterate_level 5 | 6 | class Snippet(object): 7 | """ This library works with bbox on the original image - 8 | - Snip the bbox out of the image 9 | - OCR the snippet with tesseract gives text and bbox per word and confidences per char 10 | - Store the snippet """ 11 | 12 | def __init__(self): 13 | self.bbox = None 14 | self.imgpath = None 15 | self.imgname = None 16 | self.ftype = None 17 | self.fname = None 18 | self.img = None 19 | self.shape = None 20 | self.snippet = None 21 | self.result = None 22 | self.__ocr_settings = {"lang":"akf3","psm":6,"oem":3} 23 | 24 | 25 | def imread(self, imgpath): 26 | """Loads the image with PIL-Lib""" 27 | try: 28 | self.imgpath = imgpath 29 | self.imgname = path.basename(imgpath) 30 | self.ftype = self.imgname.split(".")[-1] 31 | if self.ftype.lower() not in ["jpg", "png", "bmp", "gif", "tiff"]: 32 | raise NameError 33 | self.img = Image.open(f"{self.imgpath}") 34 | self.snippet = self.img 35 | self.shape = list(self.img.tile[0][1]) #[:2]+self.img.tile[0][1][4:1:-1]) 36 | self.bbox = self.shape 37 | except IOError: 38 | print(f"cannot open {self.imgpath}") 39 | except NameError: 40 | print(f"The image filetype {self.ftype} is not supported!") 41 | return True 42 | 43 | def save(self, snippetpath:str): 44 | """Saves the snippet""" 45 | try: 46 | if self.imgname is None: 47 | raise NameError 48 | if not path.exists(snippetpath): 49 | makedirs(snippetpath) 50 | bboxstr = "_".join(str(bboxval) for bboxval in self.bbox) 51 | self.fname = snippetpath + self.imgname.split(".")[0] + "_bbox_" + bboxstr + "." + ".".join(self.imgname.split(".")[1:]) 52 | self.snippet.save(self.fname) 53 | except NameError: 54 | print("Please load an image first.") 55 | except Exception as E: 56 | print(f"{self.fname} could not be stored:{E}") 57 | return True 58 | 59 | def crop(self, bbox:list): 60 | """Snip the bboxarea out of the image""" 61 | try: 62 | if self.img is None: 63 | raise NameError 64 | if any(np.less(bbox[:2],self.shape[:2])) or any(np.greater(bbox[2:4],self.shape[2:4])): 65 | raise ValueError 66 | if not isinstance(bbox,list) or len(bbox) != 4: 67 | raise TypeError 68 | if bbox != self.bbox: 69 | self.bbox = bbox[:] 70 | self.snippet = self.img.crop(self.bbox) 71 | except TypeError: 72 | print("The bbox has not the right type or format.") 73 | except NameError: 74 | print("Please load an image first.") 75 | except ValueError as E: 76 | print(f"The bbox shape doesnt match the image shape. 
{E}") 77 | except Exception as E: 78 | print(E) 79 | else: 80 | return True 81 | return False 82 | 83 | @property 84 | def ocr_settings(self): 85 | return self.__ocr_settings 86 | 87 | @ocr_settings.setter 88 | def ocr_settings(self, lang=None,psm=None,oem=None): 89 | """Set the parameter from tesseracts""" 90 | if lang is not None: 91 | self.__ocr_settings["lang"] = lang 92 | if psm is not None: 93 | self.__ocr_settings["psm"] = psm 94 | if oem is not None: 95 | self.__ocr_settings["oem"] = oem 96 | return 97 | 98 | def to_text(self): 99 | """Performs tesseract on the snippet""" 100 | try: 101 | if self.bbox is None: 102 | raise ValueError 103 | with PyTessBaseAPI(**self.ocr_settings) as api: 104 | api.SetImage(self.snippet) 105 | api.Recognize() 106 | ri = api.GetIterator() 107 | conf = [] 108 | line = -1 109 | self.result=[] 110 | for r in iterate_level(ri, RIL.SYMBOL): 111 | if r.Empty(RIL.TEXTLINE):continue 112 | if r.IsAtBeginningOf(RIL.TEXTLINE): 113 | line += 1 114 | self.result.append({"text":"","words":[],"charconf":[],"bbox":[]}) 115 | self.result[line]["text"] = r.GetUTF8Text(RIL.TEXTLINE) 116 | #print(r.GetUTF8Text(RIL.TEXTLINE)) 117 | if r.IsAtFinalElement(RIL.WORD,RIL.SYMBOL): 118 | self.result[line]["words"].append(r.GetUTF8Text(RIL.WORD)) 119 | self.result[line]["bbox"].append(r.BoundingBoxInternal(RIL.WORD)) 120 | self.result[line]["charconf"].append(conf) 121 | conf = [] 122 | conf.append(r.Confidence(RIL.SYMBOL)) 123 | if conf: 124 | self.result[line]["charconf"].append(conf) 125 | except ValueError: 126 | print("Please first set the bbox value with snip_bbox.") 127 | return True 128 | 129 | @property 130 | def text(self): 131 | if self.result: 132 | text = "" 133 | for line in self.result: 134 | text += line["text"] 135 | return text 136 | else: 137 | return "" 138 | 139 | 140 | 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /configuration/config_parse_hocr_jk.conf: -------------------------------------------------------------------------------- 1 | 2 | INPUT_FILETYPES = [hocr, untype] 3 | #INPUT_FILEGLOB = ./AKFII_ocromore_results_local/msa_best/**/*. # local test folder 4 | # INPUT_FILEGLOB = /media/johannes/AKFII/AKF/AKFII_ocromore_results/msa_best/**/*. # this is the hocr-output of ocromore 5 | # INPUT_FILEGLOB = /media/sf_Transfer/testfiles_hocr/**/*. 6 | # INPUT_FILEGLOB = laptopdata/testfiles_hocr/**/*. 7 | INPUT_FILEGLOB = /media/sf_ShareVB/msa_best/all_years/**/*. 
# jk this is the hocr-output of ocromore 8 | 9 | USE_SNIPPET = True # Use the snippet tool for reocring snippets of the orig image 10 | IMAGE_PATH = /media/sf_ShareVB/ # Storing path 11 | DRAW_SEPARATOR = False # Save tablecutouts with separator drawn 12 | SAVE_SNIPPET= False # Use Toolbbox methods (you have to installed tesseract, tesserocr) 13 | IMGPATH = ./img/ # ./ -> relative to inputpath 14 | OPATH = ./img/snippets/ # ./ -> relative to inputpath 15 | 16 | INPUT_TABLE_DICTIONARY = ./additionals/dictionaries/ # Path to dictionaries 17 | USE_TABLE_DICTIONARY = True # Use to dictionaries to correct, split and find order level 18 | 19 | STORE_OCCURENCES = True # Storing occruencies of itemnames (tables) 20 | OCCURENCES_TABLETYPE = all # Tabletype to store [datatable_income,datatable_balance, all] 21 | 22 | 23 | [Additional informations settings] 24 | ADDITIONAL_INFORMATION = True 25 | INPUT_ADDINFOPATH = /media/sf_ShareVB/many_years_firmprofiles/additional_information/ #Additional information files 26 | #INPUT_ADDINFOPATH = /media/sf_Transfer/additional_information/ #Additional information files 27 | INPUT_ADDINFOFILETPYE = json 28 | IDXCOL = ProfileID # Column name which is matched with the tablenamen 29 | PARSE_COLS = [LABEL,ProfileID] # Columns which should be parsed to the add info 30 | 31 | TABLENAME_POS = 1 # in example '0585_...hocr' 32 | OCR_PROFILE_POS = 3 # in example: 'default' 33 | OCR_POS = 4 # in example: 'tess' 34 | DBPATH_POS = 2 # in example: '1969' 35 | 36 | 37 | OUTPUT_ROOT_PATH = ./output/ 38 | 39 | [Segmentation settings] 40 | ADD_INFO_SIMPLIFIED_NAME_COMPARISON = True # in the additional info handler, simplify the table name comparison 41 | REMATCH_START_CONDITION_UNTIL_ZERO_ERROR = True 42 | MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION = True # do the index matching until the next start tag, or,- if defined, to the next explicitly recognized stop tag, if False only Start Tags are set to Index field 43 | FILTER_UNCATEGORIES_OVERALL = True # filter the tags which are in known_uncategories in the accumulated segmenation report 44 | #todo add multimatch output in logging 45 | [Parsing settings] 46 | ADD_FULLTEXT_ENTRY = True # adds an entry at the start of json which contains the complete text to parse for verification 47 | ADD_ADDITIONAL_INFO = True # adds the additional information to the output file 48 | ADD_INFO_ENTRY_TO_OUTPUT = True # add entries to output, which contain general information about the parsed segment 49 | REMOVE_TAGS_IN_ORIG_DIFF = True # try to remove leading tags from rest in parsed output to original difference 50 | REMOVE_SPACES_IN_ORIGIN_DIFF = True # removes all spaces from rest and comparison values because spaces are often a problem in subtracting the rests 51 | USE_DICTIONARIES_FOR_PERSON_PARSING = True # uses dictionaries for function and title for the parsing and better recognition of persons 52 | 53 | 54 | [Analysis Settings] 55 | LOG_PARSED_SEGMENTED_OUTPUT = True # logs the parsed results in a file for each segmentation tag 56 | LOG_SIMPLE = False # Just simple and fast logging (without tablerecognition) 57 | LOG_PARSED_TO_ORIG_DIFF_PER_CATEGORY = True # logs the difference of parsed result and original segmented output for specific category 58 | LOG_PARSED_TO_ORIG_ADD_OUTPUT_JSON = False # in above logging add the output-json to the diff files 59 | LOG_PARSED_TO_ORIG_DIFF_ACCUMULATED = True # creates an accumulated report for differences from parsed to segmented output for each folder/akf-year 60 | LOG_SEGMENTED_TO_ORIG_DIFF_PER_FILE = 
True # (needs ADD_FULLTEXT_ENTRY enabled) logs the difference of segmented result and original segmented output for specific file/akf-table 61 | LOG_SEGMENTED_TO_ORIG_ADD_OUTPUT_JSON = True # in above logging add the output-json to the diff files 62 | LOG_SEGMENTED_TO_ORIG_DIFF_ACCUMULATED = True # creates an accumulated report for differences from segmented to original output for each folder/akf-year 63 | JOIN_SEGMENTED_TEXTS_IN_ORIG_DIFF_PER_CATEGORY = True # the segmented texts get joined by algorithm which removes dashes and so on 64 | 65 | 66 | [Print and logging settings] 67 | PRINT_WARNING_LEVEL = True # print warnings except activation in class print settings 68 | PRINT_EXCEPTION_LEVEL = True # print exceptions except activation in class print settings 69 | 70 | PRINT_MAIN = True 71 | PRINT_FEATURE_EXTRACTOR = False 72 | PRINT_ADDITIONAL_INFO_HANDLER = True 73 | PRINT_SEGMENT_CLASSIFIER = True 74 | PRINT_SEGMENT_PARSER = True 75 | PRINT_SEGMENT_PARSER_AKF_FN_ONE = False # print parsing functions related to AKF (File one) 76 | PRINT_SEGMENT_PARSER_AKF_FN_TWO = False # print parsing functions related to AKF (File two) 77 | PRINT_SEGMENT_PARSER_AKF_FN_THREE = True # print parsing functions related to AKF (File three) 78 | PRINT_SEGMENT_PARSER_AKF_FN_TABLES_ONE = True # print parsing functions related to AKF (Table specific one) 79 | PRINT_OUTPUT_ANALYSIS = False 80 | PRINT_DICTIONARY_HANDLER = True # print output related to dictionary handler -------------------------------------------------------------------------------- /configuration/config_parse_hocr_js.conf: -------------------------------------------------------------------------------- 1 | 2 | INPUT_FILETYPES = [hocr, untype] 3 | #INPUT_FILEGLOB = ./AKFII_ocromore_results_local/msa_best/**/*. # local test folder 4 | # INPUT_FILEGLOB = /media/johannes/AKFII/AKF/AKFII_ocromore_results/msa_best/**/*. # this is the hocr-output of ocromore 5 | INPUT_FILEGLOB = /media/sf_Transfer/AKFII_results/**/*. 6 | # INPUT_FILEGLOB = laptopdata/testfiles_hocr/**/*. 7 | # INPUT_FILEGLOB = /media/sf_ShareVB/many_years_firmprofiles_output/AKFII/long/**/*. 
# jk this is the hocr-output of ocromore 8 | 9 | USE_SNIPPET = True # Use the snippet tool for reocring snippets of the orig image 10 | IMAGE_PATH = /media/sf_ShareVB/ # Storing path 11 | DRAW_SEPARATOR = False # Save tablecutouts with separator drawn 12 | SAVE_SNIPPET= False # Use Toolbbox methods (you have to installed tesseract, tesserocr) 13 | IMGPATH = ./img/ # ./ -> relative to inputpath 14 | OPATH = ./img/snippets/ # ./ -> relative to inputpath 15 | 16 | INPUT_TABLE_DICTIONARY = ./additionals/dictionaries/ # Path to dictionaries 17 | USE_TABLE_DICTIONARY = True # Use to dictionaries to correct, split and find order level 18 | 19 | STORE_OCCURENCES = False # Storing occruencies of itemnames (tables) 20 | OCCURENCES_TABLETYPE = datatable_income # Tabletype to store [datatable_income,datatable_balance] 21 | 22 | [Additional informations settings] 23 | ADDITIONAL_INFORMATION = True 24 | # INPUT_ADDINFOPATH = /media/sf_ShareVB/many_years_firmprofiles/additional_information/ #Additional information files 25 | INPUT_ADDINFOPATH = /media/sf_Transfer/additional_information/ #Additional information files 26 | INPUT_ADDINFOFILETPYE = json 27 | IDXCOL = ProfileID # Column name which is matched with the tablenamen 28 | PARSE_COLS = [LABEL,ProfileID] # Columns which should be parsed to the add info 29 | 30 | TABLENAME_POS = 1 # in example '0585_...hocr' 31 | OCR_PROFILE_POS = 3 # in example: 'default' 32 | OCR_POS = 4 # in example: 'tess' 33 | DBPATH_POS = 2 # in example: '1969' 34 | 35 | 36 | OUTPUT_ROOT_PATH = ./output/ 37 | 38 | [Segmentation settings] 39 | ADD_INFO_SIMPLIFIED_NAME_COMPARISON = True # in the additional info handler, simplify the table name comparison 40 | REMATCH_START_CONDITION_UNTIL_ZERO_ERROR = True 41 | MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION = True # do the index matching until the next start tag, or,- if defined, to the next explicitly recognized stop tag, if False only Start Tags are set to Index field 42 | FILTER_UNCATEGORIES_OVERALL = True # filter the tags which are in known_uncategories in the accumulated segmenation report 43 | #todo add multimatch output in logging 44 | [Parsing settings] 45 | ADD_FULLTEXT_ENTRY = True # adds an entry at the start of json which contains the complete text to parse for verification 46 | ADD_ADDITIONAL_INFO = True # adds the additional information to the output file 47 | ADD_INFO_ENTRY_TO_OUTPUT = True # add entries to output, which contain general information about the parsed segment 48 | REMOVE_TAGS_IN_ORIG_DIFF = True # try to remove leading tags from rest in parsed output to original difference 49 | REMOVE_SPACES_IN_ORIGIN_DIFF = True # removes all spaces from rest and comparison values because spaces are often a problem in subtracting the rests 50 | USE_DICTIONARIES_FOR_PERSON_PARSING = True # uses dictionaries for function and title for the parsing and better recognition of persons 51 | 52 | 53 | [Analysis Settings] 54 | LOG_PARSED_SEGMENTED_OUTPUT = True # logs the parsed results in a file for each segmentation tag 55 | LOG_SIMPLE = True # Just simple and fast logging (without tablerecognition) 56 | LOG_PARSED_TO_ORIG_DIFF_PER_CATEGORY = True # logs the difference of parsed result and original segmented output for specific category 57 | LOG_PARSED_TO_ORIG_ADD_OUTPUT_JSON = False # in above logging add the output-json to the diff files 58 | LOG_PARSED_TO_ORIG_DIFF_ACCUMULATED = True # creates an accumulated report for differences from parsed to segmented output for each folder/akf-year 59 | 
LOG_SEGMENTED_TO_ORIG_DIFF_PER_FILE = True # (needs ADD_FULLTEXT_ENTRY enabled) logs the difference of segmented result and original segmented output for specific file/akf-table 60 | LOG_SEGMENTED_TO_ORIG_ADD_OUTPUT_JSON = True # in above logging add the output-json to the diff files 61 | LOG_SEGMENTED_TO_ORIG_DIFF_ACCUMULATED = True # creates an accumulated report for differences from segmented to original output for each folder/akf-year 62 | JOIN_SEGMENTED_TEXTS_IN_ORIG_DIFF_PER_CATEGORY = True # the segmented texts get joined by algorithm which removes dashes and so on 63 | 64 | [Print and logging settings] 65 | PRINT_WARNING_LEVEL = True # print warnings except activation in class print settings 66 | PRINT_EXCEPTION_LEVEL = True # print exceptions except activation in class print settings 67 | 68 | PRINT_MAIN = True 69 | PRINT_FEATURE_EXTRACTOR = False 70 | PRINT_ADDITIONAL_INFO_HANDLER = True 71 | PRINT_SEGMENT_CLASSIFIER = True 72 | PRINT_SEGMENT_PARSER = True 73 | PRINT_SEGMENT_PARSER_AKF_FN_ONE = False # print parsing functions related to AKF (File one) 74 | PRINT_SEGMENT_PARSER_AKF_FN_TWO = False # print parsing functions related to AKF (File two) 75 | PRINT_SEGMENT_PARSER_AKF_FN_THREE = True # print parsing functions related to AKF (File three) 76 | PRINT_SEGMENT_PARSER_AKF_FN_TABLES_ONE = True # print parsing functions related to AKF (Table specific one) 77 | PRINT_OUTPUT_ANALYSIS = False 78 | PRINT_DICTIONARY_HANDLER = True # print output related to dictionary handler -------------------------------------------------------------------------------- /additionals/dictionaries/dictionary_balance.json: -------------------------------------------------------------------------------- 1 | { 2 | "Zusatz": 3 | { 4 | "Aktiva":"", 5 | "Passiva": "", 6 | "darunter": "", 7 | "Sonstige": "", 8 | "Sonstiges": "", 9 | "Sonst.": "", 10 | "Langfristige": "", 11 | "Langfr." 
: "", 12 | "Kurzfristige":"", 13 | "Durchlaufende": "", 14 | "dauernde": "" 15 | }, 16 | "Hauptpunkte":{ 17 | "Eigenkapital": "Eigenkapital", 18 | "Fremdkapital": "Fremdkapital", 19 | "Gewinn nach Vortrag": "Gewinn nach Vortrag", 20 | "Anlagevermögen": "Anlagevermögen", 21 | "Umlaufvermögen": "Umlaufvermögen", 22 | "Verlust ohne Vortrag": "Verlust ohne Vortrag", 23 | "Verlust nach Vortrag": "Verlust nach Vortrag", 24 | "Passiva Einlagen": "Passiva Einlagen", 25 | "Aufgenommene Gelder": "Aufgenommene Gelder", 26 | "Barreserve":"Barreserve", 27 | "Nostroguthaben":"Nostroguthaben", 28 | "Betriebserträge":"Betriebserträge", 29 | "Uraltguthaben": "Uraltguthaben", 30 | "Wertpapiere": "Wertpapiere", 31 | "Konsortialbeteiligungen": "Konsortialbeteiligungen", 32 | "Debitoren": "Debitoren", 33 | "Deckungsforderungen": "Deckungsforderungen", 34 | "Sonstige Aktiva": "Sonstige Aktiva", 35 | "Beteiligungen": "Beteiligungen", 36 | "Ausgleichsforderungen": "Ausgleichsforderungen", 37 | "Ausleihungen": "Ausleihungen", 38 | "Schuldverschreibungen": "Schuldverschreibungen", 39 | "Zinsen hierauf": "Zinsen hierauf", 40 | "Ausstehende Einlagen auf A.-K.": "Ausstehende Einlagen auf A.-K.", 41 | "Rückstellungen": "Rückstellungen", 42 | "Schuldner": "Schuldner", 43 | "Anzahlungen": "Anzahlungen", 44 | "Anlagen": "Anlagen", 45 | "Vorräte": "Vorräte", 46 | "Gläubiger": "Gläubiger", 47 | "Verbindlichkeiten": "Verbindlichkeiten", 48 | "Forderungen": "Forderungen", 49 | "Gewinn ohne Vortrag": "Gewinn ohne Vortrag", 50 | "Kapitalentwertungskonto": "Kapitalentwertungskonto", 51 | "Vermögen": "Vermögen", 52 | "Konzernunternehmen": "Konzernunternehmen", 53 | "Einlagen": "Einlagen", 54 | "Eigene Akzepte und Solawechsel": "Eigene Akzepte und Solawechsel", 55 | "Kapitalausgleichskonto": "Kapitalausgleichskonto", 56 | "Löhne und Gehälter": "Löhne und Gehälter", 57 | "Abschreibungen": "Abschreibungen", 58 | "Steuern": "Steuern", 59 | "Jahresertrag": "Jahresertrag", 60 | "Beteiligungserträge": "Beteiligungserträge", 61 | "Abwicklungsvermögen": "Abwicklungsvermögen", 62 | "Zinsen": "Zinsen", 63 | "Schuldverschreibungen im Umlauf": "Schuldverschreibungen im Umlauf", 64 | "Zinsen v. Ausleihungen": "Zinsen v. Ausleihungen", 65 | "Kapitalverlustkonto gemn. DMBG": "Kapitalverlustkonto gemn. DMBG", 66 | "ao. Kapitalentwertungskonto": "Kapitalentwertungsskonto", 67 | "Pensions-Rückstellungen": "Pensions-Rückstellungen", 68 | "Anleihen im Umlauf": "Anleihen im Umlauf", 69 | "Grundstücke und Gebäude": "Grundstücke und Gebäude", 70 | "Hypotheken u. Kommunaldarlehen": "Hypotheken u. Kommunaldarlehen", 71 | "Aufgenommene Darlehen": "Aufgenommene Darlehen", 72 | "Anlagewerte": "Anlagewerte", 73 | "Rechnungsabgrenzung": "Rechnungsabgrenzung", 74 | "Liquidationskapital": "Liquidationskapital", 75 | "Ao.Kap.-Entw.Konto": "Ao.Kap.-Entw.Konto", 76 | "Hypotheken und Darlehen": "Hypotheken und Darlehen", 77 | "Kommunaldarlehen": "Kommunaldarlehen", 78 | "Aufgenommene langfr.Darlehen": "Aufgenommen langfr. Darlehen", 79 | "Grundkapital u. ges.Rücklage": "Grundkapital u. ges. 
Rücklage", 80 | "Bilanzsumme": "Bilanzsumme", 81 | "Guthaben bei Kreditinstituten": "Guthaben bei Kreditinstituten", 82 | "Kredite": "Kredite", 83 | "langfristige Darlehen u.Anleihen": "langfristige Darlehen u.Anleihen", 84 | "lagen auf das Grundkapital": "Ausstehende Einlagen auf das Grundkapital", 85 | "Abschreibungen auf Anlagen":"Abschreibungen auf Anlagen", 86 | "Technische Rückstellungen":"Technische Rückstellungen", 87 | "Allgemeine Rückstellungen":"Allgemeine Rückstellungen", 88 | "Kapital": "Kapital", 89 | "Rücklagen": "Rücklagen", 90 | "Reingewinn": "Reingewinn", 91 | "Eigenmittel": "Eigenmittel", 92 | "Bilanzgewinn": "Bilanzgewinn", 93 | "Hauptpunkte":"Hauptpunkte", 94 | "Restliche Passiva":"Restliche Passiva", 95 | "Spareinlagen":"Spareinlagen", 96 | "Bilanzverlust":"Bilanzverlust", 97 | "Forderungen an Kreditinstitute":"Forderungen an Kreditinstitute", 98 | "Wertberichtigungen":"Wertberichtigungen", 99 | "Ausstehende Einlagen auf das Grundkapital":"Ausstehende Einlagen auf das Grundkapital", 100 | "Rückstellung für LAG-Vermögensabgabe": "Rückstellungen", 101 | "Rückstellung für Pensionsverpfl.": "Rückstellungen", 102 | "Gewinn einschl. Vortrag": "Gewinn nach Vortrag", 103 | "Entwertungskonto": "Entwertungskonto", 104 | "Kapitalverlustkonto": "Kapitalverlustkonto", 105 | "Bankguthaben": "Bankguthaben", 106 | "Vermögensunterdeckung":"Vermögensunterdeckung", 107 | "Versicherungstechnische Rückstellungen":"Versicherungstechnische Rückstellungen", 108 | "Wertpapieranlagen": "Wertpapieranlagen", 109 | "Nichtversicherungstechnische Rückstellungen":"Nichtversicherungstechnische Rückstellungen", 110 | "Grunkapital":"Grundkapital", 111 | "Einbehaltene Gewinne":"Einbehaltene Gewinne", 112 | "ohne Vortrag": "ohne Vortrag" 113 | }, 114 | "Unterpunkte":{ 115 | "davon A.-K.": "davon A.-K.", 116 | "davon AK": "davon A.-K.", 117 | "Vorräte": "Vorräte", 118 | "Lieferforderungen": "Lieferforderungen", 119 | "Barmittel": "Barmittel", 120 | "Barmittel einschl. 
Wertpapiere": "Barmittel", 121 | "Flüssige Mittel": "Flüssige Mittel", 122 | "Beteiligungen": "Beteiligungen", 123 | "Grundkapital": "Grundkapital", 124 | "Aktien und Beteiligungen": "Aktien und Beteiligungen", 125 | "Betriebsstoffe und Waren": "Betriebsstoffe und Waren", 126 | "Forderungen aus Mieten": "Forderungen aus Mieten" 127 | } 128 | } -------------------------------------------------------------------------------- /main_start.py: -------------------------------------------------------------------------------- 1 | # custom imports 2 | from akf_corelib.configuration_handler import ConfigurationHandler 3 | from akf_corelib.database_handler import DatabaseHandler 4 | from akf_corelib.conditional_print import ConditionalPrint 5 | from lib.dictionary_handler import DictionaryHandler 6 | from lib.feature_extractor import FeatureExtractor 7 | from lib.segment_classifier import SegmentClassifier 8 | from lib.segment_parser import SegmentParser 9 | from lib.output_analysis import OutputAnalysis 10 | from lib.additional_info_handler import AdditionalInfoHandler 11 | 12 | # load configuration and printer 13 | CODED_CONFIGURATION_PATH = './configuration/config_parse_hocr_js.conf' 14 | config_handler = ConfigurationHandler(first_init=True, fill_unkown_args=True, \ 15 | coded_configuration_paths=[CODED_CONFIGURATION_PATH]) 16 | config = config_handler.get_config() 17 | cpr = ConditionalPrint(config.PRINT_MAIN, config.PRINT_EXCEPTION_LEVEL, 18 | config.PRINT_WARNING_LEVEL, leading_tag="main_start") 19 | 20 | # Basic steps: 21 | feature_extractor = FeatureExtractor() 22 | add_info_handler = AdditionalInfoHandler() 23 | dictionary_handler = DictionaryHandler() 24 | segment_classifier = SegmentClassifier() 25 | output_analyzer = OutputAnalysis() 26 | segment_parser = SegmentParser(output_analyzer, dictionary_handler) 27 | 28 | 29 | dh = DatabaseHandler(dbdir="") 30 | dh.set_dirpos(tablename_pos=config.TABLENAME_POS,ocr_profile_pos=config.OCR_PROFILE_POS,\ 31 | ocr_pos=config.OCR_POS, dbname_pos=config.DBPATH_POS) 32 | 33 | dh.fetch_files(config.INPUT_FILEGLOB, config.INPUT_FILETYPES) 34 | # get files-list 35 | hocr_files = dh.get_files() 36 | 37 | accumulated_tags = {} 38 | 39 | # main iteration loop 40 | for key in hocr_files: 41 | #if "1956" not in key: 42 | # continue 43 | int_key = int(key) 44 | if int_key < 1973 or int_key > 1973: # start from 1971 45 | continue 46 | 47 | accumulated_diff_info = output_analyzer.AccumulatedInfo() 48 | accumulated_diff_info_categories = {} 49 | accumulated_diff_info_orig_to_segment = {} 50 | 51 | ocromore_data = None 52 | ctr_test = 1 53 | 54 | my_list = hocr_files[key] 55 | for file in my_list: 56 | #if "msa_best" not in file.ocr_profile: 57 | # continue 58 | 59 | # only check files which are relevant (comment out if not used) 60 | # Sitz ok: 72, 207,671, 731, 733 61 | # Sitz faulty: 270,454 62 | if ctr_test not in [151]: 63 | ctr_test += 1 64 | continue 65 | 66 | #split = file.name.split("_") 67 | #if int(split[1]) < 1968: 68 | # continue 69 | #if int(split[0])<300: 70 | # continue 71 | #if not "_1956" in file.name: 72 | # continue 73 | # fetch additional information for current file (if toggled in info) 74 | additional_info = add_info_handler.fetch_additional_information_simple(file) 75 | 76 | # fetch basic data for current file 77 | ocromore_data = dh.fetch_ocromore_data(file, additional_info=additional_info) 78 | output_analyzer.set_current_data(ocromore_data) # prepare output analyzer 79 | 80 | cpr.print("Checking file:", ocromore_data['file_info'].path) 81 
82 |         # extract features from basic data
83 |         ocromore_data = feature_extractor.extract_file_features(ocromore_data)
84 |         # line segmentation
85 |         ocromore_data = segment_classifier.classify_file_segments(ocromore_data)
86 |         # segment parsing
87 |         ocromore_data = segment_parser.parse_segments(ocromore_data)
88 |         # output file synthesis
89 |         segment_parser.write_result_to_output(True, ocromore_data)
90 |         # todo
91 |         # output analysis steps
92 |         output_analyzer.log_segmentation_simple(ocromore_data)  # log the recognized segmentation
93 |         output_analyzer.log_parsed_output(ocromore_data)  # log the parsed segments into tag-based files
94 |         diff_info_orig_to_segment = output_analyzer.log_original_to_segment_diff(ocromore_data, use_delimiters=False)  # log the difference of segmented data to original data
95 |         diff_info_categories = output_analyzer.log_segmentation_diff_orig_to_parsed_output(ocromore_data)  # log the segmentation
96 |         diff_info = output_analyzer.log_unsegmentated(ocromore_data)
97 |         accumulated_diff_info_categories = \
98 |             output_analyzer.accumulate_diff_info_output_to_orig(diff_info_categories, accumulated_diff_info_categories)
99 |         accumulated_diff_info_orig_to_segment = \
100 |             output_analyzer.accumulate_diff_info_orig_to_segmentation(diff_info_orig_to_segment, accumulated_diff_info_orig_to_segment)
101 |
102 |         accumulated_diff_info = output_analyzer.accumulate_diff_info(ocromore_data, diff_info, accumulated_diff_info)
103 |         accumulated_tags = output_analyzer.log_tags(ocromore_data, accumulated_tags)
104 |         ctr_test += 1
105 |
106 |         if ctr_test >= 30:
107 |             break
108 |
109 |         # clear the current result in segment_parser cache to parse the next one
110 |         segment_parser.clear_result(output_analyzer, dictionary_handler)
111 |
112 |     # output analysis:
113 |     # print diff info for this year (accumulated over all tables/year)
114 |     output_analyzer.log_accumulated_unsegmentated(accumulated_diff_info, ocromore_data)
115 |     # print the amount of chars which is left for each category after parsing for this year
116 |     output_analyzer.log_accumulated_orig_to_parsed_output(accumulated_diff_info_categories, ocromore_data)
117 |     # print diff info for this year between original and segmentation
118 |     output_analyzer.log_accumulated_orig_to_segment(accumulated_diff_info_orig_to_segment, ocromore_data)
119 |
120 |
121 | output_analyzer.log_accumulated_tags(accumulated_tags)
122 |
--------------------------------------------------------------------------------
/parser.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Rafa Haro '
2 |
3 | from abc import ABCMeta, abstractmethod
4 | from bs4 import BeautifulSoup
5 | import re
6 |
7 |
8 | class HOCRElement(metaclass=ABCMeta):
9 |
10 |     # abstract base class; subclasses must override ocr_text
11 |
12 |     COORDINATES_PATTERN = re.compile(r"bbox\s(-?[0-9]+)\s(-?[0-9]+)\s(-?[0-9]+)\s(-?[0-9]+)")
13 |
14 |     def __init__(self, hocr_html, parent, next_tag, next_attribute, next_class):
15 |         self.__coordinates = (0, 0, 0, 0)
16 |         self._hocr_html = hocr_html
17 |         self._id = None
18 |         self._parent = parent
19 |         self._elements = self._parse(next_tag, next_attribute, next_class)
20 |
21 |     def _parse(self, next_tag, next_attribute, next_class):
22 |
23 |         try:
24 |             self._id = self._hocr_html['id']
25 |         except KeyError:
26 |             self._id = None
27 |
28 |         try:
29 |             title = self._hocr_html['title']
30 |             match = HOCRElement.COORDINATES_PATTERN.search(title)
31 |             if match:
32 |                 self.__coordinates = (int(match.group(1)),
33 |                                       int(match.group(2)),
34 |                                       int(match.group(3)),
35 |                                       int(match.group(4)))
36 |             else:
37 |                 raise ValueError("The HOCR element doesn't contain a valid title property")
38 |         except KeyError:
39 |             self.__coordinates = (0, 0, 0, 0)
40 |
41 |         elements = []
42 |         if next_tag is not None and next_class is not None:
43 |             for html_element in self._hocr_html.find_all(next_tag, {'class': next_attribute}):
44 |                 elements.append(next_class(self, html_element))
45 |         return elements
46 |
47 |     @property
48 |     def coordinates(self):
49 |         return self.__coordinates
50 |
51 |     @property
52 |     def html(self):
53 |         return self._hocr_html.prettify()
54 |
55 |     @property
56 |     def id(self):
57 |         return self._id
58 |
59 |     @property
60 |     def parent(self):
61 |         return self._parent
62 |
63 |     def __hash__(self):
64 |         return hash(self._id)
65 |
66 |     def __eq__(self, other):
67 |         if not isinstance(other, HOCRElement):
68 |             return False
69 |         else:
70 |             return self._id == other._id
71 |
72 |     @property
73 |     @abstractmethod
74 |     def ocr_text(self):
75 |         pass
76 |
77 | class HOCRDocument(HOCRElement):
78 |
79 |     def __init__(self, source, is_path=False):
80 |
81 |         if not is_path:
82 |             hocr_html = BeautifulSoup(source, 'html.parser')
83 |         else:
84 |             hocr_html = BeautifulSoup(open(source, 'r', encoding="utf-8").read(), 'html.parser')
85 |
86 |         super(HOCRDocument, self).__init__(hocr_html, None, 'div', Page.HOCR_PAGE_TAG, Page)
87 |
88 |     @property
89 |     def ocr_text(self):
90 |         output = ""
91 |         for element in self._elements[:-1]:
92 |             output += element.ocr_text
93 |             output += "\n\n"
94 |         output += self._elements[-1].ocr_text
95 |         return output
96 |
97 |     @property
98 |     def pages(self):
99 |         return self._elements
100 |
101 |     @property
102 |     def npages(self):
103 |         return len(self._elements)
104 |
105 |     @property
106 |     def ocr(self):
107 |         for tag in self._hocr_html.find_all("meta"):
108 |             if "esseract" in tag.get("content", ""):  # "" default avoids 'in None' errors for meta tags without content
109 |                 return "Tess"
110 |             if "cropy" in tag.get("content", ""):
111 |                 return "Ocro"
112 |             if "ABBYY" in tag.get("content", ""):
113 |                 return "Abbyy"
114 |         return "Abbyy"
115 |
116 | class Page(HOCRElement):
117 |
118 |     HOCR_PAGE_TAG = "ocr_page"
119 |
120 |     def __init__(self, parent, hocr_html):
121 |         super(Page, self).__init__(hocr_html, parent, 'div', Area.HOCR_AREA_TAG, Area)
122 |
123 |     @property
124 |     def ocr_text(self):
125 |         output = ""
126 |         for element in self._elements[:-1]:
127 |             output += element.ocr_text
128 |             output += "\n\n"
129 |         output += self._elements[-1].ocr_text
130 |         return output
131 |
132 |     @property
133 |     def areas(self):
134 |         return self._elements
135 |
136 |     @property
137 |     def nareas(self):
138 |         return len(self._elements)
139 |
140 | class Area(HOCRElement):
141 |
142 |     HOCR_AREA_TAG = "ocr_carea"
143 |
144 |     def __init__(self, parent, hocr_html):
145 |         super(Area, self).__init__(hocr_html, parent, 'p', Paragraph.HOCR_PAR_TAG, Paragraph)
146 |
147 |     @property
148 |     def paragraphs(self):
149 |         return self._elements
150 |
151 |     @property
152 |     def nparagraphs(self):
153 |         return len(self._elements)
154 |
155 |     @property
156 |     def ocr_text(self):
157 |         output = ""
158 |         for element in self._elements[:-1]:
159 |             output += element.ocr_text
160 |             output += "\n"
161 |         output += self._elements[-1].ocr_text
162 |         return output
163 |
164 | class Paragraph(HOCRElement):
165 |
166 |     HOCR_PAR_TAG = "ocr_par"
167 |
168 |     def __init__(self, parent, hocr_html):
169 |         super(Paragraph, self).__init__(hocr_html, parent, 'span', Line.HOCR_LINE_TAG, Line)
170 |
171 |     @property
172 |     def lines(self):
173 |         return self._elements
174 |
175 |     @property
176 |     def nlines(self):
177 |         return len(self._elements)
178 |
179 |     @property
180 |     def ocr_text(self):
181 |         output = ""
182 |         for element in self._elements[:-1]:
183 |             output += element.ocr_text
184 |             output += "\n"
185 |         output += self._elements[-1].ocr_text
186 |         return output
187 |
188 | class Line(HOCRElement):
189 |
190 |     HOCR_LINE_TAG = "ocr_line"
191 |
192 |     def __init__(self, parent, hocr_html):
193 |         super(Line, self).__init__(hocr_html, parent, 'span', Word.HOCR_WORD_TAG, Word)
194 |         self._ocr_text_normalized = None  # custom property, None if not assigned
195 |
196 |
197 |     @property
198 |     def words(self):
199 |         return self._elements
200 |
201 |     @property
202 |     def nwords(self):
203 |         return len(self._elements)
204 |
205 |     @property
206 |     def ocr_text(self):
207 |         output = ""
208 |         for element in self._elements[:-1]:
209 |             output += element.ocr_text
210 |             output += " "
211 |         output += self._elements[-1].ocr_text
212 |         return output
213 |
214 |     @property
215 |     def ocr_text_normalized(self):
216 |         return self._ocr_text_normalized
217 |
218 |     @ocr_text_normalized.setter
219 |     def ocr_text_normalized(self, new_text):
220 |         self._ocr_text_normalized = new_text
221 |
222 | class Word(HOCRElement):
223 |
224 |     HOCR_WORD_TAG = "ocrx_word"
225 |     _xwconf = None
226 |     _xconfs = None
227 |
228 |     def __init__(self, parent, hocr_html):
229 |         super(Word, self).__init__(hocr_html, parent, None, None, None)
230 |         title = hocr_html.attrs['title']
231 |         titlesplit = title.split(';')
232 |         for element in titlesplit:
233 |             if 'x_wconf' in element:
234 |                 self._xwconf = element.strip().split(' ')[1]
235 |             if "x_confs" in element:
236 |                 self._xconfs = element.strip().split(' ')[1:]
237 |                 break
238 |
239 |
240 |     @property
241 |     def ocr_text(self):
242 |         word = self._hocr_html.string
243 |         if word is not None:
244 |             return word
245 |         else:
246 |             return ""
--------------------------------------------------------------------------------
/lib/akf_parsing_functions_jk.py:
--------------------------------------------------------------------------------
1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from .akf_parsing_functions_common import AKFCommonParsingFunctions as cf
4 | from lib.table_parser import Datatable, Sharetable, Dividendtable
5 | import time
6 |
7 | def timeit(method):
8 |     def timed(*args, **kw):
9 |         ts = time.time()
10 |         result = method(*args, **kw)
11 |         te = time.time()
12 |
13 |         if 'log_time' in kw:
14 |             name = kw.get('log_name', method.__name__.upper())
15 |             kw['log_time'][name] = int((te - ts) * 1000)
16 |         else:
17 |             print('%r %2.2f ms' % \
18 |                   (method.__name__, (te - ts) * 1000))
19 |         return result
20 |
21 |     return timed
22 |
23 | class AkfParsingFunctionsJK(object):
24 |
25 |     def __init__(self, endobject_factory, output_analyzer, dictionary_handler, ocromore_data=None):
26 |         config_handler = ConfigurationHandler(first_init=False)
27 |
28 |         self.config = config_handler.get_config()
29 |         self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_THREE, self.config.PRINT_EXCEPTION_LEVEL,
30 |                                     self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
31 |
32 |         self.cpr.print("init akf parsing functions jk")
33 |
34 |         self.ef = endobject_factory
35 |         self.output_analyzer = output_analyzer
36 |         self.ocromore_data = ocromore_data
37 |         self.dictionary_handler = dictionary_handler
38 |
39 |     def parse_bilanzen(self, real_start_tag, content_texts, content_lines, feature_lines,
segmentation_class): 40 | # get basic data 41 | element_counter = 0 42 | origpost, origpost_red, element_counter, content_texts = \ 43 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter) 44 | 45 | # logme 46 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag) 47 | 48 | # init 49 | only_add_if_string = True 50 | if self.config.LOG_SIMPLE: 51 | geschaeftslage = origpost_red.replace("- ", "") 52 | 53 | #parsing 54 | self.ef.add_to_my_obj("balances", geschaeftslage, object_number=element_counter,only_filled=only_add_if_string) 55 | return True 56 | #parsing 57 | table = Datatable(snippet=segmentation_class.snippet) 58 | table.analyse_structure(content_lines,feature_lines, template="datatable_balance") 59 | table.extract_content(content_lines, feature_lines, template="datatable_balance") 60 | 61 | # Write information for income table parsing 62 | segmentation_class.info_handler["income"] = {} 63 | segmentation_class.info_handler["income"]["amount"] = table.info.amount 64 | segmentation_class.info_handler["income"]["col"] = table.info.col 65 | segmentation_class.info_handler["income"]["separator"] = table.info.separator 66 | 67 | # Parsing the tables based on whitespace and number of numbers of each group 68 | # This should be the last option to parse (error-prone) 69 | self.ef.add_to_my_obj("balances", table.content, object_number=element_counter,only_filled=only_add_if_string) 70 | 71 | def parse_gewinn_und_verlust(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class): 72 | # get basic data 73 | element_counter = 0 74 | origpost, origpost_red, element_counter, content_texts = \ 75 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter) 76 | 77 | # logme 78 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag) 79 | 80 | # init 81 | only_add_if_string = True 82 | if self.config.LOG_SIMPLE: 83 | geschaeftslage = origpost_red.replace("- ", "") 84 | 85 | #parsing 86 | self.ef.add_to_my_obj("income", geschaeftslage, object_number=element_counter,only_filled=only_add_if_string) 87 | return True 88 | 89 | # parsing 90 | table = Datatable(snippet=segmentation_class.snippet) 91 | table.analyse_structure(content_lines, feature_lines, template="datatable_income") 92 | if segmentation_class.info_handler and "income" in set(segmentation_class.info_handler.keys()): 93 | table.info.col = segmentation_class.info_handler["income"]["col"] 94 | table.info.amount = segmentation_class.info_handler["income"]["amount"] 95 | table.info.separator = segmentation_class.info_handler["income"]["separator"] 96 | 97 | table.extract_content(content_lines, feature_lines, template="datatable_income") 98 | 99 | 100 | #parsing 101 | self.ef.add_to_my_obj("income", table.content, object_number=element_counter, 102 | only_filled=only_add_if_string) 103 | 104 | def parse_aktienkurse(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class): 105 | # get basic data 106 | element_counter = 0 107 | origpost, origpost_red, element_counter, content_texts = \ 108 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter) 109 | 110 | # logme 111 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag) 112 | 113 | # init 114 | only_add_if_string = True 115 | #self.config.LOG_SIMPLE = False 116 | 
if self.config.LOG_SIMPLE: 117 | # self.config.LOG_SIMPLE = False 118 | skip = origpost_red.replace("- ", "") 119 | 120 | # parsing 121 | self.ef.add_to_my_obj("shares", skip, object_number=element_counter, 122 | only_filled=only_add_if_string) 123 | return True 124 | 125 | # parsing 126 | table = Sharetable(snippet=segmentation_class.snippet) 127 | table.analyse_structure(content_lines, feature_lines) 128 | table.extract_content(content_lines, feature_lines) 129 | #from timeit import timeit 130 | #print(timeit(test)) 131 | # parsing 132 | self.ef.add_to_my_obj("shares", table.content, object_number=element_counter, 133 | only_filled=only_add_if_string) 134 | 135 | def parse_dividend(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class): 136 | # get basic data 137 | element_counter = 0 138 | origpost, origpost_red, element_counter, content_texts = \ 139 | cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter) 140 | 141 | # logme 142 | self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag) 143 | 144 | # init 145 | only_add_if_string = True 146 | # self.config.LOG_SIMPLE = True 147 | if self.config.LOG_SIMPLE: 148 | # self.config.LOG_SIMPLE = False 149 | skip = origpost_red.replace("- ", "") 150 | 151 | # parsing 152 | self.ef.add_to_my_obj("dividende", skip, object_number=element_counter, 153 | only_filled=only_add_if_string) 154 | return True 155 | 156 | # parsing 157 | table = Dividendtable(snippet=segmentation_class.snippet) 158 | table.analyse_structure(content_lines, feature_lines) 159 | table.extract_content(content_lines, feature_lines) 160 | # from timeit import timeit 161 | # print(timeit(test)) 162 | # parsing 163 | self.ef.add_to_my_obj("dividende", table.content, object_number=element_counter, 164 | only_filled=only_add_if_string) 165 | -------------------------------------------------------------------------------- /lib/feature_extractor.py: -------------------------------------------------------------------------------- 1 | from akf_corelib.conditional_print import ConditionalPrint 2 | from akf_corelib.configuration_handler import ConfigurationHandler 3 | from akf_corelib.random import Random 4 | 5 | import numpy as np 6 | 7 | 8 | class LineFeatures(): 9 | counter_special_chars = -1 10 | counter_alphanumerical_chars = -1 11 | counter_numbers = -1 12 | counter_chars = -1 13 | counter_alphabetical = -1 14 | counter_words = -1 15 | counter_spaces = -1 16 | counters_alphabetical_ratios = [] 17 | counters_wordlengths = [] 18 | counters_numbers = [] 19 | special_chars_ratio = -1 20 | alphanumerical_chars_ratio = -1 21 | alphabetical_ratio = -1 22 | spaces_ratio = -1 23 | numbers_ratio = -1 24 | 25 | x_box_sizes = [] 26 | x_gaps = [] 27 | 28 | maximum_x_gap = None 29 | mean_x_gap = None 30 | median_x_gap = None 31 | 32 | many_numbers_in_first_word = False 33 | many_alphabetical_in_middle_words = False 34 | many_alphabetical_in_last_word = False 35 | 36 | def __init__(self, cpr): 37 | self.cpr = cpr 38 | 39 | def print_me(self): 40 | self.cpr.print("alle cntr:", self.counter_chars) 41 | self.cpr.print("spec cntr:", self.counter_special_chars, "ratio", self.special_chars_ratio) 42 | self.cpr.print("alnr cntr:", self.counter_alphanumerical_chars, "ratio", self.alphanumerical_chars_ratio) 43 | self.cpr.print("albt cntr:", self.counter_alphabetical, "ratio", self.alphabetical_ratio) 44 | self.cpr.print("spce cntr:", self.counter_spaces, "ratio", self.spaces_ratio) 
45 |         self.cpr.print("nmbr cntr:", self.counter_numbers, "ratio", self.numbers_ratio)
46 |         self.cpr.print("x_box_sizes", self.x_box_sizes)
47 |         self.cpr.print("x_gaps", self.x_gaps)
48 |         self.cpr.print("x_gap_max_size", self.maximum_x_gap)
49 |         self.cpr.print("x_gaps_mean", self.mean_x_gap)
50 |         self.cpr.print("x_gaps_median", self.median_x_gap)
51 |
52 | class FeatureExtractor():
53 |
54 |     def __init__(self):
55 |         config_handler = ConfigurationHandler(first_init=False)
56 |
57 |         self.config = config_handler.get_config()
58 |         self.cpr = ConditionalPrint(self.config.PRINT_FEATURE_EXTRACTOR, self.config.PRINT_EXCEPTION_LEVEL,
59 |                                     self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
60 |
61 |         self.filter_start_words = ["Fernruf:", "Vorstand:", "Fernschreiber:",
62 |                                    "von", "Gründung:", "Ordnungsnr.", "Ordnungsnr",
63 |                                    "Grundkapital:", "Umstellung"]
64 |
65 |
66 |     def extract_file_features(self, ocromore_data):
67 |         all_line_features = []
68 |         for line in ocromore_data['lines']:
69 |             current_line_features = self.extract_line_features(line)
70 |             all_line_features.append(current_line_features)
71 |
72 |         ocromore_data['line_features'] = all_line_features
73 |
74 |         return ocromore_data
75 |
76 |
77 |     def extract_line_features(self, line):
78 |
79 |         final_line_features = {}
80 |
81 |         whole_text = line['text']
82 |
83 |         self.cpr.print("recognizing text:", whole_text)
84 |
85 |         # counters
86 |         counter_special_chars = 0
87 |         counter_alphanumerical_chars = 0
88 |         counter_numbers = 0
89 |         counter_chars = len(whole_text)
90 |         counter_alphabetical = 0
91 |         counter_words = 0
92 |         counters_alphabetical_ratios = []
93 |         counters_wordlengths = []
94 |         counters_numbers = []
95 |
96 |         character_index = 0
97 |         # special conditions
98 |         ultimo_is_first_word = False
99 |         first_word_no_table_indicator = False
100 |         starts_with_parenthesis = False
101 |         ends_with_parenthesis = False
102 |
103 |         last_xstop = 0
104 |         x_box_sizes = []
105 |         x_gaps = []
106 |         for word_obj in line['words']:
107 |             word_index = word_obj['word_index']
108 |             word_text = word_obj['text']
109 |             hocr_coordinates = word_obj['hocr_coordinates']
110 |
111 |             word_xstart = hocr_coordinates[0]
112 |             word_xstop = hocr_coordinates[2]
113 |             word_box_size = word_xstop - word_xstart
114 |             x_box_sizes.append(word_box_size)
115 |
116 |             if word_index >= 1:
117 |                 x_gap = word_xstop - last_xstop  # measured from the previous word's right edge to this word's right edge
118 |                 x_gaps.append(x_gap)
119 |
120 |             #line.data['word_x0']
121 |             if word_text is None or word_text == "":
122 |                 continue
123 |
124 |             if word_index == 0:
125 |                 if word_text in self.filter_start_words:
126 |                     first_word_no_table_indicator = True
127 |                 if word_text.lower() == "ultimo":
128 |                     ultimo_is_first_word = True
129 |                 if word_text[0] == "(":
130 |                     starts_with_parenthesis = True
131 |
132 |
133 |             if word_index == len(line['words']) - 1:  # last word in the line (was compared against the character count before)
134 |                 if word_text[-1] == ")":
135 |                     ends_with_parenthesis = True
136 |
137 |
138 |
139 |             counter_alphabetical_chars_word = 0
140 |             counter_alphanumerical_chars_word = 0
141 |             counter_numbers_word = 0
142 |
143 |
144 |             counter_words += 1
145 |
146 |             word_list = list(word_text)
147 |             for char in word_list:
148 |                 if Random.is_special_character(char):
149 |                     counter_special_chars += 1
150 |                 elif Random.is_alphanumerical_character(char):
151 |                     counter_alphanumerical_chars += 1
152 |                     counter_alphanumerical_chars_word += 1
153 |                     if char.isdigit():
154 |                         counter_numbers += 1
155 |                         counter_numbers_word += 1
156 |
157 |             counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
158 |
ratio_alphabetical_word = np.round(counter_alphabetical_word/len(word_text), 2) 159 | counters_alphabetical_ratios.append(ratio_alphabetical_word) 160 | counters_wordlengths.append(len(word_text)) 161 | counters_numbers.append(counter_numbers_word) 162 | character_index += len(word_text) 163 | last_xstop = word_xstop 164 | 165 | 166 | # get number of spaces 167 | len_whole_unspace = len(whole_text.replace(" ", "")) 168 | counter_spaces = counter_chars - len_whole_unspace 169 | # set alphabetical counter 170 | counter_alphabetical = counter_alphanumerical_chars - counter_numbers 171 | 172 | 173 | if counter_chars == 0: 174 | self.cpr.printw("no chars in line:", str(line['line_index']),"no features here") 175 | return False 176 | 177 | special_chars_ratio = counter_special_chars/ counter_chars 178 | alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars 179 | alphabetical_ratio = counter_alphabetical / counter_chars 180 | spaces_ratio = counter_spaces/ counter_chars 181 | numbers_ratio = counter_numbers / counter_chars 182 | 183 | 184 | maximum_x_gap = None 185 | mean_x_gap = None 186 | median_x_gap = None 187 | 188 | if len(x_gaps) >= 1: 189 | maximum_x_gap = max(x_gaps) 190 | mean_x_gap = np.mean(x_gaps) 191 | median_x_gap = np.median(x_gaps) 192 | 193 | many_numbers_in_first_word = False 194 | many_alphabetical_in_middle_words = False 195 | many_alphabetical_in_last_word = False 196 | 197 | # check some middle and last word conditions 198 | for counter_index, counter in enumerate(counters_wordlengths): 199 | if counter_index == 0: 200 | ctr_numbers = counters_numbers[counter_index] 201 | numbers_ratio_word = np.round(ctr_numbers/counter,2) 202 | if numbers_ratio_word > 0.8: 203 | many_numbers_in_first_word = True 204 | elif counter_index == len(counters_wordlengths)-1: 205 | if counter >= 4: 206 | alphabetical_ratio_word = counters_alphabetical_ratios[counter_index] 207 | if alphabetical_ratio_word >= 0.75: 208 | many_alphabetical_in_last_word = True 209 | 210 | else: 211 | if counter >= 4: 212 | alphabetical_ratio_word = counters_alphabetical_ratios[counter_index] 213 | if alphabetical_ratio_word >= 0.75: 214 | many_alphabetical_in_middle_words = True 215 | 216 | 217 | 218 | 219 | 220 | final_line_features = LineFeatures(cpr=self.cpr) 221 | final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word 222 | 223 | final_line_features.counter_special_chars = counter_special_chars 224 | final_line_features.counter_chars = counter_chars 225 | final_line_features.counter_spaces = counter_spaces 226 | final_line_features.counter_numbers = counter_numbers 227 | final_line_features.counter_alphabetical = counter_alphabetical 228 | final_line_features.counter_alphanumerical_chars = counter_alphanumerical_chars 229 | final_line_features.counter_words = counter_words 230 | 231 | final_line_features.counters_numbers = counters_numbers 232 | final_line_features.counters_wordlengths = counters_wordlengths 233 | final_line_features.counters_alphabetical_ratios = counters_alphabetical_ratios 234 | 235 | final_line_features.numbers_ratio = numbers_ratio 236 | final_line_features.alphabetical_ratio = alphabetical_ratio 237 | final_line_features.alphanumerical_chars_ratio = alphanumerical_chars_ratio 238 | final_line_features.special_chars_ratio = special_chars_ratio 239 | final_line_features.spaces_ratio = spaces_ratio 240 | 241 | final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word 242 | 
final_line_features.many_alphabetical_in_middle_words = many_alphabetical_in_middle_words 243 | final_line_features.many_numbers_in_first_word = many_numbers_in_first_word 244 | final_line_features.x_box_sizes = x_box_sizes 245 | final_line_features.x_gaps = x_gaps 246 | 247 | final_line_features.maximum_x_gap = maximum_x_gap 248 | final_line_features.mean_x_gap = mean_x_gap 249 | final_line_features.median_x_gap = median_x_gap 250 | 251 | 252 | 253 | return final_line_features -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 Universitätsbibliothek Mannheim 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
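The file that follows, lib/segment_parser.py, dispatches every classified segment tag to a parsing function through a plain dict of bound methods (see FunctionMapAKF below). A minimal sketch of that dispatch pattern, with hypothetical tags and handler bodies:

```
# sketch of the tag -> bound-method dispatch used by FunctionMapAKF;
# the tags and handler bodies here are hypothetical stand-ins
class DemoParsingFunctions:
    def parse_sitz(self, content_texts):
        return {"type": "Sitz", "raw": content_texts}

    def parse_vorstand(self, content_texts):
        return {"type": "Vorstand", "raw": content_texts}


demo = DemoParsingFunctions()
function_map = {
    "Sitz": demo.parse_sitz,
    "Vorstand": demo.parse_vorstand,
}

segment_tag = "Sitz"
if segment_tag in function_map:  # unknown tags are simply skipped
    print(function_map[segment_tag](["Sitz: Mannheim"]))
```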
202 | -------------------------------------------------------------------------------- /lib/segment_parser.py: -------------------------------------------------------------------------------- 1 | from akf_corelib.conditional_print import ConditionalPrint 2 | from akf_corelib.configuration_handler import ConfigurationHandler 3 | from .akf_parsing_functions_one import AkfParsingFunctionsOne 4 | from .akf_parsing_functions_two import AkfParsingFunctionsTwo 5 | from .akf_parsing_functions_three import AkfParsingFunctionsThree 6 | from .akf_parsing_functions_jk import AkfParsingFunctionsJK 7 | 8 | from .akf_parsing_functions_tables_one import AkfParsingFunctionsTablesOne 9 | 10 | from .data_helper import DataHelper 11 | from .segment_parser_endobject_factory import EndobjectFactory 12 | from lib.data_helper import DataHelper as dh 13 | from lib.snippet_ocr import Snippet 14 | import glob 15 | import os 16 | 17 | 18 | class FunctionMapAKF(object): 19 | """ 20 | This is a holder class which maps segment 21 | tags to parsing functions (here for AKF-Projekt) 22 | can be swapped for other projects 23 | """ 24 | 25 | def __init__(self, endobject_factory, output_analyzer, dictionary_handler): 26 | self.ef = endobject_factory 27 | self.akf_one = AkfParsingFunctionsOne(endobject_factory, output_analyzer, dictionary_handler) 28 | self.akf_two = AkfParsingFunctionsTwo(endobject_factory, output_analyzer, dictionary_handler) 29 | self.akf_three = AkfParsingFunctionsThree(endobject_factory, output_analyzer, dictionary_handler) 30 | self.akf_jk = AkfParsingFunctionsJK(endobject_factory, output_analyzer, dictionary_handler) 31 | 32 | self.akf_tables_one = AkfParsingFunctionsTablesOne(endobject_factory, output_analyzer, dictionary_handler) 33 | 34 | # for the keys use the keys from 'akf_segment_holder' or similar 35 | 36 | self.function_map = { 37 | "Firmenname": self.akf_one.parse_firmenname, 38 | "Sitz": self.akf_one.parse_sitz, 39 | "Verwaltung": self.akf_one.parse_verwaltung, 40 | "Telefon/Fernruf": self.akf_one.parse_telefon_fernruf, 41 | "Vorstand": self.akf_one.parse_vorstand, 42 | "Aufsichtsrat": self.akf_one.parse_aufsichtsrat, 43 | "Gründung": self.akf_one.parse_gruendung, 44 | "Arbeitnehmervertreter": self.akf_one.parse_arbeitnehmervertreter, 45 | "Tätigkeitsgebiet": self.akf_one.parse_taetigkeitsgebiet, 46 | "Zahlstellen": self.akf_two.parse_zahlstellen, 47 | "Grundkapital": self.akf_two.parse_grundkapital, 48 | "OrdnungsNrAktien": self.akf_two.parse_ordnungsnrdaktien, 49 | "Großaktionär": self.akf_two.parse_grossaktionaer, 50 | "Geschäftsjahr": self.akf_two.parse_geschaeftsjahr, 51 | "StimmrechtAktien": self.akf_two.parse_stimmrechtaktien, 52 | "Börsennotiz": self.akf_two.parse_boersennotiz, 53 | "Stückelung": self.akf_two.parse_stueckelung, 54 | "Aktienkurse": self.akf_jk.parse_aktienkurse, 55 | "Dividenden": self.akf_jk.parse_dividend, # is table 56 | "DividendenAufXYaktien": self.akf_jk.parse_dividend, # is table 57 | "BeratendeMitglieder": self.akf_three.parse_beratende_mitglieder, 58 | "Gesellschafter": self.akf_three.parse_gesellschafter,# not in first 500 files 1956?? 59 | "Sekretäre": self.akf_three.parse_sekretaere, # not in first 500 files 1956?? 60 | "Geschäftsleitung": self.akf_three.parse_geschaeftsleitung, # not in first 500 files 1956?? 61 | "Generaldirektion": self.akf_three.parse_generaldirektion, # not in first 500 files 1956?? 62 | "Direktionskomitee": self.akf_three.parse_direktionskomitee, # not in first 500 files 1956?? 
63 | "Vizegeneraldirektoren": self.akf_three.parse_vizegeneraldirektoren, # not in first 500 files 1956?? 64 | "Fernschreiber": self.akf_three.parse_fernschreiber, 65 | "Filialen": self.akf_three.parse_filialen, # not a category in 1956 -> #todo maybe use later 66 | "Auslandsvertretungen": self.akf_three.parse_auslandsvertretungen, # not a category in 1956 -> #todo maybe use later 67 | "KommanditeUndBank": self.akf_three.parse_kommandite_und_bank, # not a category in 1956 -> #todo maybe use later 68 | "Niederlassungen": self.akf_three.parse_niederlassungen, 69 | "Erzeugnisse": self.akf_three.parse_erzeugnisse, 70 | "Haupterzeugnisse": self.akf_three.parse_haupterzeugnisse, 71 | "Spezialitäten": self.akf_three.parse_spezialitaeten, 72 | "Anlagen": self.akf_three.parse_anlagen, 73 | "Zweigniederlassungen": self.akf_three.parse_zweigniederlassungen, 74 | "Werke/Betriebsstätten": self.akf_three.parse_werke_betriebsstaetten, 75 | "Betriebsanlagen": self.akf_three.parse_betriebsanlagen, 76 | "Beteiligungsgesellschaften": self.akf_three.parse_beteiligungsgesellschaften, # not a category in 1956 -> #todo maybe use later 77 | "Beteiligungen": self.akf_three.parse_beteiligungen, 78 | "Tochtergesellschaften": self.akf_three.parse_tochtergesellschaften, 79 | "Wertpapier-Kenn-Nr": self.akf_three.parse_wertpapier_kenn_nr, # not a category in 1956 -> #todo maybe use later 80 | "RechteVorzugsaktien": self.akf_three.parse_rechte_und_vorzugsaktien, 81 | "Aktionäre": self.akf_three.parse_aktionaere, 82 | "Anleihen": self.akf_three.parse_anleihen, 83 | "KursVonZuteilungsrechten": self.akf_three.parse_kurse_v_zuteilungsrechten, 84 | "Emissionsbetrag": self.akf_three.parse_emissionsbetrag, 85 | "AusDenKonsolidiertenBilanzen": self.akf_jk.parse_bilanzen, # table 86 | "AusDenBilanzen": self.akf_jk.parse_bilanzen, # table 87 | "Konsolid.Gewinn-u.Verlustrechnungen": self.akf_jk.parse_gewinn_und_verlust, # table 88 | "AusGewinnVerlustrechnungen": self.akf_jk.parse_gewinn_und_verlust, # @jk last element works now 89 | "Bezugsrechte": self.akf_three.parse_bezugsrechte, 90 | "ZurGeschäftslage": self.akf_three.parse_geschaeftslage 91 | } 92 | 93 | def get_function_map(self): 94 | return self.function_map 95 | 96 | 97 | 98 | 99 | class SegmentParser(object): 100 | """ 101 | Parse the classified segments segment by segment, 102 | each segment defined code the parser points to. 
103 | """ 104 | 105 | def __init__(self, output_analyzer, dictionary_handler, ocromore_data=None): 106 | 107 | self.ef = EndobjectFactory() 108 | self.dictionary_handler = dictionary_handler 109 | 110 | # map which maps tags to functions for parsing -> change constuctor for other project 111 | fmap = FunctionMapAKF(self.ef, output_analyzer, dictionary_handler) 112 | 113 | config_handler = ConfigurationHandler(first_init=False) 114 | 115 | self.config = config_handler.get_config() 116 | self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER, self.config.PRINT_EXCEPTION_LEVEL, 117 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__) 118 | 119 | self.function_map = fmap.get_function_map() 120 | self.result_root = self.config.OUTPUT_ROOT_PATH + "/results/" 121 | 122 | def clear_result(self, output_analyzer, dictionary_handler, ocromore_data=None): 123 | # create a new end object factory, new content 124 | self.ef = EndobjectFactory() 125 | # map to the new ef object which has been recreated 126 | fmap = FunctionMapAKF(self.ef, output_analyzer, dictionary_handler) 127 | self.function_map = fmap.get_function_map() 128 | 129 | 130 | def parse_segments(self, ocromore_data): 131 | self.ocromore_data = ocromore_data 132 | segmentation = ocromore_data['segmentation'] 133 | segmentation_classes = segmentation.my_classes 134 | 135 | # add all text from original file if activated (i.e. for debugging purposes) 136 | if self.config.ADD_FULLTEXT_ENTRY: 137 | all_texts = self.get_all_text(ocromore_data) 138 | self.ef.set_current_main_list("overall_info") 139 | self.ef.add_to_my_obj("fulltexts",all_texts) 140 | # add additional info to result 141 | if self.config.ADDITIONAL_INFORMATION and self.config.ADD_ADDITIONAL_INFO: 142 | if not self.config.ADD_FULLTEXT_ENTRY: 143 | self.ef.set_current_main_list("Information") 144 | self.ef.add_to_my_obj("additionals", ocromore_data["additional_info"]) 145 | # add a duplicate of the original text from which in the below analysis case the files get subtracted 146 | if self.config.LOG_SEGMENTED_TO_ORIG_DIFF_PER_FILE: 147 | if self.config.ADD_FULLTEXT_ENTRY: 148 | ocromore_data['analysis_to_orig'] = {} 149 | original_rest, complete_text = self.get_all_text(ocromore_data, join_separated_lines=True) 150 | ocromore_data['analysis_to_orig']['original_rest'] = original_rest 151 | ocromore_data['analysis_to_orig']['original_length_initial'] = len(complete_text) 152 | else: 153 | self.cpr.printw("activated segment to orig diff, but no saving of origin activate ADD_FULLTEXT_ENTRY " 154 | "in config for this functionality") 155 | 156 | 157 | 158 | #Init toolbbox 159 | snippet = None 160 | if self.config.USE_SNIPPET: 161 | if "./" in self.config.IMGPATH: 162 | ipath = os.path.dirname(ocromore_data["file_info"].path)+self.config.IMGPATH[1:] 163 | else: 164 | ipath = os.path.normcase(self.config.IMGPATH) 165 | results = glob.glob(ipath+ocromore_data["file_info"].name.split(".")[0].replace("_msa_best","")+"*",recursive=True) 166 | if results: 167 | snippet = Snippet() 168 | snippet.imread(results[0]) 169 | else: 170 | self.config.USE_TOOLBBOX = False 171 | info_handler = {} 172 | # start parsing for each successfully segmented area 173 | for segmentation_class in segmentation_classes: 174 | 175 | # if the class segment was recognized ... 
176 | if segmentation_class.is_start_segmented(): 177 | # get the unique identifier for this class 178 | segment_tag = segmentation_class.get_segment_tag() 179 | segmentation_class.snippet = snippet 180 | segmentation_class.info_handler = info_handler 181 | self.trigger_mapped_function(segment_tag, segmentation_class, ocromore_data) 182 | 183 | 184 | # add and return result 185 | ocromore_data['results'] = self.ef 186 | return ocromore_data 187 | 188 | def trigger_mapped_function(self, segment_tag, segmentation_class, ocromore_data): 189 | 190 | if segment_tag not in self.function_map.keys(): 191 | return 192 | #todo: fileinfo -> parsing 193 | real_start_tag, content_texts, content_lines, feature_lines = self.prepare_parsing_info(segmentation_class, ocromore_data) 194 | 195 | # switch the object to save context 196 | segment_tag = segmentation_class.segment_tag 197 | self.ef.set_current_main_list(segment_tag) 198 | 199 | # call the mapped function, which fills the end-factory 200 | self.function_map[segment_tag].__call__(real_start_tag, content_texts, content_lines, feature_lines, segmentation_class) 201 | 202 | def prepare_parsing_info(self, segmentation_class, ocromore_data): 203 | lines = ocromore_data['lines'] 204 | line_features = ocromore_data['line_features'] 205 | real_start_tag, content_texts, content_lines, feature_lines = \ 206 | DataHelper.get_content(lines,line_features, segmentation_class) 207 | 208 | return real_start_tag, content_texts, content_lines, feature_lines 209 | 210 | def get_all_text(self, ocromore_data, join_separated_lines=False): 211 | """ 212 | Gets all text lines in ocromore_data as 213 | array and as joined string 214 | :param ocromore_data: data from which the text is extracted 215 | :return: texts list, complete text 216 | """ 217 | all_texts = [] 218 | complete_text = "" 219 | for line in ocromore_data['lines']: 220 | text = line['text'] 221 | all_texts.append(text) 222 | complete_text += text 223 | 224 | if join_separated_lines: 225 | complete_text = "" 226 | all_texts = dh.join_separated_lines(all_texts) 227 | for text in all_texts: 228 | complete_text += text 229 | 230 | return all_texts, complete_text 231 | 232 | def write_result_to_output(self, as_json, ocromore_data): 233 | if as_json is True: 234 | my_json = self.ef.export_as_json() 235 | my_json_lines = my_json.split("\n") 236 | dh.write_array_to_root("result_json/", my_json_lines, ocromore_data, self.result_root) -------------------------------------------------------------------------------- /lib/segment_parser_endobject_factory.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pprint 3 | from akf_corelib.conditional_print import ConditionalPrint 4 | from akf_corelib.configuration_handler import ConfigurationHandler 5 | from lib.akf_known_uncategories import KnownUncategories 6 | 7 | class EndobjectFactory(object): 8 | """ 9 | Creates an object with the following structure and provides exporting methods: 10 | 11 | segment_tag_1: [ ---> this level is created by set_current_main_list 12 | { 13 | type: "Sitz" ---> add this level entries with add_to_my_object object_number=0 14 | city: "Neustadt" 15 | }, 16 | { 17 | type: "Sitz" ---> add this level entries with add_to_my_object object_number=0 18 | city: "Neustadt" 19 | } 20 | 21 | ], 22 | segment_tag_2: [ 23 | { 24 | ... 25 | } 26 | ... 
27 | ] 28 | """ 29 | def __init__(self): 30 | self.my_object = {} 31 | self.current_main_list = None 32 | self.pp = pprint.PrettyPrinter(indent=5) 33 | 34 | config_handler = ConfigurationHandler(first_init=False) 35 | 36 | self.config = config_handler.get_config() 37 | self.cpr = ConditionalPrint(self.config.PRINT_OUTPUT_ANALYSIS, self.config.PRINT_EXCEPTION_LEVEL, 38 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__) 39 | 40 | if self.config.REMOVE_TAGS_IN_ORIG_DIFF: 41 | self.known_uc = KnownUncategories() 42 | 43 | def set_current_main_list(self, segment_tag): 44 | if segment_tag not in self.my_object.keys(): 45 | self.my_object[segment_tag] = [] # create the main list (all subsequent entries are stored here) 46 | 47 | self.current_main_list = self.my_object[segment_tag] # create a short link on the main list 48 | 49 | def add_to_my_obj(self, key, value, object_number=0, only_filled=False): 50 | 51 | if only_filled is True and (value == None or value == "" or value == [] or value == {}): 52 | return False 53 | 54 | # fill main list if object index not in 55 | len_list = len(self.current_main_list) 56 | if len_list < object_number+1: 57 | for index in range(len_list,object_number+1): 58 | self.current_main_list.append({}) 59 | 60 | self.cpr.print("Adding value to List,- ObjectNr.:", object_number,"Key:", key, "Value:", value) 61 | # add or insert to the main_list 62 | self.current_main_list[object_number][key] = value 63 | return True 64 | 65 | def print_me_and_return(self): 66 | print("my_object is:") 67 | self.pp.pprint(self.my_object) 68 | return self.my_object 69 | 70 | def print_current_main(self): 71 | print("current_main:") 72 | self.pp.pprint(self.current_main_list) 73 | 74 | def export_as_json(self): 75 | my_obj_json = json.dumps(self.my_object, indent=5, ensure_ascii=False) 76 | return my_obj_json 77 | 78 | def export_as_json_at_key(self, key, remove_first_object=False): 79 | 80 | if key not in self.my_object.keys(): 81 | return None 82 | 83 | my_obj = self.my_object[key] 84 | if remove_first_object: 85 | if len(my_obj) >= 1: 86 | my_obj = my_obj[1:] # remove the first object which usally contains generic info 87 | 88 | my_obj_json = json.dumps(my_obj, indent=5, ensure_ascii=False) 89 | return my_obj_json 90 | 91 | @staticmethod 92 | def fetch_subentries_recursive_check(entry): 93 | """ 94 | Fetches all subentries (values) from an entry and writes them to a list of texts 95 | This get's called recursively within the function until all subentries 96 | are found 97 | :param entry: entry to fetch the subentries from 98 | :return: list of subentries 99 | """ 100 | final_texts = [] 101 | 102 | for item in entry: 103 | if isinstance(entry, list): 104 | value = item 105 | else: 106 | # item is a key 107 | value = entry[item] 108 | if isinstance(value, str): 109 | final_texts.append(value) 110 | elif isinstance(value, int): 111 | final_texts.append(str(value)) 112 | elif isinstance(value, object): 113 | obj_size = len(value) 114 | if obj_size > 0: 115 | recursive_texts = EndobjectFactory.fetch_subentries_recursive_check(value) 116 | final_texts.extend(recursive_texts) 117 | 118 | return final_texts 119 | 120 | @staticmethod 121 | def fetch_keys_recusive_check(entry, final_keys, create_multiple=True): 122 | """ 123 | Fetches all keys in an object and it's sub-objects 124 | calls itself recursively until all keys are found 125 | writes final keys to final_keys array and returns this 126 | :param entry: object to fetch the sub-keys from 127 | :param final_keys: list 
of final keys (initial state) 128 | :param create_multiple: if the same key occurs multiple times it still gets added 129 | :return: final_keys with added keys from object 130 | """ 131 | 132 | if isinstance(entry, list): 133 | for item in entry: 134 | final_keys = EndobjectFactory.fetch_keys_recusive_check(item, final_keys, create_multiple) 135 | return final_keys 136 | elif not isinstance(entry, dict): 137 | # just return if there are no keys (cause no dictionary) 138 | return final_keys 139 | 140 | for key in entry: 141 | value = entry[key] 142 | if create_multiple or key not in final_keys: 143 | if isinstance(key, int): 144 | continue 145 | final_keys.append(key) 146 | final_keys = EndobjectFactory.fetch_keys_recusive_check(value, final_keys) 147 | return final_keys 148 | 149 | def diff_seg_to_orig_at_key(self, key): 150 | """ 151 | def fetch_subentries_recursive(entry): 152 | final_texts = [] 153 | 154 | for item in entry: 155 | if isinstance(entry, list): 156 | value = item 157 | else: 158 | # item is a key 159 | value = entry[item] 160 | if isinstance(value, str): 161 | final_texts.append(value) 162 | elif isinstance(value, int): 163 | final_texts.append(str(value)) 164 | elif isinstance(value, object): 165 | obj_size = len(value) 166 | if obj_size > 0: 167 | recursive_texts = fetch_subentries_recursive(value) 168 | final_texts.extend(recursive_texts) 169 | 170 | return final_texts 171 | """ 172 | if key not in self.my_object.keys(): 173 | return None 174 | 175 | my_data = self.my_object[key] 176 | 177 | # check if the orig-post property can exist warn if not 178 | if not self.config.ADD_INFO_ENTRY_TO_OUTPUT: 179 | self.cpr.printw("trying to fetch original data, original data is not added to results") 180 | self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True") 181 | if len(my_data) <= 0: 182 | self.cpr.printw("no data to do returning") 183 | return 184 | 185 | return # todo this seems to be wrong 186 | # copy orig string 187 | original_text = my_data[0]['origpost'] 188 | rest_text = original_text 189 | 190 | # fetch parsed entries for diff 191 | all_final_entries = [] # array of final entries 192 | for index in range(1, len(my_data)): 193 | entry = my_data[index] 194 | final_entries = fetch_subentries_recursive(entry) 195 | all_final_entries.extend(final_entries) 196 | 197 | # order diff data after length 198 | all_final_entries.sort(key=lambda x: len(x)) 199 | all_final_entries.reverse() 200 | 201 | # subtract 202 | for text in all_final_entries: 203 | rest_text = rest_text.replace(text, "") 204 | 205 | rest_text = rest_text.strip() 206 | 207 | return rest_text, original_text 208 | 209 | def diff_parsed_to_orig_at_key(self, key): 210 | """ 211 | def fetch_subentries_recursive(entry): 212 | final_texts = [] 213 | 214 | for item in entry: 215 | if isinstance(entry, list): 216 | value = item 217 | else: 218 | # item is a key 219 | value = entry[item] 220 | if isinstance(value, str): 221 | final_texts.append(value) 222 | elif isinstance(value, int): 223 | final_texts.append(str(value)) 224 | elif isinstance(value, object): 225 | obj_size = len(value) 226 | if obj_size > 0: 227 | recursive_texts = fetch_subentries_recursive(value) 228 | final_texts.extend(recursive_texts) 229 | 230 | return final_texts 231 | 232 | def fetch_keys_recusive(entry, final_keys, create_multiple=True): 233 | # just return if there are no keys (cause no dictionary) 234 | if not isinstance(entry, dict): 235 | return final_keys 236 | 237 | for key in entry: 238 | value = entry[key] 239 | if 
create_multiple or key not in final_keys: 240 | if isinstance(key, int): 241 | continue 242 | final_keys.append(key) 243 | final_keys = fetch_keys_recusive(value, final_keys) 244 | return final_keys 245 | """ 246 | if key not in self.my_object.keys(): 247 | return None 248 | 249 | #if key == "KursVonZuteilungsrechten": 250 | # print("todo remove debug") 251 | 252 | my_data = self.my_object[key] 253 | 254 | # check if the orig-post property can exist warn if not 255 | if not self.config.ADD_INFO_ENTRY_TO_OUTPUT: 256 | self.cpr.printw("trying to fetch original data, original data is not added to results") 257 | self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True") 258 | if len(my_data) <= 0: 259 | self.cpr.printw("no data to do returning") 260 | return 261 | # copy orig string 262 | original_text = my_data[0]['origpost'] 263 | rest_text = original_text 264 | 265 | # fetch parsed entries for diff 266 | pool_entries = [] # array of final entries 267 | for index in range(1, len(my_data)): 268 | entry = my_data[index] 269 | final_entries = EndobjectFactory.fetch_subentries_recursive_check(entry) 270 | pool_entries.extend(final_entries) 271 | 272 | if self.config.REMOVE_SPACES_IN_ORIGIN_DIFF is True: 273 | # removes all spaces from rest and comparison values because spaces are often 274 | # a problem in subtracting the rests 275 | rest_text = rest_text.replace(" ", "") 276 | for index in range(0,len(pool_entries)): 277 | pool_entries[index] = pool_entries[index].replace(" ", "") 278 | 279 | all_final_entries = [] 280 | 281 | # add the entries to the complete subtraction and tag them with '1' 282 | for pentry in pool_entries: 283 | all_final_entries.append((pentry, 1)) 284 | 285 | # if keys shall be subracted also add them also 286 | if self.config.REMOVE_TAGS_IN_ORIG_DIFF: 287 | pool_keys = [] # gets multiple of the same key for later 1 by 1 subtraction 288 | for index in range(1, len(my_data)): 289 | pool_keys = EndobjectFactory.fetch_keys_recusive_check(my_data[index], pool_keys, create_multiple=True) 290 | 291 | # also remove spaces in keys 292 | if self.config.REMOVE_SPACES_IN_ORIGIN_DIFF is True: 293 | for index in range(0, len(pool_keys)): 294 | pool_keys[index] = pool_keys[index].replace(" ", "") 295 | 296 | final_keys = [] 297 | for pkey in pool_keys: 298 | final_keys.append((pkey, 2)) 299 | 300 | all_final_entries.extend(final_keys) 301 | 302 | # order diff data after length 303 | all_final_entries.sort(key=lambda x: len(x[0])) 304 | all_final_entries.reverse() 305 | 306 | # subtract 307 | for entry in all_final_entries: 308 | text = entry[0] 309 | text_or_key = entry[1] 310 | if text_or_key == 2: 311 | if text in self.known_uc.unkeys: 312 | continue 313 | text_stripped = text.strip() # remove spaces so texts better fit in 314 | rest_text = rest_text.replace(text_stripped, "", 1) 315 | rest_text = rest_text.strip() 316 | 317 | return rest_text, original_text 318 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![docxstruct](docs/img/docxstruct_logo.png "docxstruct") 2 | ============ 3 | ![license](https://img.shields.io/badge/license-Apache%20License%202.0-blue.svg) 4 | 5 | Docxstruct parses .hocr-output of [ocromore][ocromore-link] to get a content-classified .json output 6 | for further database export. It is part of the [Aktienführer-Datenarchiv work process][akf-link], 7 | but can also be used independently. 
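The overall processing chain is wired up in main_start.py: per-line feature extraction, segment classification and segment parsing, followed by JSON export. A condensed sketch of that flow (configuration, database handling and the analysis logging are omitted here; see main_start.py for the full setup):

```
# condensed from main_start.py; output_analyzer, dictionary_handler and the
# fetched 'ocromore_data' come from the setup code that is omitted here
from lib.feature_extractor import FeatureExtractor
from lib.segment_classifier import SegmentClassifier
from lib.segment_parser import SegmentParser

feature_extractor = FeatureExtractor()
segment_classifier = SegmentClassifier()
segment_parser = SegmentParser(output_analyzer, dictionary_handler)

ocromore_data = feature_extractor.extract_file_features(ocromore_data)    # per-line features
ocromore_data = segment_classifier.classify_file_segments(ocromore_data)  # line segmentation
ocromore_data = segment_parser.parse_segments(ocromore_data)              # segment parsing
segment_parser.write_result_to_output(True, ocromore_data)                # json output
```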
8 | 
9 | # Installation
10 | 
11 | To initialize the git submodules (git version ~2.7.4):
12 | 
13 | `
14 | git submodule update --init --recursive
15 | `
16 | 
17 | For development, the PyCharm IDE 2017.3 Community Edition was used.
18 | 
19 | 
20 | If you use the PyCharm IDE to look at the accumulated segmentation analysis files, adapt the IDE settings to get a proper view.
21 | This is done in the idea.properties file, which can be opened via Help->Edit Custom Properties in PyCharm:
22 | 
23 | 
24 | `
25 | editor.soft.wrap.force.limit=10000
26 | `
27 | 
28 | 
29 | # Handling Code
30 | `Docxstruct` is made to be adapted for parsing other kinds of content
31 | than *Aktienführer data*. It can be used as a generic text-content recognizer and classifier
32 | and therefore provides lots of analysis helpers and structure for that.
33 | 
34 | Usually all akf-specific content is stored in files called *akf_XXX*;
35 | these are the parts where you might want to put your custom functionality
36 | (see the sketch at the end of the example section below).
37 | 
38 | Ways to do that are described in the following documentation parts.
39 | 
40 | # Input/Output Example
41 | The following example with Aktienführer data explains input and output. This is the basic input,
42 | which usually comes from a hocr file:
43 | ```
44 | [hOCR page markup not recoverable in this snapshot - the example page, titled "OCR Results", contains the OCR'd words:]
45 | Überlandwerk
46 | Unterfranken
47 | Aktiengesellschatt
48 | ```
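49 | 
50 | The output of the parsing step is a content-classified .json object per segment key.
51 | The original output example is not recoverable here; as a rough sketch of the format,
52 | these are the fields the 'Sitz' parser documents in lib/akf_parsing_functions_one.py
53 | (values illustrative):
54 | ```
55 | "Sitz": [
56 |     {
57 |         "origpost": "Mergenthalerallee 79-81, 65760 Eschborn ...",
58 |         "type": "Sitz",
59 |         "street": "Mergenthalerallee",
60 |         "street_number": "79-81",
61 |         "zip": "65760",
62 |         "city": "Eschborn"
63 |     }
64 | ],
65 | ```
66 | 
67 | # Adding a custom parsing function (sketch)
68 | A minimal sketch of a custom parsing function, assuming the call conventions of the existing
69 | akf parsing functions (compare lib/akf_parsing_functions_one.py). The function name, the
70 | result key "my_value" and the wiring into the segment holder are hypothetical and have to
71 | match your own segment definition:
72 | ```
73 | # sketch of a method inside a parsing class like AkfParsingFunctionsOne;
74 | # 'cf' and 'dh' are that file's imports (AKFCommonParsingFunctions, DataHelper)
75 | def parse_my_segment(self, real_start_tag, content_texts, content_lines,
76 |                      feature_lines, segmentation_class):
77 |     # get basic data: logs the element and strips the recognized start tag
78 |     element_counter = 0
79 |     origpost, origpost_red, element_counter, content_texts = \
80 |         cf.add_check_element(self, content_texts, real_start_tag,
81 |                              segmentation_class, element_counter)
82 | 
83 |     # minimal content handling: store the cleaned rest text under one key
84 |     value = dh.strip_if_not_none(origpost_red, ".,() ")
85 |     self.ef.add_to_my_obj("my_value", value, object_number=element_counter,
86 |                           only_filled=True)
87 |     return True
88 | ```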
57 | = 1 and text[len_text-1] == ":": 31 | text = text[0:len_text-1] 32 | 33 | return text 34 | 35 | 36 | @staticmethod 37 | def get_rest_content_start_line(segmentation_class, start_line, trim=True): 38 | text = start_line['text'] 39 | stop = segmentation_class.key_tag_cindex_stop 40 | rest_start = text[stop:] 41 | if trim: 42 | rest_start = rest_start.strip() 43 | return rest_start 44 | 45 | @staticmethod 46 | def remove_multiple_outbound_chars(text): 47 | """ 48 | Strips the left and the right side of special characters in a string 49 | and returns the stripped version then: 50 | example ".;my text is;,,," returns "my text is" 51 | :param text: input text 52 | :return: filtered text 53 | """ 54 | # print("input:", text) 55 | 56 | text_to_change = text 57 | 58 | # filter left side 59 | match_l = regex.search(r"^[^\w\s]*(?.*)", text_to_change) 60 | if match_l: 61 | rest = match_l.group("tag") 62 | text_to_change = rest 63 | 64 | if text_to_change == "": 65 | return text_to_change 66 | 67 | # filter right side 68 | match_r2 = regex.search(r"(?P[^\w\s]*)$", text_to_change) 69 | 70 | if match_r2: 71 | rest = match_r2.group("right_rest") 72 | text_to_change = DataHelper.rreplace(text_to_change, rest) 73 | 74 | # print("output:", text_to_change) 75 | return text_to_change 76 | 77 | @staticmethod 78 | def rreplace(text, replace_text): 79 | """ 80 | Replace text from the right hand side of a string 81 | by reversing the strings 82 | :param text: input text 83 | :return: filtered text 84 | """ 85 | reverse_text = text[::-1] 86 | reverse_replace_text = replace_text[::-1] 87 | new_reverse_text = reverse_text.replace(reverse_replace_text, "") 88 | new_text = new_reverse_text[::-1].strip() 89 | 90 | return new_text 91 | 92 | 93 | @staticmethod 94 | def get_content(segment_lines, feature_lines, segmentation_class): 95 | start_index = segmentation_class.get_start_line_index() 96 | stop_index = segmentation_class.get_stop_line_index() 97 | selected_start_line = segment_lines[start_index] 98 | feature_start_line = feature_lines[start_index] 99 | real_tag = DataHelper.get_real_tag_from_segment(segmentation_class, selected_start_line) 100 | rest_content_start_line = DataHelper.get_rest_content_start_line(segmentation_class, selected_start_line) 101 | 102 | # if there are no further line, return obtained content 103 | if start_index == stop_index: 104 | return real_tag, [rest_content_start_line], [selected_start_line], [feature_start_line] 105 | 106 | # otherwise fetch the rest of the content 107 | other_rest_content_texts = [] 108 | other_rest_content_lines = [] 109 | other_rest_feature_lines = [] 110 | 111 | other_rest_content_texts.append(rest_content_start_line) 112 | other_rest_content_lines.append(selected_start_line) 113 | other_rest_feature_lines.append(feature_start_line) 114 | 115 | for current_index in range(start_index+1, stop_index+1): 116 | current_line = segment_lines[current_index] 117 | current_feature_lines = feature_lines[current_index] 118 | other_rest_content_texts.append(current_line['text']) 119 | other_rest_content_lines.append(current_line) 120 | other_rest_feature_lines.append(current_feature_lines) 121 | 122 | return real_tag, other_rest_content_texts, other_rest_content_lines, other_rest_feature_lines 123 | 124 | 125 | @staticmethod 126 | def write_array_to_root_simple(base_path, tag, text_lines, analysis_root, append_mode=False): 127 | full_dir = analysis_root + base_path + "/" 128 | full_path = full_dir + tag + ".txt" 129 | 130 | fh.create_directory_tree(full_dir) 131 | # write 
append or normal 132 | if append_mode is True: 133 | my_file = io.open(full_path, 'a', encoding='utf8') 134 | else: 135 | my_file = io.open(full_path, 'w', encoding='utf8') 136 | 137 | for text_line in text_lines: 138 | my_file.write(text_line+"\n") 139 | 140 | my_file.close() 141 | 142 | @staticmethod 143 | def write_array_to_root(base_path, text_lines, ocromore_data, analysis_root, accumulated=False): 144 | """ 145 | Writes a line-array to the base path in root path with ocromore data file and db name 146 | :param base_path: 147 | :param text_lines: 148 | :param ocromore_data: 149 | :param analysis_root: root path in base directory 150 | :param accumulated: file is accumulated file naming different 151 | :return: 152 | """ 153 | 154 | dbpath = ocromore_data['file_info'].dbpath 155 | tablename = ocromore_data['file_info'].tablename 156 | 157 | full_dir = analysis_root + base_path + dbpath+"/" 158 | if accumulated is False: 159 | full_path = full_dir + tablename + ".txt" 160 | else: 161 | full_path = full_dir +"accumulated_report"+".txt" 162 | 163 | fh.create_directory_tree(full_dir) 164 | 165 | my_file = io.open(full_path, 'w', encoding='utf8') 166 | 167 | for text_line in text_lines: 168 | my_file.write(text_line+"\n") 169 | 170 | my_file.close() 171 | 172 | @staticmethod 173 | def create_stringified_linearray(array_of_texts): 174 | final_string = "" 175 | for line_text in array_of_texts: 176 | final_string += line_text+"\n" 177 | 178 | final_string = final_string.strip() 179 | return final_string, final_string.replace("\n", " ") 180 | 181 | @staticmethod 182 | def strip_if_not_none(text, strip_pattern): 183 | if text is None: 184 | return text 185 | else: 186 | if strip_pattern != "": 187 | return text.strip(strip_pattern) 188 | else: 189 | return text.strip() 190 | 191 | @staticmethod 192 | def join_joined_lines(joined_texts, add_spaces=True): 193 | """ 194 | Takes the output from 'join_separated_lines' and joins the lines to one 195 | string 196 | :param joined_texts: array of texts 197 | :param add_spaces: add a space between joined texts 198 | :return: joined string 199 | """ 200 | return_text = "" 201 | 202 | for text in joined_texts: 203 | if add_spaces is True: 204 | return_text += " "+text 205 | else: 206 | return_text += text 207 | 208 | return_text = return_text.strip() 209 | 210 | return return_text 211 | 212 | 213 | @staticmethod 214 | def join_separated_lines(content_texts): 215 | """ 216 | Joins dash separated lines in the text list (reduces the number of entries, if 217 | there are such lines) 218 | :param content_texts: text list to join 219 | :return: text array where all dash separated lines are joined 220 | """ 221 | 222 | # final array with joined texts 223 | joined_texts = [] 224 | # intermediate array for storing tagged lines (normal line:0 or separator_line:1) 225 | NORMAL_LINE = 0 226 | SEPARATOR_LINE = 1 227 | LAST_LINE = 2 228 | 229 | tagged_texts = [] 230 | 231 | len_content_texts = len(content_texts) 232 | 233 | #if len_content_texts == 42: 234 | # print("asd") 235 | 236 | # iterate the given texts 237 | for text_index, text in enumerate(content_texts): 238 | if text is None: 239 | continue 240 | #if "Kommanditeinlagen" in text: 241 | # print("asd") 242 | 243 | # if there is one, get the follow up text 244 | next_text = None 245 | if text_index < len_content_texts - 1: 246 | next_text = content_texts[text_index + 1].strip() 247 | 248 | # detect line with separator 249 | if (len(text) >= 2 and "-" in text[-1]): 250 | line_ends_with_amount = False 251 | 252 | # 
this is a line which ends with a amount indicator like '6 500 000. -' 253 | # and therefore no separator 254 | if len(text) >= 3 and "-" in text[-1] and " " in text[-2] and "." in text[-3]: 255 | line_ends_with_amount = True 256 | elif len(text) >= 2 and "-" in text[-1] and "." in text[-2]: 257 | line_ends_with_amount = True 258 | elif len(text) >= 2 and "-" in text[-1] and text[-2].isdigit(): 259 | line_ends_with_amount = True # no amount, but similar case it's a timespan '1996-\n1997' or similar 260 | 261 | if not line_ends_with_amount and next_text is not None and len(next_text) >= 1: 262 | 263 | # if the next starting letter is uppercase don't do the joining (assuming it's a '-' 264 | # separated Name like "Jan-Phillipp") 265 | if not next_text[0].isupper(): 266 | # fetch the next text in current and remove separator 267 | text = text[0:len(text) - 1] 268 | # store in tagged texts 269 | tagged_texts.append((text, SEPARATOR_LINE)) 270 | continue 271 | 272 | if text_index >= len_content_texts: 273 | tagged_texts.append((text, LAST_LINE)) 274 | break 275 | 276 | # append to tagged texts 277 | tagged_texts.append((text, NORMAL_LINE)) 278 | 279 | # join the tagged texts 280 | 281 | for current_index, ttext_info in enumerate(tagged_texts): 282 | if ttext_info == None: 283 | continue # line was already joined 284 | 285 | current_ttext, current_id = ttext_info 286 | if current_id == NORMAL_LINE: 287 | joined_texts.append(current_ttext) 288 | elif current_id == SEPARATOR_LINE: 289 | # check all follow up lines 290 | for follow_up_index in range(current_index+1, len(tagged_texts)): 291 | follow_ttext, follow_id = tagged_texts[follow_up_index] 292 | current_ttext = current_ttext + follow_ttext 293 | tagged_texts[follow_up_index] = None 294 | if follow_id == NORMAL_LINE or follow_id == LAST_LINE: 295 | # update my new array 296 | joined_texts.append(current_ttext) 297 | break # done escape the inner loop 298 | elif follow_id == SEPARATOR_LINE: 299 | continue # continue inner loop 300 | 301 | # return the modified list 302 | return joined_texts 303 | 304 | @staticmethod 305 | def join_separated_lines_parenthesis(content_texts): 306 | next_lines_is_ending_parenthesis = False # indicator - 307 | next_closing_ordinal = -1 # indicator - the n-th closing parenthesis closes the previous block 308 | change = False 309 | final_entries = [] 310 | 311 | len_content_texts = len(content_texts) 312 | for text_index, text in enumerate(content_texts): 313 | 314 | # if there was a case detect add this line to the previous one instead of appending as new line 315 | if next_lines_is_ending_parenthesis: 316 | 317 | text_split = text.split(')') 318 | text_to_add = "" 319 | rest_text = "" 320 | # define next closing ordinal, sometimes overflow todo this is not 100% accurate 321 | used_closing_ordinal = 0 322 | if next_closing_ordinal > 0: 323 | used_closing_ordinal = next_closing_ordinal 324 | for tf_index, text_fragment in enumerate(text_split): 325 | if tf_index <= used_closing_ordinal: 326 | text_to_add += " " + text_fragment+")" 327 | else: 328 | if text_fragment.strip != "": 329 | # only add delimiters if not at end of split 330 | if tf_index == len(text_split)-1: 331 | rest_text += " " + text_fragment 332 | else: 333 | rest_text += " " + text_fragment+")" 334 | 335 | final_entries[-1] += " " + text_to_add.strip() # add until parenthesis end then go on 336 | next_lines_is_ending_parenthesis = False 337 | change = True # change debugging indicator 338 | # change current text to only rest 339 | text = 
rest_text.strip() 340 | #print(final_entries) 341 | if text == ")": 342 | continue 343 | 344 | # check if there is more opening parenthesis 345 | opening_parenthesis = text.count("(") 346 | closing_parenthesis = text.count(")") 347 | 348 | if opening_parenthesis <= closing_parenthesis: 349 | final_entries.append(text) 350 | continue 351 | 352 | # assign next text otherwise continue 353 | next_text = None 354 | if text_index+1 < len_content_texts: 355 | next_text = content_texts[text_index + 1] 356 | else: 357 | final_entries.append(text) 358 | continue 359 | 360 | next_opening_parentesis = next_text.count("(") 361 | next_closing_parenthesis = next_text.count(")") 362 | 363 | if next_closing_parenthesis == 0: 364 | final_entries.append(text) 365 | continue 366 | 367 | # if code ran until here the lines are a concat case 368 | final_entries.append(text) 369 | next_lines_is_ending_parenthesis = True 370 | next_closing_ordinal = opening_parenthesis-closing_parenthesis - next_closing_parenthesis 371 | 372 | #if change: 373 | # print("debug") 374 | 375 | return final_entries 376 | 377 | @staticmethod 378 | def filter_special_chars(text, remove_spaces=True): 379 | """ 380 | Remove special characters from input text 381 | :param text: input text 382 | :param remove_spaces: if true also removes spaces 383 | :return: filtered text 384 | """ 385 | 386 | if remove_spaces: 387 | text_filtered = re.sub('[^A-Za-z0-9]+', '', text) 388 | else: 389 | text_filtered = re.sub('[^A-Za-z0-9\s]+', '', text) 390 | 391 | return text_filtered -------------------------------------------------------------------------------- /lib/segment_classifier.py: -------------------------------------------------------------------------------- 1 | from akf_corelib.conditional_print import ConditionalPrint 2 | from akf_corelib.configuration_handler import ConfigurationHandler 3 | from lib.akf_segment_holder import SegmentHolder 4 | from lib.data_helper import DataHelper as dh 5 | import inspect 6 | 7 | class SegmentClassifier(object): 8 | """ 9 | This is the basic handler for classification 10 | which get's accessed from root/-outside classes. 
11 | """ 12 | 13 | def __init__(self): 14 | 15 | config_handler = ConfigurationHandler(first_init=False) 16 | 17 | self.config = config_handler.get_config() 18 | self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_CLASSIFIER, self.config.PRINT_EXCEPTION_LEVEL, 19 | self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__) 20 | self.cpr.print("init segment classifier") 21 | 22 | def classify_file_segments(self, ocromore_data): 23 | lines = ocromore_data['lines'] 24 | feats = ocromore_data['line_features'] 25 | file_info = ocromore_data['file_info'] 26 | all_file_segments = AllSegments(len(lines), self.cpr, self.config) 27 | 28 | prev_line = None 29 | prev_text = None 30 | for current_line_index, current_line in enumerate(lines): 31 | current_features = feats[current_line_index] 32 | current_text = current_line['text'] 33 | current_index = current_line['line_index'] 34 | # create a combined lined object with optimized (removed) separation 35 | combined_line = None 36 | if prev_line is not None: 37 | combined_lines = dh.join_separated_lines([prev_text, current_text]) 38 | combined_line = dh.join_joined_lines(combined_lines) 39 | else: 40 | combined_line = current_text 41 | # pass parameters to matching functions 42 | all_file_segments.match_my_segments(current_line, current_text, current_index, current_features, 43 | prev_line, combined_line) 44 | prev_line = current_line 45 | prev_text = current_text 46 | 47 | 48 | 49 | 50 | if self.config.MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION: 51 | self.adapt_non_explicit_indices(all_file_segments) 52 | else: 53 | all_file_segments.correct_overlaps_index_field(only_start_tags=True) 54 | 55 | self.adapt_stop_index_in_last_segment(all_file_segments) 56 | 57 | 58 | # does the last steps in segment matching 59 | all_file_segments.finish_segment_matching(lines, feats, file_info) 60 | 61 | # do again after final step 62 | if self.config.MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION: 63 | self.adapt_non_explicit_indices(all_file_segments) 64 | else: 65 | all_file_segments.correct_overlaps_index_field(only_start_tags=True) 66 | 67 | self.adapt_stop_index_in_last_segment(all_file_segments) 68 | 69 | 70 | 71 | 72 | ocromore_data['segmentation'] = all_file_segments 73 | 74 | return ocromore_data 75 | 76 | 77 | def adapt_stop_index_in_last_segment(self, all_file_segments): 78 | """ 79 | Sets the stop_index for the last recognized segment, which 80 | is a special case and is usually not filled beforehand, because 81 | there is no next start index 82 | :param all_file_segments: holder object for segment classes and other info 83 | :return: None 84 | """ 85 | 86 | # search for last segment 87 | saved_start_index = -1 88 | saved_last_segment = None 89 | for segment in all_file_segments.my_classes: 90 | # only count segmented segments 91 | if segment.start_was_segmented is False: 92 | continue 93 | 94 | if segment.start_line_index >= saved_start_index: 95 | saved_start_index = segment.start_line_index 96 | saved_last_segment = segment 97 | 98 | if saved_last_segment is None: 99 | return 100 | 101 | # adapt the last stop index of last segment 102 | saved_last_segment.stop_line_index = all_file_segments.number_of_lines-1 103 | saved_last_segment.stop_was_segmented = True # todo think about if this is necessary? 
104 | 
105 | 
106 | 
107 | 
108 | 
109 |     def adapt_non_explicit_indices(self, all_file_segments):
110 | 
111 |         # update start and explicit stop tags first
112 |         all_file_segments.correct_overlaps_index_field(only_start_tags=True)
113 | 
114 |         # fill undefined stop regions until the next start region
115 |         all_file_segments.fill_start_index_until_next_stop()
116 | 
117 | 
118 | class AllSegments(object):
119 |     """
120 |     Accessor class for the segmentation of a file
121 |     """
122 | 
123 |     def __init__(self, number_of_lines, cpr, config):
124 |         # init all internal-classification classes
125 |         self.index_field = []
126 |         self.my_classes = []
127 |         self.my_only_indices = []
128 |         self.instantiate_classification_classes()
129 |         self.number_of_lines = number_of_lines
130 |         self.initialize_index_field(number_of_lines)
131 |         self.cpr = cpr
132 |         self.config = config
133 |         self.get_only_classes()
134 | 
135 |     def get_only_classes(self):
136 |         """
137 |         Gets all classes which are tagged by the 'only' flag
138 |         :return:
139 |         """
140 |         for segment_index, segment_class in enumerate(self.my_classes):
141 |             if segment_class.only is True:
142 |                 self.my_only_indices.append(segment_index)
143 | 
144 |         if len(self.my_only_indices) >= 1:
145 |             self.cpr.print("using only indices, since there is at least one class set to only")
146 | 
147 |     def initialize_index_field(self, number_of_lines):
148 |         self.index_field = []
149 | 
150 |         for ctr in range(0, number_of_lines):
151 |             self.index_field.append(False)
152 | 
153 |     def correct_overlaps_index_field(self, only_start_tags=False):
154 |         """
155 |         Debugging function to correct areas where a stop tag overlaps the next start tag
156 |         Attention: This reinitializes (overwrites) the existing index field
157 |         :return:
158 |         """
159 | 
160 |         # reinitialize index field
161 |         self.initialize_index_field(self.number_of_lines)
162 | 
163 |         # iterate the classes - the 'only' filter is not applied here, because this is meant for bigger sets of classes
164 |         for segment_class_index, segment_class in enumerate(self.my_classes):
165 |             if not segment_class.enabled:
166 |                 continue
167 |             # todo check here ok ?
168 |             self.update_index_field(segment_class, only_start_tags=True)
169 | 
170 |         if only_start_tags is True:
171 |             return self
172 | 
173 |         # iterate again and update the stop tags in a manner that they are only updated until the next start tag
174 |         for segment_class_index, segment_class in enumerate(self.my_classes):
175 |             if not segment_class.enabled:
176 |                 continue
177 |             if not segment_class.is_start_segmented():
178 |                 continue
179 | 
180 |             self.update_stop_tags(segment_class)
181 | 
182 | 
183 |         return self
184 | 
185 |     def fill_start_index_until_next_stop(self):
186 |         """
187 |         Fills each segment's start up to the next segment's stop, if it has no explicitly defined stop tag
188 |         Adapts the index field and the segment stop properties
189 |         :return:
190 |         """
191 |         for segment_class_index, segment_class in enumerate(self.my_classes):
192 |             if not segment_class.enabled:
193 |                 continue
194 |             if segment_class.is_start_segmented() is False:
195 |                 # the segment wasn't found at all, so no filling needed
196 |                 continue
197 |             if segment_class.is_stop_segmented() is True:
198 |                 # class already has a stop and therefore doesn't need to be filled
199 |                 continue
200 | 
201 |             # search until the next found tag
202 |             for index in range(segment_class.start_line_index+1, len(self.index_field)):
203 |                 current_field_item = self.index_field[index]
204 |                 if current_field_item is not False:
205 |                     # the next item begins, done with filling
206 |                     segment_class.set_stop_segmented(index-1)  # toggles stop_segmented, sets index
207 |                     break
208 |                 else:
209 |                     # field item is False, fill it with the current segment tag
210 |                     self.index_field[index] = segment_class.segment_tag
211 | 
212 | 
213 |     def update_index_field(self, segmentation_class, only_start_tags=False):
214 |         segment_tag = segmentation_class.segment_tag
215 |         start_line_index = segmentation_class.start_line_index
216 |         stop_line_index = segmentation_class.stop_line_index
217 | 
218 |         # if no start condition is set - no update
219 |         if start_line_index == -1:
220 |             return
221 | 
222 |         # if there is a start condition but no end condition, just update the first line
223 |         if stop_line_index == -1:
224 |             stop_line_index = start_line_index + 1
225 | 
226 |         # fix some index glitches
227 |         if start_line_index > stop_line_index:
228 |             stop_line_index = start_line_index
229 | 
230 |         if start_line_index == stop_line_index:
231 |             stop_line_index = start_line_index + 1
232 | 
233 |         # special option for debugging purposes
234 |         if only_start_tags is True:
235 |             stop_line_index = start_line_index
236 | 
237 |         for index in range(start_line_index, stop_line_index+1):
238 |             self.index_field[index] = segment_tag
239 | 
240 |     def update_stop_tags(self, segmentation_class):
241 |         segment_tag = segmentation_class.segment_tag
242 |         start_line_index = segmentation_class.start_line_index
243 |         stop_line_index = segmentation_class.stop_line_index
244 |         index_field_len = len(self.index_field)
245 | 
246 | 
247 | 
248 |         for index in range(start_line_index+1, index_field_len):
249 | 
250 |             # update until the next defined field appears
251 |             if self.index_field[index] is not False:
252 |                 break
253 | 
254 |             self.index_field[index] = segment_tag
255 | 
256 |     def instantiate_classification_classes(self):
257 |         dict_test = SegmentHolder.__dict__.items()
258 | 
259 |         for key, value in dict_test:
260 |             if inspect.isclass(value):
261 |                 my_instance = value()
262 |                 self.my_classes.append(my_instance)
263 | 
264 |     def finish_segment_matching(self, lines, feats, file_info):
265 |         """
266 |         Final step in segmentation; covers special segmentation cases which can,
267 |         for example, only be done after everything else is segmented.
268 |         :param lines: text line objects of the file
269 |         :param feats: line feature objects
270 |         :param file_info: file info object of the current file
271 |         :return:
272 |         """
273 | 
274 |         # special case: match firmenname at the end
275 |         for segment_class_index, segment_class in enumerate(self.my_classes):
276 |             if not isinstance(segment_class, SegmentHolder.SegmentFirmenname):
277 |                 continue  # only firmenname is handled here, it is matched at the very end
278 | 
279 |             start_updated = segment_class.match_start_condition(lines, lines, self.index_field, feats, len(lines), file_info, None)
280 | 
281 |             start_updated = False  # note: overrides the match result above, the final firmenname update is disabled here
282 |             if start_updated:
283 |                 # there was a change -> update the indices fields
284 |                 self.update_index_field(segment_class)
285 | 
286 |             break  # this only occurs once
287 | 
288 | 
289 |     # overall function that iterates over all matches
290 |     def match_my_segments(self, line, line_text, line_index, features, prev_line, combined_line):
291 | 
292 |         # 'only'-tagged class usage
293 |         using_only_classes = False
294 |         if len(self.my_only_indices) >= 1:
295 |             using_only_classes = True
296 | 
297 |         # iterate classes
298 |         for segment_class_index, segment_class in enumerate(self.my_classes):
299 |             if not segment_class.enabled:
300 |                 continue
301 | 
302 |             if using_only_classes:
303 |                 # if at least one class was tagged 'only', skip all classes that are not tagged 'only'
304 |                 if segment_class_index not in self.my_only_indices:
305 |                     continue
306 | 
307 | 
308 |             if isinstance(segment_class, SegmentHolder.SegmentFirmenname):
309 |                 continue  # skip firmenname at firsthand, this will be matched in the end
310 | 
311 | 
312 |             start_updated = False
313 |             stop_updated = False
314 | 
315 | 
316 |             if self.config.REMATCH_START_CONDITION_UNTIL_ZERO_ERROR is True:
317 |                 # do segmenting until an error rate of zero is reached
318 |                 start_error_number_before_match = segment_class.get_start_error_number()
319 |                 if not segment_class.is_start_segmented() or segment_class.get_start_error_number() >= 1:
320 |                     start_updated = segment_class.match_start_condition(line, line_text, line_index, features,
321 |                                                                         self.number_of_lines, prev_line, combined_line)
322 |                     start_error_number_after_match = segment_class.get_start_error_number()
323 |                     if start_error_number_before_match <= start_error_number_after_match:
324 |                         # only update if the recognized error number is lower
325 |                         start_updated = False
326 | 
327 |                 stop_error_number_before_match = segment_class.get_stop_error_number()
328 |                 if not segment_class.is_stop_segmented() or segment_class.get_stop_error_number() >= 1:
329 |                     stop_updated = segment_class.match_stop_condition(line, line_text, line_index, features,
330 |                                                                       self.number_of_lines, prev_line, combined_line)
331 |                     stop_error_number_after_match = segment_class.get_stop_error_number()
332 |                     if stop_error_number_before_match <= stop_error_number_after_match:
333 |                         # only update if the recognized error number is lower
334 |                         stop_updated = False
335 | 
336 |             else:
337 |                 # just hit the first match and stop matching then -> standard mode
338 |                 if not segment_class.is_start_segmented():
339 |                     start_updated = segment_class.match_start_condition(line, line_text, line_index, features,
340 |                                                                         self.number_of_lines, prev_line, combined_line)
341 |                 if not segment_class.is_stop_segmented():
342 |                     stop_updated = segment_class.match_stop_condition(line, line_text, line_index, features,
343 |                                                                       self.number_of_lines, prev_line, combined_line)
344 | 
345 |             if start_updated or stop_updated:
346 | 
347 |                 if stop_updated:
348 |                     start_line_index = segment_class.start_line_index
349 |                     stop_line_index = segment_class.stop_line_index
350 |                     for segment in self.my_classes:
351 |                         if type(segment) == type(segment_class):
352 |                             continue
353 |                         current_start_line_index = segment.start_line_index
354 |                         current_stop_line_index = segment.stop_line_index
355 | 
356 |                         if current_start_line_index != -1 and (current_start_line_index >= start_line_index and current_start_line_index <= stop_line_index):
357 |                             segment.set_start_segmented(-1)
358 |                             segment.start_was_segmented = False
359 |                         if current_stop_line_index != -1 and (current_stop_line_index >= start_line_index and current_stop_line_index <= stop_line_index):
360 |                             segment.set_stop_segmented(-1)
361 |                             segment.stop_was_segmented = False
362 | 
363 | 
364 |                 # there was a change -> update the indices fields
365 |                 self.update_index_field(segment_class)
366 | 
367 | 
368 | 
369 | 
--------------------------------------------------------------------------------
/lib/akf_parsing_functions_one.py:
--------------------------------------------------------------------------------

1 | from akf_corelib.conditional_print import ConditionalPrint
2 | from akf_corelib.configuration_handler import ConfigurationHandler
3 | from .data_helper import DataHelper as dh
4 | from .akf_parsing_functions_common import AKFCommonParsingFunctions as cf
5 | 
6 | import regex
7 | 
8 | 
9 | class AkfParsingFunctionsOne(object):
10 | 
11 |     def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
12 |         config_handler = ConfigurationHandler(first_init=False)
13 | 
14 |         self.config = config_handler.get_config()
15 |         self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_ONE, self.config.PRINT_EXCEPTION_LEVEL,
16 |                                     self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
17 | 
18 |         self.cpr.print("init akf parsing functions one")
19 | 
20 |         self.ef = endobject_factory
21 |         self.output_analyzer = output_analyzer
22 |         self.dictionary_handler = dictionary_handler
23 | 
24 | 
25 |     def parse_firmenname(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
26 |         # get basic data
27 |         element_counter = 0
28 | 
29 |         origpost, origpost_red, element_counter, content_texts = \
30 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
31 | 
32 |         # get relevant info
33 |         accumulated_text = ""
34 |         for text in content_texts:
35 |             accumulated_text += " " + text
36 | 
37 |         only_add_if_value = False
38 |         accumulated_text = accumulated_text.strip()
39 |         self.ef.add_to_my_obj("Firmenname", accumulated_text, object_number=element_counter, only_filled=only_add_if_value)
40 | 
41 | 
42 |     def parse_sitz(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
43 |         """
44 |         "Sitz": [
45 |             {
46 |                 "origpost": "Mergenthalerallee 79-81, 65760 Eschborn Telefon:(069) 7 50 06-0 Telefax:(069) 7 50 06-111 e-mail:info@3u.net Internetseite:http://www.3u.net ",
47 |                 "type": "Sitz",
48 |                 "street": "Mergenthalerallee",
49 |                 "street_number": "79-81",
50 |                 "zip": "65760",
51 |                 "city": "Eschborn",
52 |                 "phone": "(069) 7 50 06-0",
53 |                 "fax": "(069) 7 50 06-111",
54 |                 "email": [
55 |                     "info@3u.net"
56 |                 ],
57 |                 "www": [
58 |                     "http://www.3u.net"
59 |                 ]
60 |             }
61 |         ],
62 |         """
63 |         # get basic data
64 |         element_counter = 0
65 | 
66 |         origpost, origpost_red, element_counter, content_texts = \
67 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
68 | 
69 |         # get relevant info
70 |         num_id, city, street, street_number, additional_info = cf.parse_id_location(origpost_red)
71 | 
72 |         # add stuff to ef
73 |         only_add_if_value = True
74 |         self.ef.add_to_my_obj("numID", num_id, object_number=element_counter, only_filled=only_add_if_value)
75 |         self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_value)
76 |         self.ef.add_to_my_obj("street", street, object_number=element_counter, only_filled=only_add_if_value)
77 |         self.ef.add_to_my_obj("street_number", street_number, object_number=element_counter, only_filled=only_add_if_value)
78 |         self.ef.add_to_my_obj("additional_info", additional_info, object_number=element_counter, only_filled=only_add_if_value)
79 | 
80 |         return True
81 | 
82 |     def parse_verwaltung(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
83 | 
84 |         # get basic data
85 |         element_counter = 0
86 |         origpost, origpost_red, element_counter, content_texts = \
87 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
88 | 
89 |         # logme
90 |         # self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)
91 | 
92 |         if "srat" in real_start_tag:
93 |             # Verwaltungsrat ..
94 |             persons_final = cf.parse_persons(origpost_red, self.dictionary_handler,
95 |                                              self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
96 |             only_add_if_filed = True
97 |             for entry in persons_final:
98 |                 name, first_name, last_name, city, title, funct, rest_info = entry
99 |                 self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled=only_add_if_filed)
100 |                 self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter,
101 |                                       only_filled=only_add_if_filed)
102 |                 self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter,
103 |                                       only_filled=only_add_if_filed)
104 | 
105 |                 self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_filed)
106 |                 self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_filed)
107 |                 self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled=only_add_if_filed)
108 |                 self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
109 | 
110 |                 element_counter += 1
111 |             return True
112 |         elif "Verw." in real_start_tag:
113 |             # Verw.
114 |             num_id, city, street, street_number, additional_info = cf.parse_id_location(origpost_red)
115 | 
116 |             # add stuff to ef
117 |             only_add_if_value = True
118 |             self.ef.add_to_my_obj("numID", num_id, object_number=element_counter, only_filled=only_add_if_value)
119 |             self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_value)
120 |             self.ef.add_to_my_obj("street", street, object_number=element_counter, only_filled=only_add_if_value)
121 |             self.ef.add_to_my_obj("street_number", street_number, object_number=element_counter,
122 |                                   only_filled=only_add_if_value)
123 |             self.ef.add_to_my_obj("additional_info", additional_info, object_number=element_counter,
124 |                                   only_filled=only_add_if_value)
125 | 
126 |             return True
127 |         else:
128 |             # Verwaltung
129 |             final_items = cf.parse_general_and_keys(content_texts,
130 |                                                     join_separated_lines=False,
131 |                                                     current_key_initial_value="General_Info")
132 |             for key in final_items.keys():
133 |                 value = final_items[key]
134 |                 if value is None or value == "":
135 |                     continue
136 |                 self.ef.add_to_my_obj(key, value, object_number=element_counter, only_filled=True)
137 |                 element_counter += 1
138 |             return True
139 | 
140 |     def parse_telefon_fernruf(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
141 | 
142 |         # get basic data
143 |         origpost, origpost_red, element_counter, content_texts = cf.add_check_element(self, content_texts,
144 |                                                                                       real_start_tag, segmentation_class, 0)
145 |         # do special match: Verwaltung und Betriebshof
146 |         split_post = []
147 | 
148 |         match_special = regex.match(r"(?P<Verw>Verwaltung.*)"
149 |                                     r"(?P<Betr>Betriebshof.*)"
150 |                                     , origpost_red)
151 |         if match_special:
152 |             betriebshof = match_special.group("Betr")
153 |             verwaltung = match_special.group("Verw")
154 |             origpost_red = origpost_red.replace(betriebshof, "")
155 |             origpost_red = origpost_red.replace(verwaltung, "")
156 |             split_post.append(betriebshof)
157 |             split_post.append(verwaltung)
158 |         # do special match: Ortsgespräche and Ferngespräche
159 | 
160 |         match_special2 = regex.match(r"(?P<og>Ortsgespräche.*)"
161 |                                      r"(?P<fg>Ferngespräche.*)"
162 |                                      , origpost_red)
163 |         if match_special2:
164 |             ortsgespr = match_special2.group("og")
165 |             ferngespr = match_special2.group("fg")
166 |             origpost_red = origpost_red.replace(ortsgespr, "")
167 |             origpost_red = origpost_red.replace(ferngespr, "")
168 |             split_post.append(ortsgespr)
169 |             split_post.append(ferngespr)
170 | 
171 | 
172 | 
173 |         # do special match: Ortsverkehr and Fernverkehr
174 | 
175 |         match_special3 = regex.match(r"(?P<ov>Ortsverkehr.*)"
176 |                                      r"(?P<fv>Fernverkehr.*)"
177 |                                      , origpost_red)
178 |         if match_special3:
179 |             ortsverkehr = match_special3.group("ov")
180 |             fernverkehr = match_special3.group("fv")
181 |             origpost_red = origpost_red.replace(ortsverkehr, "")
182 |             origpost_red = origpost_red.replace(fernverkehr, "")
183 |             split_post.append(ortsverkehr)
184 |             split_post.append(fernverkehr)
185 | 
186 |         # do special match: check if only numbers
187 |         origpost_red_new = origpost_red
188 | 
189 |         test_split = regex.split(r"\su\.|\sund\s|,|;", origpost_red)
190 |         for number in test_split:
191 |             # additional parenthesis block
192 |             match_parenthesis = regex.search(r"\(.*\)", number)
193 |             parenthesis = None
194 |             if match_parenthesis:
195 |                 parenthesis = match_parenthesis.group()
196 |                 number = number.replace(parenthesis, "")  # remove the vorwahl part from the number string
197 |                 self.ef.add_to_my_obj("vorwahl", parenthesis, object_number=element_counter, only_filled=True)
198 | 
199 | 
200 |             match_word_num = regex.search(r"(?P<word>[^\d]*)(?P<num>[\d\s\-/]*)", number)
201 |             if match_word_num is None:
202 |                 continue
203 | 
204 |             word = match_word_num.group("word")
205 |             num = match_word_num.group("num")
206 |             if "Sa." in word and "Nr" in word:
207 |                 continue
208 |             number_stripped = num.strip(" ./").replace("/", "").replace("-", "").replace(" ", "")
209 |             if number_stripped.isdigit():
210 |                 origpost_red_new = origpost_red_new.replace(number, "")  # remove number
211 |                 origpost_red_new = origpost_red_new.replace(word, "")  # remove the word found
212 | 
213 |                 change1 = self.ef.add_to_my_obj("number_Sa.-Nr.", num.strip(), object_number=element_counter, only_filled=True)
214 |                 change2 = self.ef.add_to_my_obj("location", word.strip(), object_number=element_counter, only_filled=True)
215 |                 if change1 or change2:
216 |                     element_counter += 1
217 | 
218 | 
219 | 
220 | 
221 |         origpost_red = origpost_red_new
222 |         # substitute in a separator char to integrate delimiters in the next step
223 |         origpost_red = regex.sub(r"(\d\.)", r"\1~~~~", origpost_red)
224 | 
225 |         # do further matches (semicolon-separated)
226 |         split_post.extend(regex.split(r";|~~~~|\su\.", origpost_red))
227 | 
228 |         for index, entry in enumerate(split_post):
229 |             if entry is None:
230 |                 continue
231 |             entry_stripped = entry.strip()
232 |             if entry_stripped == "":
233 |                 continue
234 | 
235 |             # additional parenthesis block
236 |             match_parenthesis = regex.search(r"\(.*\)", entry_stripped)
237 |             parenthesis = None
238 |             if match_parenthesis:
239 |                 parenthesis = match_parenthesis.group()
240 |                 entry_stripped = entry_stripped.replace(parenthesis, "")  # remove the vorwahl part from the entry
241 |                 self.ef.add_to_my_obj("vorwahl", parenthesis, object_number=element_counter, only_filled=True)
242 | 
243 | 
244 | 
245 |             match_word = regex.match(r"(?P<Tag>\D*)"
246 |                                      r"(?P<Numbers>[\d\s\W]*)"
247 |                                      , entry_stripped)
248 |             if match_word is not None:
249 |                 # fetch match results
250 |                 tag_match = match_word.group("Tag")
251 |                 numbers_match = match_word.group("Numbers")
252 |                 rest_from_entry_str = entry_stripped.replace(tag_match, "", 1)
253 |                 rest_from_entry_str = rest_from_entry_str.replace(numbers_match, "", 1)
254 | 
255 |                 tag = dh.strip_if_not_none(tag_match, "")
256 |                 match_tag = regex.match(r"(?P<rest_bef>.*)(?P<sanr>Sa\.?\-Nr\.?)(?P<rest_end>.*)", tag)
257 |                 location = ""
258 |                 if match_tag is not None:
259 |                     rest_tag = match_tag.group('rest_bef')
260 |                     rest_tag_2 = match_tag.group('rest_end')
261 |                     # sanr = match_tag.group('sanr')  # this is the filtered group
262 |                     location = dh.strip_if_not_none(rest_tag + " " + rest_tag_2, ":., ")
263 |                 else:
264 |                     # if there are no real descriptors in tag, then tag is usually the location (like Düsseldorf 1 36 62.)
265 |                     location = tag
266 | 
267 |                 if "und" in location:
268 |                     location = regex.sub(r"[^\w]und[^\w]", "", location)
269 | 
270 |                 number = dh.strip_if_not_none(numbers_match, "., ")
271 |                 self.ef.add_to_my_obj("number_Sa.-Nr.", number.strip(), object_number=element_counter, only_filled=True)
272 |                 self.ef.add_to_my_obj("location", location.strip(), object_number=element_counter, only_filled=True)
273 |                 additional_info_entry_level = dh.strip_if_not_none(rest_from_entry_str, ",. ")
274 |                 self.ef.add_to_my_obj("additional_info", additional_info_entry_level.strip(),
275 |                                       object_number=element_counter, only_filled=True)
276 |                 element_counter += 1
277 | 
278 |                 origpost_red = origpost_red.replace(number, "", 1)
279 |                 origpost_red = origpost_red.replace(location, "", 1)
280 | 
281 |         origpost_red = origpost_red.replace("Sa.-Nr", "").replace("~~~~", "")
282 |         origpost_red_end = dh.remove_multiple_outbound_chars(origpost_red)
283 | 
284 |         if len(origpost_red_end) > 3:
285 |             self.ef.add_to_my_obj("additional_info_unparsed", origpost_red_end.strip(), object_number=element_counter)
286 | 
287 |     def parse_vorstand(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
288 | 
289 |         # get basic data
290 |         element_counter = 0
291 |         origpost, origpost_red, element_counter, content_texts = \
292 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
293 | 
294 |         persons_final = cf.parse_persons(origpost_red, self.dictionary_handler,
295 |                                          self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
296 | 
297 |         only_add_if_filed = True
298 |         for entry in persons_final:
299 |             name, first_name, last_name, city, title, funct, rest_info = entry
300 |             self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled=only_add_if_filed)
301 |             self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter,
302 |                                   only_filled=only_add_if_filed)
303 |             self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter, only_filled=only_add_if_filed)
304 |             self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_filed)
305 |             self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_filed)
306 |             self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled=only_add_if_filed)
307 |             self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
308 |             element_counter += 1
309 | 
310 |         # (a former ';'-separated parsing fallback was removed here; person
311 |         #  parsing is done via cf.parse_persons above)
312 | 
313 |         return True
314 | 
315 |     def parse_aufsichtsrat(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
316 | 
317 |         # get basic data
318 |         element_counter = 0
319 |         origpost, origpost_red, element_counter, content_texts = \
320 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
321 | 
322 |         # try to fix +) problems
323 |         origpost_red = origpost_red.replace("; +)", "+);").replace(";+)", "+);").replace("')", "").replace("*)", "")
324 | 
325 |         persons_final = cf.parse_persons(origpost_red, self.dictionary_handler, self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
326 | 
327 |         only_add_if_filed = True
328 |         for entry in persons_final:
329 |             name, first_name, last_name, city, title, funct, rest_info = entry
330 |             self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled=only_add_if_filed)
331 |             self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter, only_filled=only_add_if_filed)
332 |             self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter, only_filled=only_add_if_filed)
333 |             self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_filed)
334 |             self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_filed)
335 |             self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled=only_add_if_filed)
336 |             self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
337 |             element_counter += 1
338 | 
339 | 
340 |         return True
341 | 
342 |     def parse_arbeitnehmervertreter(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
343 |         # get basic data
344 |         element_counter = 0
345 |         origpost, origpost_red, element_counter, content_texts = \
346 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
347 | 
348 |         persons_final = cf.parse_persons(origpost_red, self.dictionary_handler, self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
349 |         only_add_if_filed = True
350 |         for entry in persons_final:
351 |             name, first_name, last_name, city, title, funct, rest_info = entry
352 |             self.ef.add_to_my_obj("name", name, object_number=element_counter, only_filled=only_add_if_filed)
353 |             self.ef.add_to_my_obj("first_name", first_name, object_number=element_counter, only_filled=only_add_if_filed)
354 |             self.ef.add_to_my_obj("last_name", last_name, object_number=element_counter, only_filled=only_add_if_filed)
355 |             self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_filed)
356 |             self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_filed)
357 |             self.ef.add_to_my_obj("rest", rest_info, object_number=element_counter, only_filled=only_add_if_filed)
358 |             self.ef.add_to_my_obj("funct", funct, object_number=element_counter, only_filled=only_add_if_filed)
359 | 
360 |             element_counter += 1
361 | 
362 |         return True
363 | 
364 |     # Gruendung
365 |     def parse_gruendung(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
366 |         # get basic data
367 |         element_counter = 0
368 |         origpost, origpost_red, element_counter, content_texts = \
369 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
370 |         # note: '^\d+' instead of '^\d*', so the else branch is reachable when no leading year is found
371 |         match_year = regex.search(r"^\d+", origpost_red.strip())
372 |         if match_year:
373 |             result = match_year.group()
374 |             origpost_red_new = origpost_red.replace(result, "", 1)
375 |             year = dh.strip_if_not_none(result, ".,() ")
376 |             rest_info = dh.strip_if_not_none(origpost_red_new, ".,() ")
377 |             self.ef.add_to_my_obj("rest_info", rest_info, object_number=element_counter, only_filled=True)
378 |             self.ef.add_to_my_obj("year", year, object_number=element_counter, only_filled=True)
379 |         else:
380 |             rest_info = dh.strip_if_not_none(origpost_red, ".,() ")
381 |             self.ef.add_to_my_obj("rest_info", rest_info, object_number=element_counter, only_filled=True)
382 | 
383 |     # Tätigkeitsgebiet
384 |     def parse_taetigkeitsgebiet(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
385 |         # get basic data
386 |         element_counter = 0
387 |         origpost, origpost_red, element_counter, content_texts = \
388 |             cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
389 | 
390 |         final_items = cf.parse_general_and_keys(content_texts,
391 |                                                 join_separated_lines=False,
392 |                                                 current_key_initial_value="General_Info")
393 | 
394 |         for key in final_items.keys():
395 |             value = final_items[key]
396 |             if value is None or len(value) == 0:
397 |                 continue
398 |             self.ef.add_to_my_obj(key, value, object_number=element_counter, only_filled=True)
399 |             element_counter += 1
--------------------------------------------------------------------------------