├── GPT_models
│   ├── follow_up_q.py
│   └── models.py
├── README.md
├── data
│   ├── non_split
│   │   ├── table_html
│   │   │   └── example_tbl01.html
│   │   └── table_json
│   │       └── example_tbl01.json
│   ├── pickle_folder
│   │   └── example_tbl01.pickle
│   └── split
│       ├── json_representation
│       │   ├── example_tbl01_01.json
│       │   ├── example_tbl01_02.json
│       │   ├── example_tbl01_03.json
│       │   ├── example_tbl01_04.json
│       │   ├── example_tbl01_05.json
│       │   ├── example_tbl01_06.json
│       │   ├── example_tbl01_07.json
│       │   ├── example_tbl01_08.json
│       │   ├── example_tbl01_09.json
│       │   ├── example_tbl01_10.json
│       │   ├── example_tbl01_11.json
│       │   ├── example_tbl01_12.json
│       │   └── example_tbl01_13.json
│       ├── table_split_html
│       │   ├── example_tbl01_01.html
│       │   ├── example_tbl01_02.html
│       │   ├── example_tbl01_03.html
│       │   ├── example_tbl01_04.html
│       │   ├── example_tbl01_05.html
│       │   ├── example_tbl01_06.html
│       │   ├── example_tbl01_07.html
│       │   ├── example_tbl01_08.html
│       │   ├── example_tbl01_09.html
│       │   ├── example_tbl01_10.html
│       │   ├── example_tbl01_11.html
│       │   ├── example_tbl01_12.html
│       │   └── example_tbl01_13.html
│       ├── table_split_json
│       │   ├── example_tbl01_01.json
│       │   ├── example_tbl01_02.json
│       │   ├── example_tbl01_03.json
│       │   ├── example_tbl01_04.json
│       │   ├── example_tbl01_05.json
│       │   ├── example_tbl01_06.json
│       │   ├── example_tbl01_07.json
│       │   ├── example_tbl01_08.json
│       │   ├── example_tbl01_09.json
│       │   ├── example_tbl01_10.json
│       │   ├── example_tbl01_11.json
│       │   ├── example_tbl01_12.json
│       │   └── example_tbl01_13.json
│       └── tsv_representation
│           ├── example_tbl01_01.txt
│           ├── example_tbl01_02.txt
│           ├── example_tbl01_03.txt
│           ├── example_tbl01_04.txt
│           ├── example_tbl01_05.txt
│           ├── example_tbl01_06.txt
│           ├── example_tbl01_07.txt
│           ├── example_tbl01_08.txt
│           ├── example_tbl01_09.txt
│           ├── example_tbl01_10.txt
│           ├── example_tbl01_11.txt
│           ├── example_tbl01_12.txt
│           └── example_tbl01_13.txt
├── input_generation_script.json
├── model_evaluation
│   ├── evaluation.py
│   └── utils
│       ├── __pycache__
│       │   ├── functions.cpython-37.pyc
│       │   ├── get_keys.cpython-37.pyc
│       │   ├── get_keys_function.cpython-37.pyc
│       │   └── get_keys_function.cpython-38.pyc
│       └── functions.py
├── model_script.json
├── requirements_conda.txt
├── requirements_pip.txt
├── run.py
├── table_representation
│   ├── __pycache__
│   │   ├── table2json_upload.cpython-37.pyc
│   │   └── table_representer_upload.cpython-37.pyc
│   ├── table2json.py
│   └── table_representer.py
└── table_splitting
    ├── __pycache__
    │   └── split_table_.cpython-37.pyc
    └── split_table.py
-------------------------------------------------------------------------------- /GPT_models/follow_up_q.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import openai 4 | import pandas as pd 5 | 6 | openai.api_key = "your api key" 7 | 8 | class FollowQ: 9 | def __init__(self, json_path, representation_path, save_path): 10 | self.json_path = json_path 11 | self.representation_path = representation_path 12 | self.save_path = save_path 13 | 14 | # Questions related to catalysts 15 | self.questions_for_catalyst = { 16 | "1_list": ["representation", "Question 1. Please tell me the names of catalysts in the contents within the tags of input representation into a Python list. Give me the names of catalysts ONLY. Only output the Python list(like string).\n\n"], 17 | "2_list": ["json", "Question 2. Please tell me the names of catalysts from the input json provided by me into a Python list. Only output the Python list(like string).\n\n"], 18 | "3_list": ["None", "Question 3. Based on the answer to Question 1, modify or remove any catalysts from the answer to Question 2 and provide the updated list in Python. Give me the names of catalysts ONLY.
Only output the Python list.(like string)\n\n"] 19 | } 20 | 21 | # Questions related to performance 22 | self.questions_for_performance = { 23 | "1_list": ["json", "Question 1. Inform me about what performance type does {catalyst_name} have in the input json I provided? Only output the Python list.(like string)\n\n"] 24 | } 25 | 26 | # Questions related to properties 27 | self.questions_for_property = { 28 | "1_both": ["json", "Question 1. Provide detailed information about all sublayers of the {element} of {catalyst_name} in input json. Remove keys from the dictionary that do not have a value. Present it in either Python list or JSON format. If the {element} is not 'loading', strictly provide it in Python list or JSON format.(like string not ```python and not ```json)\n\n"], 29 | "2_str": ["None", "Question 2. If there is any occurrence of 'NA', 'na', 'unknown', or similar content in your recent response, Respond with yes or no. You must answer strictly with yes or no.\n\n"], 30 | "3_dict": ["None", "Question 3. In the answer to question 1, remove any parts corresponding to 'NA', 'na', 'unknown', or similar contents. Show the modified JSON. Only display the JSON. (like string not ```json)\n\n"], 31 | "4_list": ["representation", "Question 4. Based on the input representation, provide values of the {element} of the {catalyst_name} as a Python list. If there is a unit, please provide strictly the value including the unit. The elements of a Python list must be composed of value plus unit. Only output the Python list.(like string not ```python)\n\n"], 32 | "5_list": ["None", "Question 5. Based on the answer to question 3, provide values of the '''value''' key of the sublayers of the {element} as a Python list. Only output the Python list.(like string not ```python)\n\n"], 33 | "6_list": ["None", "Question 6. Based on only numerical values, provide a list of elements that exist in the answer to Question 5 but are not present in the the answer to Question 4. Note that unit differences can be ignored if the numbers match. Only output the Python list.(like string not ```python)\n\n"], 34 | "7_dict": ["json", "Question 7. If elements included in the list that is the answer to question 6 are in the answer to question 1, remove the sub-dictionary containing those elements from the json I provided. If the answer to 6 is a list containing elements, be sure to delete it from json. Show the modified JSON after removal. Only display the JSON. (like string not ```json)\n\n"], 35 | "8_dict": ["json", "Question 8. Please tell me the final modified json of {catalyst_name} by reflecting the answer to question 7 in the json I provided. Only output the JSON of {catalyst_name}. the catalyst_name is {catalyst_name}. The first key of the dictionary should be {catalyst_name}. Remove keys from the dictionary that do not have a value. (like string not ```json)"] 36 | } 37 | 38 | # Questions related to electrolyte, reaction_type, substrate 39 | self.questions_for_representation = { 40 | "1_str": ["representation_title", "Question 1. If there is any occurrence of 'OER', 'HER', 'oxygen evolution reaction', 'hydrogen evolution reaction' or similar contents in input representation, respond with yes or no. Please answer with either yes or no.\n\n"], 41 | "2_str": ["representation_table_caption", "Question 2. If there is any occurrence of 'OER', 'HER', 'oxygen evolution reaction', 'hydrogen evolution reaction' or similar contents in input representation, respond with yes or no. 
Please answer with either yes or no.\n\n"], 42 | "3_str": ["representation_title", "Question 3. Does the input representation contain information corresponding to substrate? Please answer with either yes or no\n\n"], 43 | "4_str": ["representation_table_caption", "Question 4. Does the input representation contain information corresponding to substrate? Please answer with either yes or no\n\n"], 44 | "5_str": ["representation_title", "Question 5. Does the input representation contain information corresponding to electrolyte? Please answer with either yes or no\n\n"], 45 | "6_str": ["representation_table_caption", "Question 6. Does the input representation contain information corresponding to electrolyte? Please answer with either yes or no\n\n"] 46 | } 47 | 48 | # system prompt 49 | self.system_prompt = {"role": "system", "content": "You need to modify the JSON representing the table presenter.\n\n JSON templete : {'catalyst_name' : {'performance_name' : \{property templete\}}}\n property templete : {'electrolyte': '', 'reaction_type': '', 'value': '', 'current_density': '', 'overpotential': '', 'potential': '', 'substrate': '', 'versus': '', 'condition': ''}\n performance_list = [overpotential, tafel_slope, Rct, stability, Cdl, onset_potential, potential, TOF, ECSA, water_splitting_potential, mass_activity, exchange_current_density, Rs, specific_activity, onset_overpotential, BET, surface_area, loading, apparent_activation_energy]\n In the JSON template, 'catalyst_name' and 'performance_name' should be replaced with the actual names present in the input representation."} 50 | 51 | def input_prompt(self, representation, json, want_type): 52 | """ 53 | Generates a formatted input string based on the type of content requested. 54 | 55 | Parameters: 56 | representation (str): The input representation string containing HTML content. 57 | json (str): The JSON string to be included in the input. 58 | want_type (str): The type of content required ('both', 'representation', 'json', 'representation_title', 'representation_table_caption'). 59 | 60 | Returns: 61 | str: The formatted input string. 62 | """ 63 | 64 | # Split the representation string at each occurrence of '
</title>' 65 | splitted_strings = representation.split('</title>') 66 | # Determine the format based on the requested type 67 | if want_type == 'both': 68 | format_for_input = "\n" + str(representation) + "\n\n" + str(json) + "\n\n" 69 | elif want_type == 'representation': 70 | format_for_input = "\n" + str(representation) + "\n\n" 71 | elif want_type == 'json': 72 | format_for_input = "\n" + str(json) + "\n\n" 73 | elif want_type == 'representation_title': 74 | title_string = splitted_strings[0] + '</title>' 75 | format_for_input = "\n" + str(title_string) + "\n\n" 76 | elif want_type == 'representation_table_caption': 77 | table_caption_string = splitted_strings[1] 78 | format_for_input = "\n" + str(table_caption_string) + "\n\n" 79 | 80 | return format_for_input 81 | 82 | def load_file(self, file_type, file_path, file_name): 83 | """ 84 | Loads the content of a file based on the specified type and path. 85 | 86 | Parameters: 87 | file_type (str): The type of the file (json, html, txt, etc.) 88 | file_path (str): The path to the file 89 | file_name (str): The name of the file 90 | 91 | Returns: 92 | output: The content of the file. The type of the content depends on the file_type. 93 | 94 | Raises: 95 | ValueError: If the file type is unsupported. 96 | """ 97 | with open(file_path + file_name, 'r', encoding='utf-8-sig') as f: 98 | # Load JSON files 99 | if file_type == 'json': 100 | output = json.load(f) 101 | # Read text or HTML files 102 | elif file_type in ['html', 'txt']: 103 | output = f.read() 104 | # Raise an error for unsupported file formats 105 | else: 106 | raise ValueError("Unsupported file format.") 107 | 108 | return output 109 | 110 | def formatting_type(self, key, answer): 111 | """ 112 | Formats the answer based on the specified key type. 113 | 114 | Parameters: 115 | key (str): The key indicating the type of format required (e.g., '1_list', '2_dict'). 116 | answer (str): The answer to be formatted. 117 | 118 | Returns: 119 | The formatted answer, which can be a list or a dictionary based on the key type. 120 | """ 121 | # Determine the desired format type from the key 122 | want_type = key.split('_')[1] 123 | 124 | # Handle formatting if the desired type is a list 125 | if want_type == "list": 126 | if answer[0] == '"' and answer[-1] == '"': 127 | answer = answer.strip('"') # Remove surrounding quotes if present 128 | answer = eval(answer) # Evaluate the string as a Python expression (e.g., convert to list) 129 | 130 | # Handle formatting if the desired type is a dictionary 131 | elif want_type == "dict": 132 | if "```" in answer: 133 | answer = answer.replace("```json", "").replace("```", "") # Remove markdown code block formatting 134 | answer = json.loads(answer) # Parse the string as JSON 135 | 136 | return answer 137 | 138 | def check_type(self, key, answer): 139 | """ 140 | Checks if the type of the answer matches the expected type based on the key. 141 | 142 | Parameters: 143 | key (str): The key indicating the expected type (e.g., '1_list', '2_dict'). 144 | answer (any): The answer whose type needs to be checked. 145 | 146 | Returns: 147 | tuple: A tuple containing the expected type (str) and a boolean indicating whether the types match.
148 | """ 149 | # Extract the question number and expected type from the key 150 | question_number = key.split('_')[0] 151 | want_type = key.split('_')[1] 152 | 153 | # Determine the actual type of the answer 154 | answer_type = type(answer).__name__ 155 | 156 | # Check if the expected type matches the actual type 157 | type_bool = want_type == answer_type 158 | 159 | return want_type, type_bool 160 | 161 | def prompt(self, Q): 162 | """ 163 | Sends a list of messages to the OpenAI ChatCompletion API and returns the response. 164 | 165 | Parameters: 166 | Q (list): A list of message dictionaries to be sent to the API. 167 | 168 | Returns: 169 | tuple: A tuple containing the original list of messages (Q) and the response content. 170 | """ 171 | while True: 172 | try: 173 | # Send a request to the OpenAI ChatCompletion API 174 | response = openai.ChatCompletion.create( 175 | model="gpt-4-0125-preview", 176 | messages=Q, 177 | temperature=0, 178 | frequency_penalty=0, 179 | presence_penalty=0 180 | ) 181 | break # Exit the loop if the request is successful 182 | except Exception as e: 183 | # Print the error message and retry 184 | print("An error occurred:", e) 185 | 186 | # Return the original messages and the content of the response 187 | return Q, response['choices'][0]['message']['content'] 188 | 189 | def run(self, input_type, split_mode): 190 | """ 191 | Executes the main process for modifying JSON files based on given questions. 192 | 193 | Parameters: 194 | input_type (str): The type of input files (e.g., 'json', 'txt'). 195 | split_mode (str): The mode for handling splits (e.g., 'split', 'no-split'). 196 | 197 | Returns: 198 | None 199 | """ 200 | json_name_list = os.listdir(self.json_path) 201 | error_file_name = [] 202 | transpose_bool = False 203 | 204 | for json_name in json_name_list: 205 | # try: 206 | txt_name = json_name.split('.')[0] 207 | # Load input files 208 | input_json = self.load_file('json', self.json_path, json_name) 209 | input_representation = self.load_file(input_type, self.representation_path, txt_name + '.txt') 210 | 211 | # Initialize messages with system prompt 212 | messages = [self.system_prompt] 213 | 214 | # Catalyst check 215 | catalyst_result = {} 216 | question_list = [] 217 | answer_list = [] 218 | question_token_list = [] 219 | answer_token_list = [] 220 | final_result = [] 221 | final_json = {"catalysts": []} 222 | 223 | # Iterate through catalyst questions 224 | for key, value in self.questions_for_catalyst.items(): 225 | if value[0] != "None": 226 | messages.append({"role": "user", "content": self.input_prompt(input_representation, input_json, value[0])}) 227 | messages.append({"role": "user", "content": value[1]}) 228 | messages, answer = self.prompt(messages) 229 | question_token_list.append(messages) 230 | answer_token_list.append(answer) 231 | 232 | try: 233 | mod_answer = self.formatting_type(key, answer) 234 | except Exception as e: 235 | try: 236 | messages, answer = self.prompt(messages) 237 | question_token_list.append(messages) 238 | answer_token_list.append(answer) 239 | question_list.append("Throwing the same message once again") 240 | answer_list.append("Throwing the same message once again") 241 | except Exception as e: 242 | error_file_name.append(json_name) 243 | print(json_name) 244 | break 245 | 246 | catalyst_result[key[0]] = mod_answer 247 | question_list.append(value[1]) 248 | answer_list.append(mod_answer) 249 | 250 | print('--------------------------') 251 | print(self.questions_for_catalyst[key]) 252 | print(mod_answer) 
253 | print(catalyst_result) 254 | print('--------------------------') 255 | 256 | messages.append({"role": "assistant", "content": answer}) 257 | if key[0] == '2' and catalyst_result["1"] == catalyst_result["2"]: 258 | print("GO TO THE PERFORMANCE STAGE !!") 259 | break 260 | 261 | # Performance check 262 | performance_result = {} 263 | mod_catalyst_list = answer_list[-1] 264 | 265 | if len(mod_catalyst_list) > 1: 266 | transpose_bool = True 267 | 268 | for i in range(len(mod_catalyst_list)): 269 | messages = [self.system_prompt] 270 | for key, value in self.questions_for_performance.items(): 271 | if value[0] != "None": 272 | messages.append({"role": "user", "content": self.input_prompt(input_representation, input_json, value[0])}) 273 | question = value[1].format(catalyst_name = '"""'+ mod_catalyst_list[i] +'"""') 274 | messages.append({"role": "user", "content": question}) 275 | messages, answer = self.prompt(messages) 276 | question_token_list.append(messages) 277 | answer_token_list.append(answer) 278 | 279 | try: 280 | mod_answer = self.formatting_type(key, answer) 281 | except: 282 | try: 283 | messages, answer = self.prompt(messages) 284 | question_token_list.append(messages) 285 | answer_token_list.append(answer) 286 | except: 287 | error_file_name.append(json_name) 288 | 289 | break 290 | 291 | performance_result[f'{mod_catalyst_list[i]}_{key[0]}'] = mod_answer 292 | question_list.append(question) 293 | answer_list.append(mod_answer) 294 | 295 | messages.append({"role": "assistant", "content": answer}) 296 | if key[0] == '3' and performance_result[f'{mod_catalyst_list[i]}_3'] == []: 297 | print("GO TO THE PROPERTY STAGE !!") 298 | break 299 | 300 | if isinstance(answer_list[-1], list): 301 | mod_performance_list = performance_result[f'{mod_catalyst_list[i]}_1'] 302 | else: 303 | while not isinstance(answer_list[-1], list): 304 | messages.pop() 305 | messages, answer = self.prompt(messages) 306 | question_token_list.append(messages) 307 | answer_token_list.append(answer) 308 | mod_answer = self.formatting_type("1_list", answer) 309 | question_list.append(question) 310 | answer_list.append(mod_answer) 311 | mod_performance_list = answer 312 | 313 | # Property check 314 | property_result = {} 315 | skip_questions = False 316 | print("#####################") 317 | print(mod_performance_list) 318 | 319 | if mod_performance_list == []: 320 | mod_answer = {str(mod_catalyst_list[i]): {}} 321 | else: 322 | for j in range(len(mod_performance_list)): 323 | messages = [] 324 | messages.append(self.system_prompt) 325 | print("@@@@@@@@@@@@") 326 | print(mod_performance_list[j]) 327 | # Feed in the system prompt, the input representation, and the input JSON 328 | # messages = [] 329 | # messages.append(self.system_prompt) 330 | 331 | skip_questions = False 332 | anwer3_no = False 333 | for key, value in self.questions_for_property.items(): 334 | # Condition for skipping questions 3 and 7 when the answer to question 2 is "no" 335 | if key[0] in ['3','7'] and skip_questions: 336 | print("SKIP THE NEXT QUESTIONS") 337 | question_list.append("SKIP THE NEXT QUESTIONS") 338 | answer_list.append("SKIP THE NEXT QUESTIONS") 339 | if key[0] in ['3','7']: 340 | skip_questions = False 341 | continue # skip questions 3 and 7 342 | 343 | if value[0] != "None": 344 | messages.append({"role": "user", "content": self.input_prompt(input_representation, input_json, value[0])}) 345 | 346 | question = value[1].format(catalyst_name = '"""'+ mod_catalyst_list[i] +'"""', element = '"""'+ mod_performance_list[j] +'"""') 347 | messages.append({"role": "user", "content": question}) 348 |
messages, answer = self.prompt(messages) 349 | question_token_list.append(messages) 350 | answer_token_list.append(answer) 351 | try: 352 | mod_answer = self.formatting_type(key, answer) 353 | except: 354 | try: 355 | messages, answer = self.prompt(messages) 356 | question_token_list.append(messages) 357 | answer_token_list.append(answer) 358 | except: 359 | print(json_name) 360 | error_file_name.append(json_name) 361 | break 362 | 363 | property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_' + key[0]] = answer 364 | question_list.append(question) 365 | answer_list.append(mod_answer) 366 | 367 | print('--------------------------') 368 | print(question) 369 | print(mod_answer) 370 | print('--------------------------') 371 | 372 | messages.append({"role": "assistant", "content": answer}) 373 | 374 | if key[0] == '2' and mod_answer.lower() == "no": 375 | question_list.append("Question 3. Based on the answer to question 2, remove any parts corresponding to 'NA', 'na', 'unknown', or similar content from the answer to question 1. Show the modified JSON. Only display the JSON. (like string not ```json)") 376 | answer_list.append(str(property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_1'])) 377 | messages.append({"role": "user", "content": "Question 3. Based on the answer to question 2, remove any parts corresponding to 'NA', 'na', 'unknown', or similar content from the answer to question 1. Show the modified JSON. Only display the JSON. (like string not ```json)"}) 378 | messages.append({"role": "assistant", "content": property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_1']}) 379 | skip_questions = True 380 | anwer3_no = True 381 | 382 | if key[0] == '6' and mod_answer == []: 383 | if anwer3_no: 384 | question_list.append("Question 7. If the answer to question 6 is an empty list, just provide the answer to question 1 as it is.") 385 | answer_list.append(str(property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_1'])) 386 | messages.append({"role": "user", "content": "Question 7. If the answer to question 6 is an empty list, just provide the answer to question 1 as it is."}) 387 | messages.append({"role": "assistant", "content": property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_1']}) 388 | skip_questions = True 389 | else: 390 | question_list.append("Question 7. If the answer to question 6 is an empty list, just provide the answer to question 3 as it is.") 391 | answer_list.append(str(property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_3'])) 392 | messages.append({"role": "user", "content": "Question 7. 
If the answer to question 6 is an empty list, just provide the answer to question 3 as it is."}) 393 | messages.append({"role": "assistant", "content": property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_3']}) 394 | skip_questions = True 395 | 396 | if isinstance(mod_answer, dict): 397 | input_json = mod_answer 398 | else: 399 | count = 0 400 | while not isinstance(mod_answer, dict) and count < 3: 401 | messages.pop() 402 | messages.append({"role": "user", "content": self.input_prompt(input_representation, input_json, 'json')}) 403 | messages, answer = self.prompt(messages) 404 | question_token_list.append(messages) 405 | answer_token_list.append(answer) 406 | try: 407 | mod_answer = self.formatting_type(key, answer) 408 | except: 409 | try: 410 | messages, answer = self.prompt(messages) 411 | question_token_list.append(messages) 412 | answer_token_list.append(answer) 413 | except: 414 | print('#$@%@#%@#%#@%@#%@#') 415 | print(json_name) 416 | print('#$@%@#%@#%#@%@#%@#') 417 | error_file_name.append(json_name) 418 | break 419 | 420 | count += 1 421 | 422 | property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_' + key[0]] = answer 423 | question_list.append(question) 424 | answer_list.append(mod_answer) 425 | 426 | print('--------------------------') 427 | print(question) 428 | print(mod_answer) 429 | print('--------------------------') 430 | 431 | messages.append({"role": "assistant", "content": answer}) 432 | 433 | if len(mod_catalyst_list) == 1 and split_mode == 'split': 434 | final_result.append(mod_answer) 435 | else: 436 | input_json = self.load_file(input_type, self.json_path, json_name) 437 | final_json["catalysts"].append(mod_answer) 438 | 439 | if transpose_bool: 440 | final_result.append(final_json) 441 | 442 | # Final JSON after property modifications 443 | if final_result[0] == []: 444 | input_json = self.load_file('json', self.json_path, json_name) 445 | new_json = input_json 446 | else: 447 | new_json = final_result[0] 448 | 449 | # Handle representation questions 450 | remove_elements = [] 451 | representation_result = {} 452 | for key, value in self.questions_for_representation.items(): 453 | messages = [] 454 | if value[0] == "None": 455 | pass 456 | else: 457 | messages.append({"role": "user", "content": self.input_prompt(input_representation, new_json, value[0])}) 458 | 459 | question = value[1] 460 | messages.append({"role": "user", "content": question}) 461 | messages, answer = self.prompt(messages) 462 | question_token_list.append(messages) 463 | answer_token_list.append(answer) 464 | 465 | mod_answer = self.formatting_type(key, answer) 466 | 467 | question_list.append(question) 468 | answer_list.append(mod_answer) 469 | 470 | print('--------------------------') 471 | print(question) 472 | print(mod_answer) 473 | print('--------------------------') 474 | 475 | messages.append({"role": "assistant", "content": answer}) 476 | representation_result[key[0]] = mod_answer.replace(".", "").lower() 477 | 478 | if representation_result['1'] == 'no' and representation_result['2'] == 'no': 479 | remove_elements.append('reaction_type') 480 | if representation_result['3'] == 'no' and representation_result['4'] == 'no': 481 | remove_elements.append('substrate') 482 | if representation_result['5'] == 'no' and representation_result['6'] == 'no': 483 | remove_elements.append('electrolyte') 484 | 485 | remove_elements = list(set(remove_elements)) 486 | 487 | if remove_elements != []: 488 | for delete_element in remove_elements: 489 | messages = [] 
490 | messages.append({"role": "user", "content": self.input_prompt(input_representation, new_json, 'json')}) 491 | question = "Remove all elements with the key name {delete_element} from the input JSON and display it in only JSON format. Other explanation is not allowed. Show me only JSON result. Only display the JSON. (like string not ```json)".format(delete_element="'''"+delete_element+"'''") 492 | messages.append({"role": "user", "content": question}) 493 | messages, answer = self.prompt(messages) 494 | question_token_list.append(messages) 495 | answer_token_list.append(answer) 496 | try: 497 | mod_answer = self.formatting_type('1_dict', answer) 498 | except: 499 | try: 500 | messages, answer = self.prompt(messages) 501 | question_token_list.append(messages) 502 | answer_token_list.append(answer) 503 | except: 504 | print('#$@%@#%@#%#@%@#%@#') 505 | print(json_name) 506 | print('#$@%@#%@#%#@%@#%@#') 507 | error_file_name.append(json_name) 508 | break 509 | 510 | question_list.append(question) 511 | answer_list.append(mod_answer) 512 | print('--------------------------') 513 | print(question) 514 | print('answer') 515 | print(answer) 516 | print('mod answer') 517 | print(mod_answer) 518 | print('--------------------------') 519 | 520 | if not isinstance(mod_answer, dict): 521 | count = 0 522 | while not isinstance(mod_answer, dict) and count < 3: 523 | # Re-ask after dropping the previously asked question 524 | messages, answer = self.prompt(messages) 525 | question_token_list.append(messages) 526 | answer_token_list.append(answer) 527 | try: 528 | mod_answer = self.formatting_type('1_dict', answer) 529 | except: 530 | try: 531 | messages, answer = self.prompt(messages) 532 | question_token_list.append(messages) 533 | answer_token_list.append(answer) 534 | except: 535 | print('#$@%@#%@#%#@%@#%@#') 536 | print(json_name) 537 | print('#$@%@#%@#%#@%@#%@#') 538 | error_file_name.append(json_name) 539 | break 540 | 541 | count += 1 542 | question_list.append(question) 543 | answer_list.append(mod_answer) 544 | 545 | print('--------------------------') 546 | print(question) 547 | print(mod_answer) 548 | print('--------------------------') 549 | 550 | if not isinstance(mod_answer, dict): 551 | new_json = new_json 552 | else: 553 | new_json = mod_answer 554 | else: 555 | mod_answer = new_json 556 | 557 | # Ensure the necessary directories exist 558 | os.makedirs(os.path.join(self.save_path, 'log'), exist_ok=True) 559 | os.makedirs(os.path.join(self.save_path, 'token'), exist_ok=True) 560 | os.makedirs(os.path.join(self.save_path, 'json'), exist_ok=True) 561 | 562 | # Save the modified JSON and log 563 | if json_name not in error_file_name: 564 | log_path = self.save_path + 'log/'+ txt_name 565 | df = pd.DataFrame({'Question': question_list, 'GPT answer': answer_list}) 566 | df.to_csv(log_path+'.csv', index=False) 567 | 568 | token_path = self.save_path + 'token/'+ txt_name 569 | token_df = pd.DataFrame({'Question': question_token_list, 'GPT answer': answer_token_list}) 570 | token_df.to_csv(token_path+'.csv', index=False) 571 | 572 | new_json_path = self.save_path + 'json/'+ json_name 573 | if mod_answer == [] : 574 | input_json = self.load_file('json', self.json_path, json_name) 575 | with open(new_json_path, "w") as json_file: 576 | json.dump(input_json, json_file, indent=4) 577 | else: 578 | if isinstance(mod_answer, list): 579 | with open(new_json_path, "w") as json_file: 580 | json.dump(new_json, json_file, indent=4) 581 | 582 | elif isinstance(mod_answer, str): 583 | with open(new_json_path, "w") as json_file: 584 |
json.dump(new_json, json_file, indent=4) 585 | else: 586 | with open(new_json_path, "w") as json_file: 587 | json.dump(mod_answer, json_file, indent=4) 588 | 589 | 590 | if __name__ == "__main__": 591 | representation_path = 'table representer path' 592 | json_path = 'gpt prediction' 593 | save_path = 'save path' 594 | 595 | assistant = FollowQ(json_path, representation_path, save_path) 596 | assistant.run('txt', 'split') 597 | 598 | 599 | -------------------------------------------------------------------------------- /GPT_models/models.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import openai 4 | from openai import OpenAI  # used by few_shot() and zero_shot() below 5 | from ast import literal_eval 6 | from copy import copy 7 | 8 | def fine_tuning(input_path, output_path): 9 | """ 10 | Tests the fine-tuned model and generates the prediction of table data extraction in JSON format. 11 | You can use the OPENAI PLAYGROUND for training the model. 12 | 13 | Parameters: 14 | input_path : TSV or JSON table representation path 15 | output_path : The path where the prediction is saved 16 | 17 | Returns: 18 | json : prediction of table data extraction 19 | """ 20 | 21 | openai.api_key = "YOUR OPENAI KEY" 22 | file_list = os.listdir(input_path) 23 | 24 | response = [] 25 | for file in file_list: 26 | file_path = ''.join([output_path, file]) 27 | file_name = os.path.basename(file_path) 28 | with open(input_path + file, 'r', encoding='utf-8-sig') as f: 29 | loaded_data_string = json.load(f) 30 | 31 | completion = openai.ChatCompletion.create( 32 | model="FINE TUNED MODEL", 33 | temperature=0, 34 | messages=[ 35 | {"role": "system", "content": "this task is to take a string as input and convert it to json format. I want to extract the performance below. [reaction_type, versus, overpotential, substrate, loading, tafel_slope, onset_potential, current_density, BET, specific_activity, mass_activity, surface_area, ECSA, apparent_activation_energy, water_splitting_potential, potential, Rs, Rct, Cdl, TOF, stability, electrolyte, exchange_current_density, onset_overpotential].
If there is information about overpotential and Tafel slope in the input, the output should be generated as follows.\n\n{\n\u201dcatalyst_name\": {\n\"overpotential\": {\n\"electrolyte\": \"1.0 M KOH\",\n\"reaction_type\": \"OER\",\n\"value\": \"230 mV\",\n\"current_density\": \"50 mA/cm2\"\n},\n\"tafel_slope\": {\n\"electrolyte\": \"1.0 M KOH\",\n\"reaction_type\": \"OER\",\n\"value\": \"54 mV/dec\"\n}\n\n}\n\n}\n\nIf information regarding the reaction_type or electrolyte cannot be found in the input, they should not be included in the output as follows.\n\n{\n\u201dcatalyst_name\": {\n\"overpotential\": {\n\"value\": \"230 mV\",\n\"current_density\": \"50 mA/cm2\"\n},\n\"tafel_slope\": {\n\"value\": \"54 mV/dec\"\n}\n\n}\n\n}\n\nIf the string is missing certain information such as 'mass_activity', ‘reaction_type’, ‘value’ or 'current_density', your output should not include those keys.\n\nIf there are no values corresponding to the mentioned performance metrics in the input, simply extract the catalyst name as shown below.\n\n{\n\u201dcatalyst_name\": {}\n}\n\nNote: The output should be a JSON object with keys for only the present metrics."}, 36 | {"role": "user", "content": str(loaded_data_string)} 37 | ] 38 | ) 39 | result = completion.choices[0].message['content'] 40 | response.append(result) 41 | try: 42 | dict_1 = literal_eval(result) 43 | json_file_path = os.path.join(output_path, file_name) 44 | with open(file_path[:-5]+'.json', "w", encoding="utf-8-sig") as json_file: 45 | json.dump(dict_1, json_file, indent=4) 46 | except: 47 | with open(file_path[:-5]+'.txt', "w", encoding="utf-8-sig") as txt_file: 48 | txt_file.write(result) 49 | 50 | 51 | 52 | def few_shot(input_path, output_path) : 53 | """ 54 | Tests the few-shot model and generates the prediction of table data extraction in JSON format. 55 | You need to give several I/O pairs. 56 | 57 | Parameters: 58 | input_path : TSV or JSON table representation path 59 | output_path : The path where the prediction is saved 60 | 61 | Returns: 62 | json : prediction of table data extraction 63 | """ 64 | client = OpenAI(api_key= "YOUR OPENAI KEY") 65 | file_list = os.listdir(input_path) 66 | for file in file_list : 67 | with open(input_path + file, 'r', encoding='utf-8') as f: 68 | text = f.read() 69 | response = client.chat.completions.create( 70 | model="gpt-4-1106-preview", 71 | temperature=0, 72 | frequency_penalty=0, 73 | presence_penalty=0, 74 | messages=[ 75 | {"role": "system", "content": "I will extract the performance information of the catalyst from the table and create a JSON format. The types of performance to be extracted will be given as a list: performance_list = [overpotential, tafel_slope, Rct, stability, Cdl, onset_potential, current_density, potential, TOF, ECSA, water_splitting_potential, mass_activity, exchange_current_density, Rs, specific_activity, onset_overpotential, BET, surface_area, loading, apparent_activation_energy]. You can only use the names as they are in the performance_list, and any changes to the names in the performance_list, no matter how slight, are not allowed. The JSON format will have performance within the catalyst, and each performance will include elements present in the table: reaction type, value, electrolyte, condition, current density, versus(ex: RHE) and substrate. Other elements must not be included in performance. Be very strict. The output must contain only json dictionary.
Other sentences or opinion must not be in output."}, 76 | 77 | # X I/O PAIRS 78 | {"role": "user", "content":''}, 79 | {"role": "assistant", "content": ''}, 80 | 81 | {"role": "user", "content": text} 82 | ] 83 | ) 84 | prediction = response.choices[0].message.content.strip() 85 | output_filename = output_path + '/' + file.split('.')[0] 86 | try : 87 | json_data = json.loads(prediction) 88 | with open(output_filename + '.json', 'w', encoding='utf-8-sig') as json_file: 89 | json.dump(json_data, json_file, ensure_ascii = False, indent = 4) 90 | 91 | except : 92 | json_data = prediction 93 | with open(output_filename + '.txt', "w", encoding="utf-8-sig") as txt_file: 94 | txt_file.write(json_data) 95 | 96 | 97 | def prompt(messages) : 98 | response = client.chat.completions.create( 99 | model="gpt-4-1106-preview", 100 | temperature=0, 101 | frequency_penalty=0, 102 | presence_penalty=0, 103 | messages= messages) 104 | 105 | return messages, response.choices[0].message.content 106 | 107 | 108 | def zero_shot(input_path, output_path) : 109 | """ 110 | Tests the zero-shot model and generates the prediction of table data extraction in JSON format. 111 | 112 | Parameters: 113 | input_path : TSV or JSON table representation path 114 | output_path : The path where the prediction is saved 115 | 116 | Returns: 117 | json : prediction of table data extraction 118 | """ 119 | 120 | global client; client = OpenAI(api_key = 'YOUR OPENAI KEY') # module-level so prompt() above can reach it 121 | file_list = os.listdir(input_path) 122 | 123 | for file in file_list : 124 | data = {'question': [], 'answer': []} 125 | 126 | with open(input_path + file, 'r', encoding='utf-8') as f: 127 | table_representer = f.read() 128 | 129 | table_name = file.split('.')[0] 130 | 131 | instruction = "I'm going to convert the information in the table representer into JSON format.\n CATALYST_TEMPLATE = {'catalyst_name' : {'performance_name' : {PROPERTY_TEMPLATE}}\n PROPERTY_TEMPLATE = {'electrolyte': '', 'reaction_type': '', 'value': '', 'current_density': '', 'overpotential': '', 'potential': '','substrate': '', 'versus':''}\n performance_list = [overpotential, tafel_slope, Rct, stability, Cdl, onset_potential, current_density, potential, TOF, ECSA, water_splitting_potential, mass_activity, exchange_current_density, Rs, specific_activity, onset_overpotential, BET, surface_area, loading, apparent_activation_energy]\n. Table representer is in below \n\n " 132 | result = {"catalysts":[]} 133 | 134 | message_ = [{"role": "system", "content": instruction + table_representer}] 135 | 136 | catalyst_q = "Show the catalysts present in the table representer as a Python list. Answer must be ONLY python list. Not like '''python ''' Be very very very strict. Other sentences or explanation is not allowed.\n" 137 | question = catalyst_q 138 | message_.append({"role": "user", "content": question}) 139 | _, cata_answer = prompt(message_) 140 | catalyst_list = literal_eval(cata_answer) # safer than eval() for parsing the model's list 141 | data['question'].append(copy(message_)) 142 | data['answer'].append(cata_answer) 143 | 144 | message_.append({"role": "assistant", "content": cata_answer}) # keep the previous answer in context for the next prompt 145 | 146 | for catalyst in catalyst_list : 147 | 148 | performance_template_q = "Create a CATALYST_TEMPLATE filling in the performance of {catalyst} from the table representer, strictly adhering to the following 3 rules:\n\n Rule 1: Only include the actual existing performances from the Performance_list in the CATALYST_TEMPLATE.\n Rule 2: Set all values of the keys in PROPERTY_TEMPLATE to be \" \". DO NOT INSERT ANY VALUE.
BE VERY STRICT.\n Rule 3: Answer must be ONLY json format. Only display the JSON (like string not ```json). Other sentences or explanation is not allowed.".format(catalyst="'''"+catalyst+"'''") 149 | question = performance_template_q 150 | message_.append({"role": "user", "content": question}) 151 | _, perfo_answer = prompt(message_) 152 | 153 | data['question'].append(copy(message_)) 154 | data['answer'].append(perfo_answer) 155 | 156 | message_.append({"role": "assistant", "content": perfo_answer}) 157 | property_q = 'In PROPERTY_TEMPLATE, maintain all keys, and fill in values that exist in the table representer. If there are more than two "values" for the same performance, fill in each "value" with the property template and make it into a list. If there is unit information, never create or modify additional keys, but reflect the units in the value.' 158 | question = property_q 159 | message_.append({"role": "user", "content": question}) 160 | _, property_answer1 = prompt(message_) 161 | 162 | data['question'].append(copy(message_)) 163 | data['answer'].append(property_answer1) 164 | 165 | message_.append({"role": "assistant", "content": property_answer1}) 166 | property_title_caption_q = "Modify the previous version of CATALYST_TEMPLATE based solely on the title, caption according to the rules below. Only refer to the title and caption part in table representer. Strictly adhere to the following rules. \n Rule 1: If there is reaction type information in title or caption, reflect the reaction type in previous version of CATALYST_TEMPLATE accordingly. But if there isn't reaction type information in title or caption part, leave CATALYST_TEMPLATE as previous version. Be strict. \n Rule 2: If there is electrolyte information in title or caption part, reflect the electrolyte in previous version of CATALYST_TEMPLATE. But if there isn't electrolyte information in title or caption part, leave CATALYST_TEMPLATE as previous version. Be strict. \n Rule 3: Never modify the keys. \n Rule 4: Never fill in values for any other keys except reaction_type, electrolyte. Never delete any other keys or value." 167 | question = property_title_caption_q 168 | message_.append({"role": "user", "content": question}) 169 | _, property_answer2 = prompt(message_) 170 | 171 | data['question'].append(copy(message_)) 172 | data['answer'].append(property_answer2) 173 | 174 | message_.append({"role": "assistant", "content": property_answer2}) # was property_answer1, which discarded the title/caption update 175 | delete_q = 'Remove keys with no values from previous version of CATALYST_TEMPLATE.'
176 | question = delete_q 177 | message_.append({"role": "user", "content": question}) 178 | _, delete_answer = prompt(message_) 179 | 180 | data['question'].append(copy(message_)) 181 | data['answer'].append(delete_answer) 182 | 183 | catalyst_template = json.loads(delete_answer) 184 | result["catalysts"].append(catalyst_template) 185 | 186 | message_ = [{"role": "system", "content": instruction + table_representer}] 187 | message_.append({"role": "user", "content": catalyst_q}) 188 | message_.append({"role": "assistant", "content": cata_answer}) 189 | 190 | if len(result["catalysts"]) == 1 : 191 | final_result = result["catalysts"][0] 192 | 193 | else : # also keeps final_result defined when no catalyst was found 194 | final_result = result 195 | try : 196 | with open(output_path + table_name + ".json", "w") as json_file: 197 | json.dump(final_result, json_file, indent = 4) 198 | except : 199 | with open(output_path + table_name + ".txt", "w", encoding="utf-8-sig") as txt_file: 200 | txt_file.write(str(final_result)) 201 | 202 | 203 | 204 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MaTableGPT: GPT-based Table Data Extractor from Materials Science Literature 2 | 3 | ## Introduction 4 | ### 1) Overall workflow of MaTableGPT 5 | 1. Generate customized TSV and JSON representations from the HTML tables and split the tables. 6 | 2. Test the three GPT models (fine-tuning, few-shot, zero-shot) and run the follow-up question process. 7 | ![1](https://github.com/KIST-CSRC/CO2RR_NER/assets/171128050/7ca70729-84cc-4b4e-a93d-225f60f424a8) 8 | 9 | ### 2) GPT modeling process 10 | ![5](https://github.com/KIST-CSRC/MaTableGPT/assets/171128050/1bb4729f-bca3-4f82-9ab1-c0d93909c37a) 11 | 12 | ## User Manual 13 | ### 1) Installation 14 | 15 | **Using conda** 16 | ```bash 17 | conda env create -f requirements_conda.txt 18 | ``` 19 | **Using pip** 20 | ```bash 21 | pip install -r requirements_pip.txt 22 | ``` 23 | ### 2) Download data files 24 | ``` 25 | git clone https://github.com/KIST-CSRC/MaTableGPT.git 26 | git lfs pull 27 | ``` 28 | ### 3) Script architecture 29 | ``` 30 | MaTableGPT 31 | ├── data 32 | │ └── non_split 33 | │ └── split 34 | │ └── pickle_folder 35 | │ └── result 36 | ├── GPT_models 37 | │ └── models.py 38 | │ └── follow_up_q.py 39 | ├── model_evaluation 40 | │ └── utils 41 | │ └── evaluation.py 42 | ├── table_representation 43 | │ └── table_representer.py 44 | │ └── table2json.py 45 | ├── table_splitting 46 | │ └── split_table.py 47 | │ 48 | └── run.py 49 | ``` 50 | ### 4) Code usage (run.py) 51 | **Example : Input generation (split, TSV)** 52 | > ```python 53 | > input_generation("split", "TSV") 54 | > ``` 55 | 56 | **Example : Data extraction (few-shot, follow-up questions)** 57 | > ```python 58 | > model_test("few_shot", True) 59 | > ``` 60 | ## Benefit 61 | Using MaTableGPT, we achieved a table data extraction accuracy of 96.8% and, through a Pareto-front analysis, proposed the optimal solution for each situation. 62 | ## Reference 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /data/non_split/table_html/example_tbl01.html: -------------------------------------------------------------------------------- 1 | 2 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Co-BPDC/Co-BDC heterojunction</td><td>1 M KOH</td><td>335</td><td>72.1</td><td>80</td><td>this work</td></tr><tr><td>Co-BDC</td><td>1 M KOH</td><td>392</td><td>77.2</td><td>28</td><td>this work</td></tr><tr><td>Co-BPDC</td><td>1 M KOH</td><td>428</td><td>78.8</td><td>28</td><td>this work</td></tr><tr><td>Co/MIL-101(Cr)-O</td><td>0.1 M KOH</td><td>570</td><td>17</td><td>–</td><td>(40)</td></tr><tr><td>Fe2Ni-BPTC/CC</td><td>0.1 M KOH</td><td>365</td><td>77.2</td><td>15</td><td>(41)</td></tr><tr><td>UTSA-16</td><td>1 M KOH</td><td>408</td><td>77</td><td>7</td><td>(42)</td></tr><tr><td>2D Co–MOF UNS</td><td>1 M KOH</td><td>263</td><td>74</td><td>3.3</td><td>(18)</td></tr><tr><td>Co-OBA/C</td><td>0.1 M KOH</td><td>590</td><td>85.7</td><td>–</td><td>(43)</td></tr><tr><td>Co2(μ–OH)2(bbta)</td><td>1 M KOH</td><td>387</td><td>60</td><td>24</td><td>(44)</td></tr><tr><td>3D Gr/Ni-MOF</td><td>0.1 M KOH</td><td>370</td><td>91</td><td>20</td><td>(25)</td></tr><tr><td>Co0.6Fe0.4-MOF-74</td><td>1 M KOH</td><td>280</td><td>56</td><td>12</td><td>(3)</td></tr><tr><td>Ti3C2Tx-CoBDC</td><td>0.1 M KOH</td><td>410</td><td>48.2</td><td>2.8</td><td>(23)</td></tr><tr><td>Co-MOF</td><td>1 M KOH</td><td>360</td><td>89</td><td>–</td><td>(45)</td></tr></table>
" 3 | -------------------------------------------------------------------------------- /data/non_split/table_json/example_tbl01.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Co-BPDC/Co-BDC heterojunction</td><td>1 M KOH</td><td>335</td><td>72.1</td><td>80</td><td>this work</td></tr><tr><td>Co-BDC</td><td>1 M KOH</td><td>392</td><td>77.2</td><td>28</td><td>this work</td></tr><tr><td>Co-BPDC</td><td>1 M KOH</td><td>428</td><td>78.8</td><td>28</td><td>this work</td></tr><tr><td>Co/MIL-101(Cr)-O</td><td>0.1 M KOH</td><td>570</td><td>17</td><td>–</td><td>(40)</td></tr><tr><td>Fe2Ni-BPTC/CC</td><td>0.1 M KOH</td><td>365</td><td>77.2</td><td>15</td><td>(41)</td></tr><tr><td>UTSA-16</td><td>1 M KOH</td><td>408</td><td>77</td><td>7</td><td>(42)</td></tr><tr><td>2D Co–MOF UNS</td><td>1 M KOH</td><td>263</td><td>74</td><td>3.3</td><td>(18)</td></tr><tr><td>Co-OBA/C</td><td>0.1 M KOH</td><td>590</td><td>85.7</td><td>–</td><td>(43)</td></tr><tr><td>Co2(μ–OH)2(bbta)</td><td>1 M KOH</td><td>387</td><td>60</td><td>24</td><td>(44)</td></tr><tr><td>3D Gr/Ni-MOF</td><td>0.1 M KOH</td><td>370</td><td>91</td><td>20</td><td>(25)</td></tr><tr><td>Co0.6Fe0.4-MOF-74</td><td>1 M KOH</td><td>280</td><td>56</td><td>12</td><td>(3)</td></tr><tr><td>Ti3C2Tx-CoBDC</td><td>0.1 M KOH</td><td>410</td><td>48.2</td><td>2.8</td><td>(23)</td></tr><tr><td>Co-MOF</td><td>1 M KOH</td><td>360</td><td>89</td><td>–</td><td>(45)</td></tr></table>
" 5 | } -------------------------------------------------------------------------------- /data/pickle_folder/example_tbl01.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/data/pickle_folder/example_tbl01.pickle -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_01.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co-BPDC/Co-BDC heterojunction" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 335 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 72.1 14 | ], 15 | "durability (h)": [ 16 | 80 17 | ], 18 | "ref": [ 19 | "this work" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_02.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co-BDC" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 392 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 77.2 14 | ], 15 | "durability (h)": [ 16 | 28 17 | ], 18 | "ref": [ 19 | "this work" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_03.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co-BPDC" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 428 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 78.8 14 | ], 15 | "durability (h)": [ 16 | 28 17 | ], 18 | "ref": [ 19 | "this work" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_04.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co/MIL-101(Cr)-O" 5 | ], 6 | "electrolyte": [ 7 | "0.1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 570 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 17 14 | ], 15 | "durability (h)": [ 16 | "\u2013" 17 | ], 18 | "ref": [ 19 | "(40)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_05.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. 
Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Fe2Ni-BPTC/CC" 5 | ], 6 | "electrolyte": [ 7 | "0.1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 365 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 77.2 14 | ], 15 | "durability (h)": [ 16 | 15 17 | ], 18 | "ref": [ 19 | "(41)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_06.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "UTSA-16" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 408 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 77 14 | ], 15 | "durability (h)": [ 16 | 7 17 | ], 18 | "ref": [ 19 | "(42)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_07.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "2D Co\u2013MOF UNS" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 263 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 74 14 | ], 15 | "durability (h)": [ 16 | 3.3 17 | ], 18 | "ref": [ 19 | "(18)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_08.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co-OBA/C" 5 | ], 6 | "electrolyte": [ 7 | "0.1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 590 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 85.7 14 | ], 15 | "durability (h)": [ 16 | "\u2013" 17 | ], 18 | "ref": [ 19 | "(43)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_09.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co2(\u03bc\u2013OH)2(bbta)" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 387 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 60 14 | ], 15 | "durability (h)": [ 16 | 24 17 | ], 18 | "ref": [ 19 | "(44)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_10.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. 
Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "3D Gr/Ni-MOF" 5 | ], 6 | "electrolyte": [ 7 | "0.1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 370 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 91 14 | ], 15 | "durability (h)": [ 16 | 20 17 | ], 18 | "ref": [ 19 | "(25)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_11.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co0.6Fe0.4-MOF-74" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 280 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 56 14 | ], 15 | "durability (h)": [ 16 | 12 17 | ], 18 | "ref": [ 19 | "(3)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_12.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Ti3C2Tx-CoBDC" 5 | ], 6 | "electrolyte": [ 7 | "0.1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 410 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 48.2 14 | ], 15 | "durability (h)": [ 16 | 2.8 17 | ], 18 | "ref": [ 19 | "(23)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_13.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co-MOF" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 360 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 89 14 | ], 15 | "durability (h)": [ 16 | "\u2013" 17 | ], 18 | "ref": [ 19 | "(45)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_01.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Co-BPDC/Co-BDC heterojunction</td><td>1 M KOH</td><td>335</td><td>72.1</td><td>80</td><td>this work</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_02.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Co-BDC</td><td>1 M KOH</td><td>392</td><td>77.2</td><td>28</td><td>this work</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_03.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Co-BPDC</td><td>1 M KOH</td><td>428</td><td>78.8</td><td>28</td><td>this work</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_04.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Co/MIL-101(Cr)-O</td><td>0.1 M KOH</td><td>570</td><td>17</td><td>–</td><td>(40)</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_05.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Fe2Ni-BPTC/CC</td><td>0.1 M KOH</td><td>365</td><td>77.2</td><td>15</td><td>(41)</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_06.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>UTSA-16</td><td>1 M KOH</td><td>408</td><td>77</td><td>7</td><td>(42)</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_07.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>2D Co–MOF UNS</td><td>1 M KOH</td><td>263</td><td>74</td><td>3.3</td><td>(18)</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_08.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Co-OBA/C</td><td>0.1 M KOH</td><td>590</td><td>85.7</td><td>–</td><td>(43)</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_09.html: -------------------------------------------------------------------------------- 1 |
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co2(μ–OH)2(bbta)</td><td>1 M KOH</td><td>387</td><td>60</td><td>24</td><td>(44)</td></tr></tbody></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_10.html: -------------------------------------------------------------------------------- 1 |
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>3D Gr/Ni-MOF</td><td>0.1 M KOH</td><td>370</td><td>91</td><td>20</td><td>(25)</td></tr></tbody></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_11.html: -------------------------------------------------------------------------------- 1 |
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co0.6Fe0.4-MOF-74</td><td>1 M KOH</td><td>280</td><td>56</td><td>12</td><td>(3)</td></tr></tbody></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_12.html: -------------------------------------------------------------------------------- 1 |
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Ti3C2Tx-CoBDC</td><td>0.1 M KOH</td><td>410</td><td>48.2</td><td>2.8</td><td>(23)</td></tr></tbody></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_13.html: -------------------------------------------------------------------------------- 1 |
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co-MOF</td><td>1 M KOH</td><td>360</td><td>89</td><td>–</td><td>(45)</td></tr></tbody></table>
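Each of the thirteen table_split_html files above holds a one-row copy of the parent table under the full header, so every catalyst row can be processed on its own. A minimal sketch (not part of the repository; the file path and pandas usage are illustrative assumptions, pandas itself is listed in the requirements files below) of reading one split file back into a DataFrame:

# sketch: load one split HTML table into a DataFrame
from io import StringIO
import pandas as pd

with open("data/split/table_split_html/example_tbl01_01.html", encoding="utf-8") as f:
    html = f.read()

# read_html returns one DataFrame per <table>; each split file contains exactly one
df = pd.read_html(StringIO(html))[0]
print(df.loc[0, "catalyst"])  # -> Co-BPDC/Co-BDC heterojunction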
-------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_01.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co-BPDC/Co-BDC heterojunction</td><td>1 M KOH</td><td>335</td><td>72.1</td><td>80</td><td>this work</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_02.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co-BDC</td><td>1 M KOH</td><td>392</td><td>77.2</td><td>28</td><td>this work</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_03.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co-BPDC</td><td>1 M KOH</td><td>428</td><td>78.8</td><td>28</td><td>this work</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_04.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co/MIL-101(Cr)-O</td><td>0.1 M KOH</td><td>570</td><td>17</td><td>–</td><td>(40)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_05.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Fe2Ni-BPTC/CC</td><td>0.1 M KOH</td><td>365</td><td>77.2</td><td>15</td><td>(41)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_06.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>UTSA-16</td><td>1 M KOH</td><td>408</td><td>77</td><td>7</td><td>(42)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_07.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>2D Co–MOF UNS</td><td>1 M KOH</td><td>263</td><td>74</td><td>3.3</td><td>(18)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_08.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co-OBA/C</td><td>0.1 M KOH</td><td>590</td><td>85.7</td><td>–</td><td>(43)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_09.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co2(μ–OH)2(bbta)</td><td>1 M KOH</td><td>387</td><td>60</td><td>24</td><td>(44)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_10.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>3D Gr/Ni-MOF</td><td>0.1 M KOH</td><td>370</td><td>91</td><td>20</td><td>(25)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_11.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co0.6Fe0.4-MOF-74</td><td>1 M KOH</td><td>280</td><td>56</td><td>12</td><td>(3)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_12.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Ti3C2Tx-CoBDC</td><td>0.1 M KOH</td><td>410</td><td>48.2</td><td>2.8</td><td>(23)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_13.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co-MOF</td><td>1 M KOH</td><td>360</td><td>89</td><td>–</td><td>(45)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_01.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo-BPDC/Co-BDC heterojunction\t1 M KOH\t335\t72.1\t80\tthis work\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_02.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo-BDC\t1 M KOH\t392\t77.2\t28\tthis work\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_03.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo-BPDC\t1 M KOH\t428\t78.8\t28\tthis work\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_04.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo/MIL-101(Cr)-O\t0.1 M KOH\t570\t17\t–\t (40)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_05.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nFe2Ni-BPTC/CC\t0.1 M KOH\t365\t77.2\t15\t (41)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_06.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nUTSA-16\t1 M KOH\t408\t77\t7\t (42)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_07.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\n2D Co–MOF UNS\t1 M KOH\t263\t74\t3.3\t (18)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_08.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo-OBA/C\t0.1 M KOH\t590\t85.7\t–\t (43)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_09.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo2(μ–OH)2(bbta)\t1 M KOH\t387\t60\t24\t (44)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_10.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\n3D Gr/Ni-MOF\t0.1 M KOH\t370\t91\t20\t (25)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_11.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo0.6Fe0.4-MOF-74\t1 M KOH\t280\t56\t12\t (3)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_12.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nTi3C2Tx-CoBDC\t0.1 M KOH\t410\t48.2\t2.8\t (23)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_13.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo-MOF\t1 M KOH\t360\t89\t–\t (45)\t\n
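The tsv_representation files above serialize each split row on a single line: the table title followed by the header and data cells, with tabs and newlines written out as the two-character escapes \t and \n (compact for a single-line GPT prompt). A minimal sketch (not part of the repository; that the escapes are stored literally is an assumption inferred from the files above) of recovering the cells:

# sketch: turn a split TSV representation back into cells
with open("data/split/tsv_representation/example_tbl01_01.txt", encoding="utf-8") as f:
    raw = f.read()

realised = raw.replace("\\n", "\n").replace("\\t", "\t")  # realise the literal escapes
lines = [ln for ln in realised.splitlines() if ln.strip()]
cells = lines[1].split("\t")  # lines[0] is the title fused with the header row
print(cells[:3])  # -> ['Co-BPDC/Co-BDC heterojunction', '1 M KOH', '335']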
-------------------------------------------------------------------------------- /input_generation_script.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_generator": { 3 | "splitting_HTML": { 4 | "split": [ 5 | { 6 | "input_JSON_path": "./data/table_json", 7 | "input_pickle_path": "./data/pickle_folder", 8 | "input_HTML_path": "./data/table_html", 9 | "output_HTML_path": "./data/html_table_split_result" 10 | }, 11 | { 12 | "table_representation": { 13 | "JSON": { 14 | "input_path": "./data/split/table_split_json/", 15 | "output_path": "./data/split/json_representation" 16 | }, 17 | "TSV": { 18 | "input_path": "./data/split/table_split_json/", 19 | "output_path": "./data/split/tsv_representation" 20 | } 21 | } 22 | } 23 | ], 24 | "non_split": { 25 | "table_representation": { 26 | "JSON": { 27 | "input_path": "./data/non_split/table_json/", 28 | "output_path": "./data/non_split/json_representation" 29 | }, 30 | "TSV": { 31 | "input_path": "./data/non_split/table_json/", 32 | "output_path": "./data/non_split/tsv_representation" 33 | } 34 | } 35 | } 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /model_evaluation/evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | import re 5 | import unicodedata 6 | from utils.functions import * 7 | 8 | class evaluation : 9 | def __init__(self, prediction_path, groundTruth_path) : 10 | self.prediction_path = prediction_path 11 | self.groundTruth_path = groundTruth_path 12 | 13 | 14 | def remove_whitespace_from_keys(self, data): 15 | ''' 16 | remove whitespace from keys 17 | ''' 18 | if isinstance(data, dict): 19 | data = {key.replace("_", ""): self.remove_whitespace_from_keys(value) for key, value in data.items()} 20 | if isinstance(data, dict): 21 | return {key.replace(" ", ""): self.remove_whitespace_from_keys(value) for key, value in data.items()} 22 | elif isinstance(data, list): 23 | return [self.remove_whitespace_from_keys(item) for item in data] 24 | else: 25 | return data 26 | 27 | def remove_unicode(self, text): 28 | ''' 29 | remove unicode 30 | ''' 31 | 32 | return ''.join(char for char in unicodedata.normalize('NFKD', text) if not unicodedata.combining(char)) 33 | 34 | 35 | def remove_unicode_version(self,data) : 36 | processed_data = {} 37 | for key, value in data.items(): 38 | key = self.remove_unicode(key) 39 | if isinstance(value, str): 40 | processed_data[key] = self.remove_unicode(value) 41 | else: 42 | processed_data[key] = value 43 | 44 | return processed_data 45 | 46 | def unicode_to_str(self, match): 47 | unicode_str = match.group() 48 | return bytes(unicode_str, 'utf-8').decode('unicode_escape') 49 | 50 | def load_data(self) : 51 | ''' 52 | load prediction and ground truth 53 | ''' 54 | with open(self.prediction_path, 'r', encoding='utf-8-sig') as file: 55 | prediction = json.load(file) 56 | 57 | with open(self.groundTruth_path, 'r', encoding='utf-8-sig') as file: 58 | ground_truth = json.load(file) 59 | 60 | prediction = json.dumps(prediction, ensure_ascii=False) 61 | 62 | cleaned_string = re.sub(r'–', '-', prediction) 63 | cleaned_string = re.sub(r'−', '-', cleaned_string) 64 | cleaned_string = re.sub(r'', '', cleaned_string) 65 | cleaned_string = re.sub(r'', '', cleaned_string) 66 | cleaned_string = re.sub(r'\\u[0-9a-fA-F]{4}', self.unicode_to_str, cleaned_string) 67 | cleaned_string = re.sub(r'�', '±', cleaned_string) 68 | 
cleaned_string = re.sub(r'm2 g−1', 'm2/g', cleaned_string) 69 | cleaned_string = re.sub(r'μF cm−2', 'μF/cm2', cleaned_string) 70 | cleaned_string = re.sub(r'mA mg−2', 'mA/mg2', cleaned_string) 71 | cleaned_string = re.sub(r'mA mg−1', 'mA/mg', cleaned_string) 72 | cleaned_string = re.sub(r'mA cm−2', 'mA/cm2', cleaned_string) 73 | cleaned_string = re.sub(r'ohm', 'ω', cleaned_string) 74 | cleaned_string = re.sub(r'~', '∼', cleaned_string) 75 | cleaned_string = re.sub(r'', '', cleaned_string) 76 | cleaned_string = re.sub(r'²', '2', cleaned_string) 77 | cleaned_string = re.sub(r'\u2005', ' ', cleaned_string) 78 | cleaned_string = re.sub(r'\u2006', ' ', cleaned_string) 79 | cleaned_string = re.sub(r'\u2009', ' ', cleaned_string) 80 | cleaned_string = re.sub(r'\u200b', ' ', cleaned_string) 81 | cleaned_string = re.sub(r'\u202f', ' ', cleaned_string) 82 | cleaned_string = re.sub(r'\u200e', ' ', cleaned_string) 83 | cleaned_string = re.sub(r'0\s+0', '0,0', cleaned_string) 84 | cleaned_string = re.sub(r'(\d+)∼(\d+)', r'\1-\2', cleaned_string) 85 | cleaned_string = re.sub(r' ', '', cleaned_string) 86 | cleaned_string = re.sub(r'·', '', cleaned_string) 87 | cleaned_string = re.sub(r'fcm−2', 'f/cm2', cleaned_string) 88 | cleaned_string = cleaned_string.lower() 89 | cleaned_string = re.sub(r'jecsa', 'ecsa', cleaned_string) 90 | cleaned_string = re.sub(r'j-ecsa', 'ecsa', cleaned_string) 91 | cleaned_string = re.sub(r'ag-1', 'a/g', cleaned_string) 92 | cleaned_string = re.sub(r'ours', 'thiswork', cleaned_string) 93 | cleaned_string = re.sub(r'0\.m', '0m', cleaned_string) 94 | cleaned_string = re.sub(r';', '', cleaned_string) 95 | cleaned_string = re.sub(r'\.$', '', cleaned_string) 96 | prediction = json.loads(cleaned_string) 97 | 98 | ground_truth = json.dumps(ground_truth, ensure_ascii=False) 99 | cleaned_string = re.sub(r'–', '-', ground_truth) 100 | cleaned_string = re.sub(r'−', '-', cleaned_string) 101 | cleaned_string = re.sub(r'', '', cleaned_string) 102 | cleaned_string = re.sub(r'', '', cleaned_string) 103 | cleaned_string = re.sub(r'\\u[0-9a-fA-F]{4}', self.unicode_to_str, cleaned_string) 104 | cleaned_string = re.sub(r'�', '±', cleaned_string) 105 | cleaned_string = re.sub(r'm2 g−1', 'm2/g', cleaned_string) 106 | cleaned_string = re.sub(r'μF cm−2', 'μF/cm2', cleaned_string) 107 | cleaned_string = re.sub(r'mA mg−2', 'mA/mg2', cleaned_string) 108 | cleaned_string = re.sub(r'mA mg−1', 'mA/mg', cleaned_string) 109 | cleaned_string = re.sub(r'mA cm−2', 'mA/cm2', cleaned_string) 110 | cleaned_string = re.sub(r'ohm', 'ω', cleaned_string) 111 | cleaned_string = re.sub(r'~', '∼', cleaned_string) 112 | cleaned_string = re.sub(r'²', '2', cleaned_string) 113 | cleaned_string = re.sub(r'\u2005', ' ', cleaned_string) 114 | cleaned_string = re.sub(r'\u2006', ' ', cleaned_string) 115 | cleaned_string = re.sub(r'\u2009', ' ', cleaned_string) 116 | cleaned_string = re.sub(r'\u200b', ' ', cleaned_string) 117 | cleaned_string = re.sub(r'\u202f', ' ', cleaned_string) 118 | cleaned_string = re.sub(r'\u200e', ' ', cleaned_string) 119 | cleaned_string = re.sub(r'0\s+0', '0,0', cleaned_string) 120 | cleaned_string = re.sub(r'(\d+)∼(\d+)', r'\1-\2', cleaned_string) 121 | cleaned_string = re.sub(r' ', '', cleaned_string) 122 | cleaned_string = re.sub(r'·', '', cleaned_string) 123 | cleaned_string = re.sub(r'fcm−2', 'f/cm2', cleaned_string) 124 | cleaned_string = cleaned_string.lower() 125 | cleaned_string = re.sub(r'jecsa', 'ecsa', cleaned_string) 126 | cleaned_string = re.sub(r'j-ecsa', 'ecsa', cleaned_string) 127 | 
cleaned_string = re.sub(r'ag-1', 'a/g', cleaned_string) 128 | cleaned_string = re.sub(r'ours', 'thiswork', cleaned_string) 129 | cleaned_string = re.sub(r'0\.m', '0m', cleaned_string) 130 | cleaned_string = re.sub(r';', '', cleaned_string) 131 | cleaned_string = re.sub(r'\.$', '', cleaned_string) 132 | 133 | ground_truth = json.loads(cleaned_string) 134 | prediction = self.remove_unicode_version(prediction) 135 | ground_truth = self.remove_unicode_version(ground_truth) 136 | 137 | prediction = self.remove_whitespace_from_keys(prediction) 138 | ground_truth = self.remove_whitespace_from_keys(ground_truth) 139 | 140 | return prediction, ground_truth 141 | 142 | def get_key_list_with_value(self) : 143 | ''' 144 | return lists that contain key, value sets 145 | ''' 146 | prediction, ground_truth = self.load_data() 147 | pr_list = get_keys(prediction, parent_key = '', sep = '//') 148 | gt_list = get_keys(ground_truth, parent_key = '', sep = '//') 149 | 150 | return pr_list, gt_list 151 | 152 | def merging(self) : 153 | ''' 154 | combine dicts that have the same catalyst name 155 | ''' 156 | pr_list, gt_list = self.get_key_list_with_value() 157 | prediction, ground_truth = self.load_data() 158 | first_key_value_p = prediction[next(iter(prediction))] 159 | 160 | if isinstance(first_key_value_p, list): 161 | 162 | first_dict_p = first_key_value_p[0] 163 | first_value_p = self.first_key(first_dict_p) 164 | 165 | if first_value_p != { } : 166 | try : 167 | if dupl_catalyst(pr_list) : 168 | prediction_ = merging_result(first_key_value_p, pr_list) 169 | prediction = {} 170 | prediction["catalysts"] = prediction_ 171 | except : 172 | pass 173 | else : 174 | pass 175 | else : 176 | pass 177 | 178 | first_key_value_g = ground_truth[next(iter(ground_truth))] 179 | 180 | if isinstance(first_key_value_g, list): 181 | 182 | first_dict_g = first_key_value_g[0] 183 | first_value_g = self.first_key(first_dict_g) 184 | if first_value_g != { } : 185 | 186 | try : 187 | if dupl_catalyst(gt_list) : 188 | 189 | ground_truth_ = merging_result(first_key_value_g, gt_list) 190 | ground_truth = {} 191 | ground_truth["catalysts"] = ground_truth_ 192 | 193 | 194 | except : 195 | pass 196 | else : 197 | pass 198 | else : 199 | pass 200 | 201 | return prediction, ground_truth 202 | 203 | 204 | def first_key(self, d) : 205 | ''' 206 | return the value stored under the first key of a dict 207 | ''' 208 | first_key = next(iter(d)) 209 | 210 | first_value = d[first_key] 211 | return first_value 212 | 213 | def get_key_list_with_value_for_structure(self) : 214 | ''' 215 | return key lists after duplicated catalysts have been merged 216 | ''' 217 | prediction, ground_truth = self.merging() 218 | 219 | pr_list = get_keys(prediction, parent_key = '', sep = '//') 220 | gt_list = get_keys(ground_truth, parent_key = '', sep = '//') 221 | 222 | return pr_list, gt_list 223 | 224 | def run(self) : 225 | ''' 226 | calculate structure F1 score 227 | ''' 228 | TP = [] 229 | FP = [] 230 | FN = [] 231 | corrected = [] 232 | incorrected = [] 233 | 234 | pr_list, gt_list = self.get_key_list_with_value_for_structure() 235 | 236 | pr_list = [item for item in pr_list if 'condition' not in item] 237 | gt_list = [item for item in gt_list if 'condition' not in item] 238 | 239 | structure_pr = [] 240 | structure_gt = [] 241 | for i in pr_list : 242 | if '****' in i : 243 | structure_pr.append(i.split("****")[0]) 244 | else : 245 | if i != '' : 246 | structure_pr.append(i)
247 | 248 | for i in gt_list : 249 | if '****' in i : 250 | structure_gt.append(i.split("****")[0]) 251 | else : 252 | if i != '' : 253 | structure_gt.append(i) 254 | 255 | 256 | f1_pr = add_indices_to_duplicates(structure_pr) 257 | f1_gt = add_indices_to_duplicates(structure_gt) 258 | 259 | for vv in f1_pr : 260 | if vv in f1_gt : 261 | TP.append(vv) 262 | 263 | if vv not in f1_gt : 264 | FP.append(vv) 265 | 266 | for pp in f1_gt : 267 | if pp not in f1_pr : 268 | FN.append(pp) 269 | f1_score_l = [] 270 | 271 | f1_score = (len(TP) / (len(TP) + (1/2)*(len(FP) + len(FN)))) 272 | 273 | print(self.prediction_path.split('/')[-1]) 274 | print(f1_score) 275 | return f1_score, len(TP) 276 | 277 | def run2(self) : 278 | ''' 279 | calculate value accuracy 280 | ''' 281 | print(self.groundTruth_path) 282 | 283 | corrected = [] 284 | incorrected = [] 285 | prediction, ground_truth = self.merging() 286 | 287 | pr_list = get_keys_for_value_accuracy(prediction) 288 | gt_list = get_keys_for_value_accuracy(ground_truth) 289 | 290 | pr_list = [item for item in pr_list if 'condition' not in item] 291 | gt_list = [item for item in gt_list if 'condition' not in item] 292 | 293 | pr_value = [] 294 | gt_value = [] 295 | 296 | for i in pr_list : 297 | if '****' in i : 298 | pr_value.append(i) 299 | for j in gt_list : 300 | if '****' in j : 301 | gt_value.append(j) 302 | 303 | sep_pr_list = seperate_key_value(pr_value) 304 | sep_gt_list = seperate_key_value(gt_value) 305 | 306 | str_val_valset_pr = str_val_valset_split(sep_pr_list) 307 | str_val_valset_gt = str_val_valset_split(sep_gt_list) 308 | 309 | compare_gt = group_by_first_element(str_val_valset_gt) 310 | compare_pr = group_by_first_element(str_val_valset_pr) 311 | 312 | total = 0 313 | if not compare_gt and not compare_pr : 314 | 315 | corrected = [1] 316 | total = 1 317 | 318 | 319 | elif not compare_gt and compare_pr : 320 | right = len(gt_value) - len(pr_value) 321 | wrong = len(pr_value) 322 | for r in range(0, right) : 323 | corrected.append(r) 324 | for w in range(0, wrong) : 325 | incorrected.append(w) 326 | 327 | elif compare_gt and compare_pr : 328 | for c_pr in compare_pr : 329 | for c_gt in compare_gt : 330 | 331 | if c_gt[0][0] in flatten_list(c_pr): 332 | 333 | if len(c_gt) == len(c_pr) : 334 | total = total + len(c_gt) 335 | elif len(c_gt) < len(c_pr) : 336 | total = total + len(c_gt) 337 | elif len(c_gt) > len(c_pr) : 338 | total = total + len(c_pr) 339 | 340 | if len(c_gt) == len(c_pr) and len(c_gt) == 1 : 341 | if c_gt[0][1] == c_pr[0][1] : 342 | corrected.append(c_gt) 343 | else : 344 | incorrected.append(c_gt) 345 | else: 346 | gt_valset = [] 347 | pr_valset = [] 348 | 349 | for gt_unit in c_gt : 350 | if gt_unit[-1] == '': 351 | gt_unit[-1] = gt_unit[1] 352 | gt_valset.append(gt_unit) 353 | else : 354 | gt_valset.append(gt_unit[-1].split('++')) 355 | for pr_unit in c_pr : 356 | if pr_unit[-1] == '': 357 | pr_unit[-1] = pr_unit[1] 358 | pr_valset.append(pr_unit) 359 | else : 360 | pr_valset.append(pr_unit[-1].split('++')) 361 | 362 | while not all_element_int(gt_valset) : 363 | pair = (-1,-1) 364 | max_dupl = float('-inf') 365 | for index_gt, value_gt in enumerate(gt_valset) : 366 | for index_pr, value_pr in enumerate(pr_valset) : 367 | if finding_pair(value_gt, value_pr) > 0 : 368 | if finding_pair(value_gt, value_pr) > max_dupl : 369 | max_dupl = finding_pair(value_gt, value_pr) 370 | pair = (index_gt, index_pr) 371 | 372 | if pair != (-1, -1): 373 | lenlen = len(gt_valset[pair[0]]) 374 | 375 | if c_gt[pair[0]][1] == c_pr[pair[1]][1] : 376
| corrected.append(c_gt[pair[0]][1]) 377 | 378 | else : 379 | incorrected.append(c_gt[pair[0]][1]) 380 | 381 | gt_valset[pair[0]] = [random.randint(0, 1000000) for _ in range(lenlen)] 382 | pr_valset[pair[1]] = [random.randint(0, 1000000) for _ in range(lenlen)] 383 | 384 | 385 | c_gt[pair[0]] = [random.randint(0, 1000000) for _ in range(lenlen)] 386 | c_pr[pair[1]] = [random.randint(0, 1000000) for _ in range(lenlen)] 387 | 388 | else : 389 | break 390 | 391 | total_value_accuracy = len(corrected) / total 392 | return total_value_accuracy 393 | 394 | 395 | 396 | 397 | 398 | if __name__ == '__main__': 399 | 400 | prediction_folder = 'PREDICTION FOLDER PATH' 401 | groundTruth_folder = 'GROUND TRUTH FOLDER PATH' 402 | 403 | 404 | test_list = os.listdir(prediction_folder) 405 | value_ = [] 406 | error = [] 407 | 408 | # CALCULATE STRUCTURE F1 SCORE 409 | for file in test_list : 410 | try : 411 | prediction_path = prediction_folder + '/' + file 412 | groundTruth_path = groundTruth_folder + '/' + file 413 | 414 | score = evaluation(prediction_path, groundTruth_path) 415 | # ##################### value evaluation ###################### 416 | # value = score.run2() 417 | # if value != 0 : 418 | # value_.append(value) 419 | 420 | value, _ = score.run() 421 | value_.append(value) 422 | 423 | except : 424 | value_.append(0) 425 | error.append(file) 426 | 427 | final_value = (sum(value_)) / len(value_) 428 | print('===========final value===========') 429 | print(final_value) 430 | 431 | value_ = [] 432 | # CALCULATE VALUE ACCURACY 433 | for file in test_list : 434 | prediction_path = prediction_folder + '/' + file 435 | groundTruth_path = groundTruth_folder + '/' + file 436 | 437 | score = evaluation(prediction_path, groundTruth_path) 438 | value = score.run2() 439 | value_.append(value) 440 | 441 | final_value = (sum(value_)) / len(value_) 442 | print('===========final value===========') 443 | print(final_value) 444 | -------------------------------------------------------------------------------- /model_evaluation/utils/__pycache__/functions.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/model_evaluation/utils/__pycache__/functions.cpython-37.pyc
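evaluation.py scores a prediction against its ground truth in two ways: run() flattens both JSONs into '//'-delimited key paths (leaf values attached after '****'), strips the value suffixes, and computes a structure F1 = TP / (TP + 0.5·(FP + FN)) over the remaining paths; run2() then matches the '****' value strings for value accuracy. A minimal worked example (not part of the repository; the key paths are illustrative):

# sketch: structure F1 over flattened key paths, as in evaluation.run()
pred = {"Co-BDC//overpotential", "Co-BDC//Tafel slope", "Co-BDC//cost"}
gold = {"Co-BDC//overpotential", "Co-BDC//Tafel slope", "Co-BDC//durability"}

tp = len(pred & gold)              # 2 paths present in both
fp = len(pred - gold)              # 1 spurious path ("cost")
fn = len(gold - pred)              # 1 missed path ("durability")
f1 = tp / (tp + 0.5 * (fp + fn))   # 2 / 3 ≈ 0.67
print(round(f1, 2))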
-------------------------------------------------------------------------------- /model_evaluation/utils/__pycache__/get_keys.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/model_evaluation/utils/__pycache__/get_keys.cpython-37.pyc -------------------------------------------------------------------------------- /model_evaluation/utils/__pycache__/get_keys_function.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/model_evaluation/utils/__pycache__/get_keys_function.cpython-37.pyc -------------------------------------------------------------------------------- /model_evaluation/utils/__pycache__/get_keys_function.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/model_evaluation/utils/__pycache__/get_keys_function.cpython-38.pyc -------------------------------------------------------------------------------- /model_evaluation/utils/functions.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import re 3 | import json 4 | 5 | def get_keys(d, parent_key='', sep='//'): 6 | keys = [] 7 | key_list2 = [] 8 | for k, v in d.items(): 9 | new_key = parent_key+sep+k if parent_key else k 10 | 11 | if isinstance(v, list): 12 | if k == 'ref' : 13 | new_key = new_key + '****' + str(v) 14 | keys.append(new_key) 15 | 16 | else : 17 | keys.append(new_key) 18 | for i in v : 19 | if type(i) != str and type(i) != float and type(i) != int: 20 | keys.extend(get_keys(i, new_key, sep=sep)) 21 | else : 22 | new_key = new_key 23 | if isinstance(v, dict): 24 | keys.append(new_key) 25 | keys.extend(get_keys(v, new_key, sep=sep)) 26 | 27 | if type(v) == str or type(v) == float or type(v) == int: 28 | new_key = new_key + '****' + str(v) 29 | keys.append(new_key) 30 | for j in keys : 31 | if j.split("//")[0] == "catalysts" or j.split("//")[0] == "catalyst" : 32 | j = j.split('//') 33 | j = "//".join(j[1:]) 34 | key_list2.append(j) 35 | else : 36 | key_list2.append(j) 37 | 38 | return key_list2 39 | 40 | def add_indices_to_duplicates(input_list): 41 | index_dict = defaultdict(list) 42 | 43 | for index, item in enumerate(input_list): 44 | index_dict[item].append(index) 45 | 46 | output_list = [] 47 | 48 | for item, indices in index_dict.items(): 49 | if len(indices) > 1: 50 | for i, index in enumerate(indices, 1): 51 | modified_item = item.replace('//', f'//(index{i})') 52 | else: 53 | modified_item = item 54 | output_list.append(modified_item) 55 | else: 56 | output_list.append(item) 57 | 58 | return output_list 59 | 60 | def contains_list(data): 61 | if isinstance(data, list): 62 | return True 63 | if isinstance(data, dict): 64 | for key, value in data.items(): 65 | if contains_list(value): 66 | return True 67 | return False 68 | 69 | def get_keys_for_value_accuracy(d, parent_key='', sep='//'): 70 | keys = [] 71 | key_list2 = [] 72 | for k, v in d.items(): 73 | new_key = parent_key + sep + k if parent_key else k 74 | 75 | if isinstance(v, list): 76 | keys.append(new_key) 77 | for i in v: 78 | if contains_list(i) == False : 79 | new_list = [] 80 | value_list = [] 81 | for performance_property, performance_value in 
i.items(): 82 | if performance_property != 'condition' : 83 | if isinstance(performance_value, str): 84 | 85 | value_list.append(performance_value) 86 | 87 | elif isinstance(performance_value, dict): 88 | for p_p, p_v in performance_value.items(): 89 | if isinstance(p_v, str): 90 | 91 | value_list.append(p_v) 92 | 93 | 94 | total_value = '(('+'++'.join(value_list) + '))' 95 | 96 | new_dict = {} 97 | for kkk, vvv in i.items(): 98 | if isinstance(vvv, str): 99 | new_dict[kkk+total_value] = vvv 100 | elif isinstance(vvv, dict): 101 | kkk +=total_value 102 | new_dict[kkk] = {} 103 | 104 | for kk, vv in vvv.items(): 105 | new_dict[kkk][kk+total_value] = vv 106 | new_list.append(new_dict) 107 | changed_dict = new_dict 108 | 109 | keys.extend(get_keys_for_value_accuracy(changed_dict, new_key, sep=sep)) 110 | 111 | if contains_list(i) ==True : # 112 | keys.extend(get_keys_for_value_accuracy(i, new_key, sep=sep)) 113 | 114 | 115 | if isinstance(v, dict): 116 | keys.append(new_key) 117 | keys.extend(get_keys_for_value_accuracy(v, new_key, sep=sep)) 118 | 119 | if type(v) == str: 120 | new_key = new_key + '****' + v 121 | keys.append(new_key) 122 | 123 | for j in keys: 124 | if j.split("//")[0] == "catalysts" or j.split("//")[0] == "catalyst": 125 | j = j.split('//') 126 | j = "//".join(j[1:]) 127 | key_list2.append(j) 128 | else: 129 | key_list2.append(j) 130 | 131 | return key_list2 132 | 133 | def seperate_key_value(input_list): 134 | pattern = re.compile(r'\(\((.*?)\)\)') 135 | new_list = [] 136 | for item in input_list: 137 | match = pattern.search(item) 138 | if match: 139 | content = match.group(1) 140 | result = re.sub(pattern, '', item) 141 | new_list.append([result, content]) 142 | else: 143 | new_list.append([item]) 144 | 145 | return new_list 146 | 147 | def remove_whitespace_from_keys(data): 148 | if isinstance(data, dict): 149 | return {key.replace(" ", ""): remove_whitespace_from_keys(value) for key, value in data.items()} 150 | elif isinstance(data, list): 151 | return [remove_whitespace_from_keys(item) for item in data] 152 | else: 153 | return data 154 | 155 | def str_val_valset_split(list) : 156 | new_list = [] 157 | for item in list: 158 | a = [] 159 | for i in item : 160 | substrings = i.split('****') 161 | a.append(substrings[0]) 162 | 163 | if len(substrings) > 1: 164 | a.extend(substrings[1].split('++')) 165 | 166 | new_list.append(a) 167 | 168 | return new_list 169 | 170 | def group_by_first_element(list1): 171 | result = {} 172 | for sublist in list1: 173 | key = sublist[0] 174 | if key in result: 175 | result[key].append(sublist) 176 | else: 177 | result[key] = [sublist] 178 | return list(result.values()) 179 | 180 | 181 | def finding_pair(list1, list2) : 182 | intersection = len(set(list1) & set(list2)) 183 | return intersection 184 | 185 | 186 | def all_element_int( lst): 187 | 188 | for element in lst: 189 | if isinstance(element, list): 190 | if not all_element_int(element): 191 | return False 192 | elif not isinstance(element, int): 193 | return False 194 | return True 195 | 196 | def catalyst_performance(gt_list): 197 | gt_v = [j for j in gt_list if '****' in j] 198 | result_list_ = [k for k in gt_v if 'ref' not in k] 199 | result_list__ = [k for k in result_list_ if 'loading' not in k] 200 | result_list_ref = [k for k in gt_v if 'ref' in k] 201 | result_list_loading = [k for k in gt_v if 'loading' in k] 202 | result_list = [] 203 | for item in result_list__ : 204 | a = item.split('//')[:-1] 205 | b = '//'.join(a) 206 | result_list.append(b) 207 | no_dupl = 
list(set(result_list)) 208 | return no_dupl, result_list_ref, result_list_loading 209 | 210 | def count_number(catalyst_list, first, second ) : 211 | count_ = [] 212 | for index, cata in enumerate(catalyst_list): 213 | if first in cata and second in cata[first] and cata[first][second]: 214 | count_.append(index) 215 | return count_ 216 | 217 | def making_new_dict(catalyst_list, first, second, count_lst): 218 | new_dict = {} 219 | new_value = [] 220 | 221 | if len(count_lst) > 1 : 222 | for value_index in count_lst : 223 | new_value.append(catalyst_list[value_index][first][second]) 224 | 225 | if any(isinstance(sublist, list) for sublist in new_value) : 226 | new_value = [item for sublist in new_value for item in (sublist if isinstance(sublist, list) else [sublist])] 227 | new_dict.setdefault(first, {})[second] = new_value 228 | 229 | else : 230 | new_dict = catalyst_list[count_lst[0]] 231 | return new_dict 232 | 233 | def dupl_catalyst(lst) : 234 | catalyst = [] 235 | for i in lst : 236 | if '//' not in i : 237 | catalyst.append(i) 238 | 239 | if len(catalyst) != len(list(set(catalyst))) : 240 | return True 241 | 242 | else : 243 | return False 244 | 245 | def merging_result(catalyst_list, lst): 246 | no_dupl, result_list_ref, result_list_loading = catalyst_performance(lst) 247 | new_result = [] 248 | for cata_perfo in no_dupl : 249 | f_s_lst = cata_perfo.split('//') 250 | 251 | if len(f_s_lst) >1 : 252 | first = f_s_lst[0] 253 | second = f_s_lst[1] 254 | count = count_number(catalyst_list, first, second) 255 | new_dict = making_new_dict(catalyst_list, first,second, count) 256 | new_result.append(new_dict) 257 | 258 | ref_dict = {} 259 | loading_dict = {} 260 | 261 | for ref in result_list_ref : 262 | 263 | if '//' in ref : 264 | ref_catalyst = ref.split('//')[0] 265 | reference = str(ref.split('****')[-1]) 266 | ref_dict.setdefault(ref_catalyst, {})['ref'] = reference 267 | if ref_dict not in new_result : 268 | new_result.append(ref_dict) 269 | 270 | else : 271 | reference = str(ref.split('****')[-1]) 272 | ref_dict["ref"] = reference 273 | new_result.append(ref_dict) 274 | 275 | for lo in result_list_loading : 276 | if '//' in lo : 277 | lo_catalyst = lo.split('//')[0] 278 | loading = str(lo.split('****')[-1]) 279 | loading_dict.setdefault(lo_catalyst, {})["loading"] = loading 280 | new_result.append(loading_dict) 281 | else : 282 | loading = str(lo.split('****')[-1]) 283 | loading_dict["loading"] = loading 284 | new_result.append(loading_dict) 285 | result_dict = {} 286 | for item in new_result: 287 | key = next(iter(item)) 288 | if key in result_dict: 289 | result_dict[key].update(item[key]) 290 | else: 291 | result_dict[key] = item[key] 292 | result_list = [{key: value} for key, value in result_dict.items()] 293 | return result_list 294 | 295 | def flatten_list(nested_list): 296 | flat_list = [] 297 | for element in nested_list: 298 | if isinstance(element, list): 299 | flat_list.extend(flatten_list(element)) 300 | else: 301 | flat_list.append(element) 302 | return flat_list 303 | -------------------------------------------------------------------------------- /model_script.json: -------------------------------------------------------------------------------- 1 | { 2 | "model" :{ 3 | "fine_tuning" : { 4 | "input_path" : "./data/split/tsv_representation/", 5 | "output_path" : "./data/result/fine_tuning", 6 | "fq" : { 7 | "path" : "./data/result/fine_tuning" 8 | } 9 | }, 10 | "few_shot" : { 11 | "input_path" : "./data/split/tsv_representation/", 12 | "output_path" :
"./data/result/few_shot", 13 | "fq" : { 14 | "path" : "./data/result/few_shot" 15 | } 16 | }, 17 | "zero_shot" : { 18 | "input_path" : "./data/split/tsv_representation/", 19 | "output_path" : "./data/result/zero_shot", 20 | "fq" : { 21 | "path" : "./data/result/zero_shot" 22 | } 23 | } 24 | } 25 | } -------------------------------------------------------------------------------- /requirements_conda.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: win-64 4 | absl-py=0.14.1=pypi_0 5 | aiohttp=3.8.1=pypi_0 6 | aiosignal=1.2.0=pypi_0 7 | alabaster=0.7.12=pypi_0 8 | allennlp=0.9.0=pypi_0 9 | altair=4.2.2=pypi_0 10 | annotated-types=0.5.0=pypi_0 11 | anyio=3.7.1=pypi_0 12 | appdirs=1.4.4=pypi_0 13 | argcomplete=1.12.3=pypi_0 14 | argon2-cffi=21.1.0=pypi_0 15 | astunparse=1.6.3=pypi_0 16 | async-generator=1.10=pypi_0 17 | async-timeout=4.0.1=pypi_0 18 | asynctest=0.13.0=pypi_0 19 | atomicwrites=1.4.0=pypi_0 20 | attrs=22.2.0=pypi_0 21 | babel=2.9.1=pypi_0 22 | backcall=0.2.0=pypi_0 23 | backports-csv=1.0.7=pypi_0 24 | backports-zoneinfo=0.2.1=pypi_0 25 | base58=2.1.1=pypi_0 26 | beautifulsoup4=4.11.2=pypi_0 27 | behave=1.2.6=pypi_0 28 | bertopic=0.14.1=pypi_0 29 | blas=1.0=mkl 30 | bleach=4.1.0=pypi_0 31 | blinker=1.6=pypi_0 32 | blis=0.2.4=pypi_0 33 | boto3=1.19.2=pypi_0 34 | botocore=1.22.2=pypi_0 35 | bottleneck=1.3.2=py37h2a96729_1 36 | bpemb=0.3.3=pypi_0 37 | ca-certificates=2021.9.30=haa95532_1 38 | cached-path=0.3.2=pypi_0 39 | cached-property=1.5.2=pypi_0 40 | cachetools=4.2.4=pypi_0 41 | catalogue=2.0.6=pypi_0 42 | certifi=2022.12.7=pypi_0 43 | cffi=1.15.1=pypi_0 44 | chardet=5.1.0=pypi_0 45 | charset-normalizer=2.1.1=pypi_0 46 | checklist=0.0.11=pypi_0 47 | chemdataextractor=1.3.0=pypi_0 48 | cheroot=8.5.2=pypi_0 49 | cherrypy=18.6.1=pypi_0 50 | chromedriver-autoinstaller=0.4.0=pypi_0 51 | click=8.0.3=pypi_0 52 | cloudpickle=2.0.0=pypi_0 53 | colorama=0.4.6=pypi_0 54 | configparser=5.1.0=pypi_0 55 | conllu=4.0=pypi_0 56 | cryptography=39.0.1=pypi_0 57 | cssselect=1.2.0=pypi_0 58 | cycler=0.10.0=pypi_0 59 | cymem=2.0.6=pypi_0 60 | cython=0.29.14=pypi_0 61 | datasets=1.16.1=pypi_0 62 | dawg=0.8.0=pypi_0 63 | debugpy=1.5.1=pypi_0 64 | decorator=5.1.0=pypi_0 65 | deepdiff=6.5.0=pypi_0 66 | defusedxml=0.7.1=pypi_0 67 | deprecated=1.2.13=pypi_0 68 | dill=0.3.4=pypi_0 69 | distro=1.9.0=pypi_0 70 | docker-pycreds=0.4.0=pypi_0 71 | docutils=0.17.1=pypi_0 72 | editdistance=0.6.0=pypi_0 73 | entrypoints=0.3=pypi_0 74 | et-xmlfile=1.1.0=pypi_0 75 | exceptiongroup=1.1.0=pypi_0 76 | execnet=1.9.0=pypi_0 77 | fairscale=0.4.0=pypi_0 78 | fasteners=0.18=pypi_0 79 | feedparser=6.0.8=pypi_0 80 | filelock=3.9.0=pypi_0 81 | flair=0.9=pypi_0 82 | flaky=3.7.0=pypi_0 83 | flask=2.0.2=pypi_0 84 | flask-cors=3.0.10=pypi_0 85 | flatbuffers=1.12=pypi_0 86 | flatten-dict=0.4.2=pypi_0 87 | frozenlist=1.2.0=pypi_0 88 | fsspec=2021.11.1=pypi_0 89 | ftfy=6.0.3=pypi_0 90 | funcy=2.0=pypi_0 91 | future=0.18.2=pypi_0 92 | gast=0.3.3=pypi_0 93 | gdown=3.12.2=pypi_0 94 | gensim=3.8.3=pypi_0 95 | gevent=21.8.0=pypi_0 96 | gitdb=4.0.9=pypi_0 97 | gitpython=3.1.24=pypi_0 98 | glove-python-binary=0.2.0=pypi_0 99 | google-api-core=2.2.2=pypi_0 100 | google-auth=2.3.0=pypi_0 101 | google-auth-oauthlib=0.4.6=pypi_0 102 | google-cloud-core=2.2.1=pypi_0 103 | google-cloud-storage=1.43.0=pypi_0 104 | google-crc32c=1.3.0=pypi_0 105 | google-pasta=0.2.0=pypi_0 106 | 
google-resumable-media=2.1.0=pypi_0 107 | googleapis-common-protos=1.53.0=pypi_0 108 | greenlet=1.1.2=pypi_0 109 | grpcio=1.32.0=pypi_0 110 | h11=0.14.0=pypi_0 111 | h2o=3.40.0.4=pypi_0 112 | h5py=2.10.0=pypi_0 113 | hdbscan=0.8.29=pypi_0 114 | httpcore=0.17.3=pypi_0 115 | httpx=0.24.1=pypi_0 116 | huggingface-hub=0.13.3=pypi_0 117 | hyperopt=0.2.5=pypi_0 118 | idna=3.4=pypi_0 119 | imagesize=1.2.0=pypi_0 120 | importlib-metadata=6.6.0=pypi_0 121 | importlib-resources=5.4.0=pypi_0 122 | iniconfig=2.0.0=pypi_0 123 | install-jdk=1.0.4=pypi_0 124 | intel-openmp=2021.3.0=haa95532_3372 125 | ipykernel=6.5.1=pypi_0 126 | ipython=7.30.0=pypi_0 127 | ipython-genutils=0.2.0=pypi_0 128 | ipywidgets=7.6.5=pypi_0 129 | iso-639=0.4.5=pypi_0 130 | itsdangerous=2.0.1=pypi_0 131 | janome=0.4.1=pypi_0 132 | jaraco-classes=3.2.1=pypi_0 133 | jaraco-collections=3.4.0=pypi_0 134 | jaraco-functools=3.4.0=pypi_0 135 | jaraco-text=3.6.0=pypi_0 136 | jedi=0.18.1=pypi_0 137 | jinja2=3.0.2=pypi_0 138 | jmespath=0.10.0=pypi_0 139 | joblib=1.1.0=pypi_0 140 | jsonlines=3.1.0=pypi_0 141 | jsonpickle=2.0.0=pypi_0 142 | jsonschema=4.2.1=pypi_0 143 | jupyter=1.0.0=pypi_0 144 | jupyter-client=7.1.0=pypi_0 145 | jupyter-console=6.4.0=pypi_0 146 | jupyter-core=4.9.1=pypi_0 147 | jupyterlab-pygments=0.1.2=pypi_0 148 | jupyterlab-widgets=1.0.2=pypi_0 149 | keras-preprocessing=1.1.2=pypi_0 150 | keyring=23.9.3=pypi_0 151 | kiwisolver=1.3.2=pypi_0 152 | konoha=4.6.5=pypi_0 153 | langdetect=1.0.9=pypi_0 154 | llvmlite=0.39.1=pypi_0 155 | lmdb=1.2.1=pypi_0 156 | lxml=4.6.3=pypi_0 157 | markdown=3.3.4=pypi_0 158 | markdown-it-py=2.1.0=pypi_0 159 | markupsafe=2.0.1=pypi_0 160 | matplotlib=3.4.3=pypi_0 161 | matplotlib-inline=0.1.3=pypi_0 162 | mdurl=0.1.2=pypi_0 163 | mistune=0.8.4=pypi_0 164 | mkl=2021.3.0=haa95532_524 165 | mkl-service=2.4.0=py37h2bbff1b_0 166 | mkl_fft=1.3.0=py37h277e83a_2 167 | mkl_random=1.2.2=py37hf11a4ad_0 168 | mlxtend=0.23.1=pypi_0 169 | more-itertools=9.0.0=pypi_0 170 | mpld3=0.3=pypi_0 171 | multidict=5.2.0=pypi_0 172 | multiprocess=0.70.12.2=pypi_0 173 | munch=2.5.0=pypi_0 174 | murmurhash=1.0.6=pypi_0 175 | nbclient=0.5.9=pypi_0 176 | nbconvert=6.3.0=pypi_0 177 | nbformat=5.1.3=pypi_0 178 | nest-asyncio=1.5.1=pypi_0 179 | networkx=2.6.3=pypi_0 180 | nltk=3.6.5=pypi_0 181 | nose=1.3.7=pypi_0 182 | notebook=6.4.6=pypi_0 183 | numba=0.56.4=pypi_0 184 | numexpr=2.7.3=py37hb80d3ca_1 185 | numpy=1.19.5=pypi_0 186 | numpy-base=1.21.2=py37h0829f74_0 187 | numpydoc=1.1.0=pypi_0 188 | oauthlib=3.1.1=pypi_0 189 | openai=1.16.2=pypi_0 190 | openpyxl=3.0.9=pypi_0 191 | openssl=1.1.1l=h2bbff1b_0 192 | opt-einsum=3.3.0=pypi_0 193 | ordered-set=4.1.0=pypi_0 194 | outcome=1.2.0=pypi_0 195 | overrides=3.1.0=pypi_0 196 | packaging=23.0=pypi_0 197 | pandas=1.3.3=py37h6214cd6_0 198 | pandocfilters=1.5.0=pypi_0 199 | parameterized=0.8.1=pypi_0 200 | parse=1.19.0=pypi_0 201 | parse-type=0.6.0=pypi_0 202 | parsimonious=0.8.1=pypi_0 203 | parso=0.8.2=pypi_0 204 | pathtools=0.1.2=pypi_0 205 | pathy=0.6.1=pypi_0 206 | patternfork-nosql=3.6=pypi_0 207 | pdbp=1.2.8=pypi_0 208 | pdfminer=20191125=pypi_0 209 | pdfminer-six=20211012=pypi_0 210 | pickle5=0.0.12=pypi_0 211 | pickleshare=0.7.5=pypi_0 212 | pillow=8.4.0=pypi_0 213 | pip=24.0=pypi_0 214 | plac=0.9.6=pypi_0 215 | platformdirs=3.0.0=pypi_0 216 | plotly=5.14.1=pypi_0 217 | pluggy=1.0.0=pypi_0 218 | portend=3.1.0=pypi_0 219 | preshed=2.0.1=pypi_0 220 | prometheus-client=0.12.0=pypi_0 221 | promise=2.3=pypi_0 222 | prompt-toolkit=3.0.23=pypi_0 223 | protobuf=3.18.1=pypi_0 224 | 
psutil=5.8.0=pypi_0 225 | py=1.11.0=pypi_0 226 | pyarrow=6.0.1=pypi_0 227 | pyasn1=0.4.8=pypi_0 228 | pyasn1-modules=0.2.8=pypi_0 229 | pycparser=2.21=pypi_0 230 | pycryptodome=3.11.0=pypi_0 231 | pydantic=2.5.3=pypi_0 232 | pydantic-core=2.14.6=pypi_0 233 | pydeck=0.8.0=pypi_0 234 | pygments=2.14.0=pypi_0 235 | pyldavis=3.3.1=pypi_0 236 | pympler=1.0.1=pypi_0 237 | pynndescent=0.5.10=pypi_0 238 | pyopenssl=23.0.0=pypi_0 239 | pyotp=2.8.0=pypi_0 240 | pyparsing=2.4.7=pypi_0 241 | pyreadline3=3.4.1=pypi_0 242 | pyrsistent=0.18.0=pypi_0 243 | pysocks=1.7.1=pypi_0 244 | pytest=7.2.1=pypi_0 245 | pytest-forked=1.6.0=pypi_0 246 | pytest-html=2.0.1=pypi_0 247 | pytest-metadata=2.0.4=pypi_0 248 | pytest-ordering=0.6=pypi_0 249 | pytest-rerunfailures=11.1.1=pypi_0 250 | pytest-xdist=3.2.0=pypi_0 251 | python=3.7.11=h6244533_0 252 | python-crfsuite=0.9.7=pypi_0 253 | python-dateutil=2.8.2=pyhd3eb1b0_0 254 | python-docx=0.8.11=pypi_0 255 | python-dotenv=0.21.1=pypi_0 256 | pytorch-pretrained-bert=0.6.2=pypi_0 257 | pytorch-transformers=1.1.0=pypi_0 258 | pytz=2021.3=pyhd3eb1b0_0 259 | pytz-deprecation-shim=0.1.0.post0=pypi_0 260 | pywin32=302=pypi_0 261 | pywin32-ctypes=0.2.0=pypi_0 262 | pywinpty=1.1.6=pypi_0 263 | pyyaml=6.0=pypi_0 264 | pyzmq=22.3.0=pypi_0 265 | qtconsole=5.2.1=pypi_0 266 | qtpy=1.11.2=pypi_0 267 | regex=2021.10.8=pypi_0 268 | requests=2.28.2=pypi_0 269 | requests-oauthlib=1.3.0=pypi_0 270 | requests-toolbelt=0.10.1=pypi_0 271 | responses=0.14.0=pypi_0 272 | rich=13.3.1=pypi_0 273 | rsa=4.7.2=pypi_0 274 | s3transfer=0.5.0=pypi_0 275 | sacremoses=0.0.46=pypi_0 276 | sbvirtualdisplay=1.2.0=pypi_0 277 | schematics=2.1.1=pypi_0 278 | scikit-learn=1.0.2=pypi_0 279 | scipy=1.7.1=pypi_0 280 | segtok=1.5.10=pypi_0 281 | selenium=4.8.2=pypi_0 282 | seleniumbase=4.13.0=pypi_0 283 | semver=3.0.0=pypi_0 284 | send2trash=1.8.0=pypi_0 285 | sentence-transformers=2.2.2=pypi_0 286 | sentencepiece=0.1.95=pypi_0 287 | sentry-sdk=1.5.0=pypi_0 288 | seqeval=1.2.2=pypi_0 289 | setuptools=67.3.3=pypi_0 290 | sgmllib3k=1.0.0=pypi_0 291 | shortuuid=1.0.8=pypi_0 292 | simpletransformers=0.63.9=pypi_0 293 | six=1.15.0=pypi_0 294 | sklearn=0.0.post5=pypi_0 295 | smart-open=5.2.1=pypi_0 296 | smmap=5.0.0=pypi_0 297 | sniffio=1.3.0=pypi_0 298 | snowballstemmer=2.1.0=pypi_0 299 | sortedcontainers=2.4.0=pypi_0 300 | soupsieve=2.4=pypi_0 301 | spacy=2.1.9=pypi_0 302 | spacy-legacy=3.0.8=pypi_0 303 | sphinx=4.2.0=pypi_0 304 | sphinxcontrib-applehelp=1.0.2=pypi_0 305 | sphinxcontrib-devhelp=1.0.2=pypi_0 306 | sphinxcontrib-htmlhelp=2.0.0=pypi_0 307 | sphinxcontrib-jsmath=1.0.1=pypi_0 308 | sphinxcontrib-qthelp=1.0.3=pypi_0 309 | sphinxcontrib-serializinghtml=1.1.5=pypi_0 310 | sqlite=3.36.0=h2bbff1b_0 311 | sqlitedict=1.7.0=pypi_0 312 | sqlparse=0.4.2=pypi_0 313 | srsly=1.0.5=pypi_0 314 | streamlit=1.20.0=pypi_0 315 | subprocess32=3.5.4=pypi_0 316 | tabcompleter=1.1.0=pypi_0 317 | tabulate=0.8.9=pypi_0 318 | tempora=4.1.2=pypi_0 319 | tenacity=8.2.2=pypi_0 320 | tensorboard=2.7.0=pypi_0 321 | tensorboard-data-server=0.6.1=pypi_0 322 | tensorboard-plugin-wit=1.8.0=pypi_0 323 | tensorboardx=2.4=pypi_0 324 | tensorflow-estimator=2.4.0=pypi_0 325 | tensorflow-gpu=2.4.1=pypi_0 326 | termcolor=1.1.0=pypi_0 327 | terminado=0.12.1=pypi_0 328 | testpath=0.5.0=pypi_0 329 | thinc=7.0.8=pypi_0 330 | threadpoolctl=3.0.0=pypi_0 331 | tokenizers=0.13.2=pypi_0 332 | toml=0.10.2=pypi_0 333 | tomli=2.0.1=pypi_0 334 | toolz=0.12.0=pypi_0 335 | torch=1.7.1+cu110=pypi_0 336 | torchaudio=0.7.2=pypi_0 337 | torchcrf=1.1.0=pypi_0 338 | 
torchvision=0.8.2+cu110=pypi_0 339 | tornado=6.1=pypi_0 340 | tqdm=4.64.1=pypi_0 341 | traitlets=5.1.1=pypi_0 342 | transformers=4.27.4=pypi_0 343 | trio=0.22.0=pypi_0 344 | trio-websocket=0.9.2=pypi_0 345 | typer=0.4.0=pypi_0 346 | typing-extensions=4.7.1=pypi_0 347 | tzdata=2023.3=pypi_0 348 | tzlocal=4.3=pypi_0 349 | umap-learn=0.5.3=pypi_0 350 | unidecode=1.3.2=pypi_0 351 | urllib3=1.26.14=pypi_0 352 | validators=0.20.0=pypi_0 353 | vc=14.2=h21ff451_1 354 | vs2015_runtime=14.27.29016=h5e58377_2 355 | wandb=0.12.7=pypi_0 356 | wasabi=0.8.2=pypi_0 357 | watchdog=3.0.0=pypi_0 358 | wcwidth=0.2.5=pypi_0 359 | webdriver-manager=3.8.5=pypi_0 360 | webencodings=0.5.1=pypi_0 361 | websockets=10.4=pypi_0 362 | werkzeug=2.0.2=pypi_0 363 | wheel=0.38.4=pypi_0 364 | widgetsnbextension=3.5.2=pypi_0 365 | wikipedia-api=0.5.4=pypi_0 366 | wincertstore=0.2=py37haa95532_2 367 | word2number=1.1=pypi_0 368 | wrapt=1.12.1=pypi_0 369 | wsproto=1.2.0=pypi_0 370 | xxhash=2.0.2=pypi_0 371 | yarl=1.7.2=pypi_0 372 | yaspin=2.1.0=pypi_0 373 | zc-lockfile=2.0=pypi_0 374 | zipp=3.14.0=pypi_0 375 | zope-event=4.5.0=pypi_0 376 | zope-interface=5.4.0=pypi_0 377 | -------------------------------------------------------------------------------- /requirements_pip.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.14.1 2 | aiohttp==3.8.1 3 | aiosignal==1.2.0 4 | alabaster==0.7.12 5 | allennlp==0.9.0 6 | altair==4.2.2 7 | annotated-types==0.5.0 8 | anyio==3.7.1 9 | appdirs==1.4.4 10 | argcomplete==1.12.3 11 | argon2-cffi==21.1.0 12 | astunparse==1.6.3 13 | async-generator==1.10 14 | async-timeout==4.0.1 15 | asynctest==0.13.0 16 | atomicwrites==1.4.0 17 | attrs==22.2.0 18 | Babel==2.9.1 19 | backcall==0.2.0 20 | backports.csv==1.0.7 21 | backports.zoneinfo==0.2.1 22 | base58==2.1.1 23 | beautifulsoup4==4.11.2 24 | behave==1.2.6 25 | bertopic==0.14.1 26 | bleach==4.1.0 27 | blinker==1.6 28 | blis==0.2.4 29 | boto3==1.19.2 30 | botocore==1.22.2 31 | Bottleneck==1.3.2 32 | bpemb==0.3.3 33 | cached-path==0.3.2 34 | cached-property==1.5.2 35 | cachetools==4.2.4 36 | catalogue==2.0.6 37 | certifi==2022.12.7 38 | cffi==1.15.1 39 | chardet==5.1.0 40 | charset-normalizer==2.1.1 41 | checklist==0.0.11 42 | ChemDataExtractor==1.3.0 43 | cheroot==8.5.2 44 | CherryPy==18.6.1 45 | chromedriver-autoinstaller==0.4.0 46 | click==8.0.3 47 | cloudpickle==2.0.0 48 | colorama==0.4.6 49 | configparser==5.1.0 50 | conllu==4.0 51 | cryptography==39.0.1 52 | cssselect==1.2.0 53 | cycler==0.10.0 54 | cymem==2.0.6 55 | Cython==0.29.14 56 | datasets==1.16.1 57 | DAWG==0.8.0 58 | debugpy==1.5.1 59 | decorator==5.1.0 60 | deepdiff==6.5.0 61 | defusedxml==0.7.1 62 | Deprecated==1.2.13 63 | dill==0.3.4 64 | distro==1.9.0 65 | docker-pycreds==0.4.0 66 | docutils==0.17.1 67 | editdistance==0.6.0 68 | entrypoints==0.3 69 | et-xmlfile==1.1.0 70 | exceptiongroup==1.1.0 71 | execnet==1.9.0 72 | fairscale==0.4.0 73 | fasteners==0.18 74 | feedparser==6.0.8 75 | filelock==3.9.0 76 | flair==0.9 77 | flaky==3.7.0 78 | Flask==2.0.2 79 | Flask-Cors==3.0.10 80 | flatbuffers==1.12 81 | flatten-dict==0.4.2 82 | frozenlist==1.2.0 83 | fsspec==2021.11.1 84 | ftfy==6.0.3 85 | funcy==2.0 86 | future==0.18.2 87 | gast==0.3.3 88 | gdown==3.12.2 89 | gensim==3.8.3 90 | gevent==21.8.0 91 | gitdb==4.0.9 92 | GitPython==3.1.24 93 | glove-python-binary==0.2.0 94 | google-api-core==2.2.2 95 | google-auth==2.3.0 96 | google-auth-oauthlib==0.4.6 97 | google-cloud-core==2.2.1 98 | google-cloud-storage==1.43.0 99 | google-crc32c==1.3.0 
100 | google-pasta==0.2.0 101 | google-resumable-media==2.1.0 102 | googleapis-common-protos==1.53.0 103 | greenlet==1.1.2 104 | grpcio==1.32.0 105 | h11==0.14.0 106 | h2o==3.40.0.4 107 | h5py==2.10.0 108 | hdbscan==0.8.29 109 | httpcore==0.17.3 110 | httpx==0.24.1 111 | huggingface-hub==0.13.3 112 | hyperopt==0.2.5 113 | idna==3.4 114 | imagesize==1.2.0 115 | importlib-metadata==6.6.0 116 | importlib-resources==5.4.0 117 | iniconfig==2.0.0 118 | install-jdk==1.0.4 119 | ipykernel==6.5.1 120 | ipython==7.30.0 121 | ipython-genutils==0.2.0 122 | ipywidgets==7.6.5 123 | iso-639==0.4.5 124 | itsdangerous==2.0.1 125 | Janome==0.4.1 126 | jaraco.classes==3.2.1 127 | jaraco.collections==3.4.0 128 | jaraco.functools==3.4.0 129 | jaraco.text==3.6.0 130 | jedi==0.18.1 131 | Jinja2==3.0.2 132 | jmespath==0.10.0 133 | joblib==1.1.0 134 | jsonlines==3.1.0 135 | jsonpickle==2.0.0 136 | jsonschema==4.2.1 137 | jupyter==1.0.0 138 | jupyter-client==7.1.0 139 | jupyter-console==6.4.0 140 | jupyter-core==4.9.1 141 | jupyterlab-pygments==0.1.2 142 | jupyterlab-widgets==1.0.2 143 | Keras-Preprocessing==1.1.2 144 | keyring==23.9.3 145 | kiwisolver==1.3.2 146 | konoha==4.6.5 147 | langdetect==1.0.9 148 | llvmlite==0.39.1 149 | lmdb==1.2.1 150 | lxml==4.6.3 151 | Markdown==3.3.4 152 | markdown-it-py==2.1.0 153 | MarkupSafe==2.0.1 154 | matplotlib==3.4.3 155 | matplotlib-inline==0.1.3 156 | mdurl==0.1.2 157 | mistune==0.8.4 158 | mkl-fft==1.3.0 159 | mkl-random @ file:///C:/ci/mkl_random_1626186163140/work 160 | mkl-service==2.4.0 161 | mlxtend==0.23.1 162 | more-itertools==9.0.0 163 | mpld3==0.3 164 | multidict==5.2.0 165 | multiprocess==0.70.12.2 166 | munch==2.5.0 167 | murmurhash==1.0.6 168 | nbclient==0.5.9 169 | nbconvert==6.3.0 170 | nbformat==5.1.3 171 | nest-asyncio==1.5.1 172 | networkx==2.6.3 173 | nltk==3.6.5 174 | nose==1.3.7 175 | notebook==6.4.6 176 | numba==0.56.4 177 | numexpr @ file:///C:/ci/numexpr_1618856761305/work 178 | numpy==1.19.5 179 | numpydoc==1.1.0 180 | oauthlib==3.1.1 181 | openai==1.16.2 182 | openpyxl==3.0.9 183 | opt-einsum==3.3.0 184 | ordered-set==4.1.0 185 | outcome==1.2.0 186 | overrides==3.1.0 187 | packaging==23.0 188 | pandas @ file:///C:/ci/pandas_1632920019983/work 189 | pandocfilters==1.5.0 190 | parameterized==0.8.1 191 | parse==1.19.0 192 | parse-type==0.6.0 193 | parsimonious==0.8.1 194 | parso==0.8.2 195 | pathtools==0.1.2 196 | pathy==0.6.1 197 | patternfork-nosql==3.6 198 | pdbp==1.2.8 199 | pdfminer==20191125 200 | pdfminer.six==20211012 201 | pickle5==0.0.12 202 | pickleshare==0.7.5 203 | Pillow==8.4.0 204 | plac==0.9.6 205 | platformdirs==3.0.0 206 | plotly==5.14.1 207 | pluggy==1.0.0 208 | portend==3.1.0 209 | preshed==2.0.1 210 | prometheus-client==0.12.0 211 | promise==2.3 212 | prompt-toolkit==3.0.23 213 | protobuf==3.18.1 214 | psutil==5.8.0 215 | py==1.11.0 216 | pyarrow==6.0.1 217 | pyasn1==0.4.8 218 | pyasn1-modules==0.2.8 219 | pycparser==2.21 220 | pycryptodome==3.11.0 221 | pydantic==2.5.3 222 | pydantic_core==2.14.6 223 | pydeck==0.8.0 224 | Pygments==2.14.0 225 | pyLDAvis==3.3.1 226 | Pympler==1.0.1 227 | pynndescent==0.5.10 228 | pyOpenSSL==23.0.0 229 | pyotp==2.8.0 230 | pyparsing==2.4.7 231 | pyreadline3==3.4.1 232 | pyrsistent==0.18.0 233 | PySocks==1.7.1 234 | pytest==7.2.1 235 | pytest-forked==1.6.0 236 | pytest-html==2.0.1 237 | pytest-metadata==2.0.4 238 | pytest-ordering==0.6 239 | pytest-rerunfailures==11.1.1 240 | pytest-xdist==3.2.0 241 | python-crfsuite==0.9.7 242 | python-dateutil @ 
file:///tmp/build/80754af9/python-dateutil_1626374649649/work 243 | python-docx==0.8.11 244 | python-dotenv==0.21.1 245 | pytorch-pretrained-bert==0.6.2 246 | pytorch-transformers==1.1.0 247 | pytz==2021.3 248 | pytz-deprecation-shim==0.1.0.post0 249 | pywin32==302 250 | pywin32-ctypes==0.2.0 251 | pywinpty==1.1.6 252 | PyYAML==6.0 253 | pyzmq==22.3.0 254 | qtconsole==5.2.1 255 | QtPy==1.11.2 256 | regex==2021.10.8 257 | requests==2.28.2 258 | requests-oauthlib==1.3.0 259 | requests-toolbelt==0.10.1 260 | responses==0.14.0 261 | rich==13.3.1 262 | rsa==4.7.2 263 | s3transfer==0.5.0 264 | sacremoses==0.0.46 265 | sbvirtualdisplay==1.2.0 266 | schematics==2.1.1 267 | scikit-learn==1.0.2 268 | scipy==1.7.1 269 | segtok==1.5.10 270 | selenium==4.8.2 271 | seleniumbase==4.13.0 272 | semver==3.0.0 273 | Send2Trash==1.8.0 274 | sentence-transformers==2.2.2 275 | sentencepiece==0.1.95 276 | sentry-sdk==1.5.0 277 | seqeval==1.2.2 278 | sgmllib3k==1.0.0 279 | shortuuid==1.0.8 280 | simpletransformers==0.63.9 281 | six==1.15.0 282 | sklearn==0.0.post5 283 | smart-open==5.2.1 284 | smmap==5.0.0 285 | sniffio==1.3.0 286 | snowballstemmer==2.1.0 287 | sortedcontainers==2.4.0 288 | soupsieve==2.4 289 | spacy==2.1.9 290 | spacy-legacy==3.0.8 291 | Sphinx==4.2.0 292 | sphinxcontrib-applehelp==1.0.2 293 | sphinxcontrib-devhelp==1.0.2 294 | sphinxcontrib-htmlhelp==2.0.0 295 | sphinxcontrib-jsmath==1.0.1 296 | sphinxcontrib-qthelp==1.0.3 297 | sphinxcontrib-serializinghtml==1.1.5 298 | sqlitedict==1.7.0 299 | sqlparse==0.4.2 300 | srsly==1.0.5 301 | streamlit==1.20.0 302 | subprocess32==3.5.4 303 | tabcompleter==1.1.0 304 | tabulate==0.8.9 305 | tempora==4.1.2 306 | tenacity==8.2.2 307 | tensorboard==2.7.0 308 | tensorboard-data-server==0.6.1 309 | tensorboard-plugin-wit==1.8.0 310 | tensorboardX==2.4 311 | tensorflow-estimator==2.4.0 312 | tensorflow-gpu==2.4.1 313 | termcolor==1.1.0 314 | terminado==0.12.1 315 | testpath==0.5.0 316 | thinc==7.0.8 317 | threadpoolctl==3.0.0 318 | tokenizers==0.13.2 319 | toml==0.10.2 320 | tomli==2.0.1 321 | toolz==0.12.0 322 | torch==1.7.1+cu110 323 | torchaudio==0.7.2 324 | TorchCRF==1.1.0 325 | torchvision==0.8.2+cu110 326 | tornado==6.1 327 | tqdm==4.64.1 328 | traitlets==5.1.1 329 | transformers==4.27.4 330 | trio==0.22.0 331 | trio-websocket==0.9.2 332 | typer==0.4.0 333 | typing_extensions==4.7.1 334 | tzdata==2023.3 335 | tzlocal==4.3 336 | umap-learn==0.5.3 337 | Unidecode==1.3.2 338 | urllib3==1.26.14 339 | validators==0.20.0 340 | wandb==0.12.7 341 | wasabi==0.8.2 342 | watchdog==3.0.0 343 | wcwidth==0.2.5 344 | webdriver-manager==3.8.5 345 | webencodings==0.5.1 346 | websockets==10.4 347 | Werkzeug==2.0.2 348 | widgetsnbextension==3.5.2 349 | Wikipedia-API==0.5.4 350 | wincertstore==0.2 351 | word2number==1.1 352 | wrapt==1.12.1 353 | wsproto==1.2.0 354 | xxhash==2.0.2 355 | yarl==1.7.2 356 | yaspin==2.1.0 357 | zc.lockfile==2.0 358 | zipp==3.14.0 359 | zope.event==4.5.0 360 | zope.interface==5.4.0 361 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import json 2 | from table_splitting.split_table import * 3 | from table_representation.table_representer import TableRepresenter 4 | from table_representation.table2json import TableProcessor 5 | from GPT_models import models 6 | from GPT_models.follow_up_q import * 7 | 8 | 9 | def input_generation(splitting, table_representation) : 10 | ''' 11 | Generates input file 12 | 13 | Parameters 14 
| splitting : "split" or "non_split"
15 |     table_representation : "TSV" or "JSON"
16 | 
17 |     returns : input for model test
18 |     '''
19 | 
20 |     with open('Z:/NLP Project/table/code_upload/input_generation_script.json', 'r', encoding='utf-8') as file:
21 |         data = json.load(file)  # convert the JSON data into a Python object
22 | 
23 |     if splitting == "non_split" :
24 |         if table_representation == "JSON" :
25 |             input_path = data["input_generator"]["splitting_HTML"]["table_representation"]["JSON"]["input_path"]
26 |             output_path = data["input_generator"]["splitting_HTML"]["table_representation"]["JSON"]["output_path"]
27 | 
28 |             json_file_list = os.listdir(input_path)
29 | 
30 |             for i in json_file_list:
31 |                 a = i.split('.')[0]
32 |                 table_processor = TableProcessor(input_path + a + '.json')
33 |                 table_processor.convert_to_json(i, output_path)
34 | 
35 |         if table_representation == "TSV" :
36 |             input_path = data["input_generator"]["splitting_HTML"]["table_representation"]["TSV"]["input_path"]
37 |             output_path = data["input_generator"]["splitting_HTML"]["table_representation"]["TSV"]["output_path"]
38 | 
39 |             table_list = os.listdir(input_path)
40 |             table = TableRepresenter(input_path)
41 | 
42 |             for table_element in table_list:
43 |                 table.run(table_element, output_path)
44 | 
45 |     elif splitting == "split" :
46 |         input_json_path = data["input_generator"]["splitting_HTML"]["split"][0]["input_JSON_path"]
47 |         input_pickle_path = data["input_generator"]["splitting_HTML"]["split"][0]["input_pickle_path"]
48 |         input_HTML_path = data["input_generator"]["splitting_HTML"]["split"][0]["input_HTML_path"]
49 |         output_HTML_path = data["input_generator"]["splitting_HTML"]["split"][0]["output_HTML_path"]
50 | 
51 |         body_list = TablePaser(input_json_path, input_HTML_path, input_pickle_path)
52 |         body_list.run()
53 |         table_spliter = DivideHtml(input_HTML_path + '/example_tbl01.html', input_pickle_path + '/example_tbl01.pickle', output_HTML_path)
54 |         table_spliter.run()
55 | 
56 |         if table_representation == "JSON" :
57 |             input_path = data["input_generator"]["splitting_HTML"][1]["table_representation"]["JSON"]["input_path"]
58 |             output_path = data["input_generator"]["splitting_HTML"][1]["table_representation"]["JSON"]["output_path"]
59 | 
60 |             json_file_list = os.listdir(input_path)
61 | 
62 |             for i in json_file_list:
63 |                 a = i.split('.')[0]
64 |                 table_processor = TableProcessor(input_path + a + '.json')
65 |                 table_processor.convert_to_json(i, output_path)
66 | 
67 |         if table_representation == "TSV" :
68 |             input_path = data["input_generator"]["splitting_HTML"][1]["table_representation"]["TSV"]["input_path"]
69 |             output_path = data["input_generator"]["splitting_HTML"][1]["table_representation"]["TSV"]["output_path"]
70 | 
71 |             table_list = os.listdir(input_path)
72 |             table = TableRepresenter(input_path)
73 | 
74 |             for table_element in table_list:
75 |                 table.run(table_element, output_path)
76 | 
77 | 
78 | def model_test(model_, fq):
79 |     '''
80 |     Generates the prediction files in json format
81 | 
82 |     Parameters
83 |     model_ : "few_shot" or "zero_shot" or "fine_tuning"
84 |     fq : True or False
85 | 
86 |     returns : data extraction result
87 |     '''
88 |     with open('Z:/NLP Project/table/code_upload/model_script.json', 'r', encoding='utf-8') as file:
89 |         data = json.load(file)
90 | 
91 |     input_path = data["model"][model_]["input_path"]
92 |     output_path = data["model"][model_]["output_path"]
93 | 
94 |     if fq == False :
95 |         if model_ == "few_shot":
96 |             result = few_shot(input_path, output_path)
97 | 
98 |         elif model_ == "zero_shot":
99 |             result = 
zero_shot(input_path, output_path) 100 | 101 | elif model_ == "fine_tuning": 102 | result = fine_tuning(input_path, output_path) 103 | 104 | else: 105 | print("Unknown model type") 106 | 107 | elif fq == True : 108 | 109 | if model_ == "few_shot": 110 | few_shot(input_path, output_path ) 111 | assistant = FollowQ(output_path, input_path, output_path) 112 | assistant.run() 113 | elif model_ == "zero_shot": 114 | zero_shot(input_path, output_path) 115 | assistant = FollowQ(output_path, input_path, output_path) 116 | assistant.run() 117 | 118 | elif model_ == "fine_tuning": 119 | fine_tuning(input_path, output_path) 120 | assistant = FollowQ(output_path, input_path, output_path) 121 | assistant.run() 122 | 123 | -------------------------------------------------------------------------------- /table_representation/__pycache__/table2json_upload.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/table_representation/__pycache__/table2json_upload.cpython-37.pyc -------------------------------------------------------------------------------- /table_representation/__pycache__/table_representer_upload.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/table_representation/__pycache__/table_representer_upload.cpython-37.pyc -------------------------------------------------------------------------------- /table_representation/table2json.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import pandas as pd 3 | import json 4 | import os 5 | from bs4 import NavigableString 6 | from collections import OrderedDict 7 | 8 | 9 | class TableProcessor : 10 | def __init__(self, json_path) : 11 | self.json_path = json_path 12 | 13 | def load_table(self): 14 | ''' 15 | load the table in format 16 | ''' 17 | 18 | with open(self.json_path, 'r', encoding = 'utf-8') as file: 19 | data = json.load(file) 20 | 21 | title = data['title'] 22 | caption = data['caption'] 23 | table_tag = data["tag"] 24 | soup = BeautifulSoup(table_tag, 'html.parser') 25 | return soup, title, caption 26 | 27 | def caption_process(self): 28 | ''' 29 | finding caption and ref data in html table, 30 | giving caption data tag, 31 | giving reference data tag 32 | ''' 33 | 34 | table, _, _ = self.load_table() 35 | 36 | for tfoot in table.find_all('tfoot'): 37 | tfoot.decompose() 38 | 39 | td_elements = table.find_all('td') 40 | th_elements = table.find_all('th') 41 | 42 | for th in th_elements: 43 | link = th.find('a') 44 | if link: 45 | link_text = link.get_text() 46 | if len(link_text) == 1 and link_text.isalpha() or link_text == '*': 47 | link.string = "" + link_text + "" 48 | elif len(link_text) == 1 and link_text == '*': 49 | link.string = "" + link_text + "" 50 | else : 51 | link.string = "" + link_text + "" 52 | 53 | for td in td_elements: 54 | link = td.find_all('a') 55 | 56 | if len(link) == 1: 57 | linktext = link[0] 58 | link_text = linktext.get_text() 59 | if len(link_text) == 1 and link_text.isalpha(): 60 | link[0].string = "" + link_text + "" 61 | elif len(link_text) == 1 and link_text == '*': 62 | link[0].string = "" + link_text + "" 63 | else : 64 | link[0].string = "" + link_text + "" 65 | 66 | elif len(link) > 1 : 67 | link_string = [] 68 | for i in link : 69 | link_str = 
i.get_text()
70 |                     link_string.append(link_str)
71 |                 combined_string = ','.join(link_string)
72 |                 link[0].string = "" + combined_string + ""
73 |                 for j in range(1, len(link)) :
74 |                     link[j].string = ''
75 | 
76 |         return table
77 | 
78 |     def supb_process(self):
79 |         '''
80 |         finding sub and sup in html table,
81 |         giving an explicit <sub> tag,
82 |         giving an explicit <sup> tag
83 |         '''
84 | 
85 |         table = self.caption_process()
86 |         for i_tag in table.find_all('i'):
87 |             i_tag.unwrap()
88 |         td_elements = table.find_all('td')
89 |         th_elements = table.find_all('th')
90 |         for th in th_elements:
91 |             sup_ = th.find_all('sup')
92 |             sub_ = th.find_all('sub')
93 |             if sup_:
94 |                 for q in sup_:
95 |                     sup_text = q.get_text() if q.get_text() else ""
96 |                     q.string = "<sup>" + sup_text + "</sup>"
97 |             if sub_:
98 |                 for e in sub_ :
99 |                     sub_text = e.get_text() if e.get_text() else ""
100 |                     e.string = "<sub>" + sub_text + "</sub>"
101 |         for td in td_elements:
102 |             sup = td.find_all('sup')
103 |             sub = td.find_all('sub')
104 |             if sup:
105 |                 for b in sup :
106 |                     sup_text = b.get_text() if b.get_text() else ""
107 |                     b.string = "<sup>" + sup_text + "</sup>"
108 |             if sub:
109 |                 for a in sub:
110 |                     sub_text = a.get_text() if a.get_text() else ""
111 |                     a.string = "<sub>" + sub_text + "</sub>"
112 | 
113 |         return table
114 | 
115 |     def header_process(self):
116 |         '''
117 |         Filling empty cells in the header with '-'.
118 |         '''
119 | 
120 |         table = self.supb_process()
121 |         th_elements = table.find_all('th')
122 |         for th in th_elements :
123 |             if not th.text.strip() :
124 |                 th.insert(0, '-')
125 |                 th['align'] = 'left'
126 | 
127 |         return table
128 | 
129 |     def body_process(self):
130 |         '''
131 |         Copy the first cell of the previous row if the first cell is empty.
132 |         '''
133 | 
134 |         table = self.header_process()
135 |         has_empty_cells = False
136 |         prev_value = None
137 |         for row in table.find_all('tr'):
138 |             first_cell = row.find('td')
139 |             if first_cell:
140 |                 cell_text = first_cell.text.strip()
141 |                 if cell_text == '' and prev_value:
142 |                     first_cell.string = prev_value
143 |                 if cell_text == '':
144 |                     has_empty_cells = True
145 |                 prev_value = cell_text if cell_text else prev_value  # keep the last non-empty value so runs of empty cells are all filled
146 | 
147 |         return table
148 | 
149 |     def convert_to_dataframe(self):
150 |         '''
151 |         Convert the html table to a dataframe
152 |         '''
153 | 
154 |         table = self.body_process()
155 |         dfs = pd.read_html(str(table))
156 |         df_table = dfs[0]
157 |         df_table.fillna("NaN", inplace=True)
158 | 
159 |         return df_table
160 | 
161 |     def convert_to_json(self, table_name, save_directory):
162 |         '''
163 |         Convert dataframe to json. 
164 | ''' 165 | _, title, caption = self.load_table() 166 | table_name = table_name.split('.')[0] 167 | name_element = table_name.split('_') 168 | df_for_json = self.convert_to_dataframe() 169 | header_row = df_for_json.columns.nlevels 170 | df_for_json_key = list(df_for_json.columns) 171 | num_columns = df_for_json.shape[1] 172 | 173 | key_list = [] 174 | value_list = [] 175 | for i in range(0, num_columns): 176 | key_list.append(df_for_json_key[i]) 177 | value_list.append(df_for_json.iloc[:, i].tolist()) 178 | 179 | result = {} 180 | if header_row > 1: 181 | for i, keys in enumerate(key_list): 182 | current_dict = result 183 | for j, key in enumerate(keys): 184 | if key not in current_dict: 185 | current_dict[key] = {} 186 | if j == len(keys) - 1: 187 | current_dict[key] = value_list[i] 188 | current_dict = current_dict[key] 189 | elif header_row == 1 : 190 | for i, keys in enumerate(key_list): 191 | current_dict = result 192 | 193 | current_dict[keys] = value_list[i] 194 | 195 | # try : 196 | key_to_extract = caption 197 | title_to_extract = title 198 | key_to_extract = { 199 | 200 | "caption": key_to_extract 201 | } 202 | title_to_extract = { 203 | 204 | "Title": title_to_extract 205 | } 206 | 207 | result.update(key_to_extract) 208 | title_to_extract.update(result) 209 | save_directory_ = save_directory + '/' + table_name + '.json' 210 | 211 | with open(save_directory_, 'w', encoding='utf-8') as f: 212 | json.dump(title_to_extract, f, indent=4) 213 | 214 | 215 | # if __name__ == '__main__': 216 | # json_path = 'Z:/NLP Project/table/code_upload/data/split/table_split_json' 217 | # json_file_list = os.listdir(json_path) 218 | # save_directory = 'Z:/NLP Project/table/code_upload/data/split/json_representation' 219 | 220 | 221 | # for i in json_file_list: 222 | # a = i.split('.')[0] 223 | # table_processor = TableProcessor(json_path + a + '.json') 224 | # table_processor.convert_to_json(i, save_directory) 225 | 226 | 227 | -------------------------------------------------------------------------------- /table_representation/table_representer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from bs4 import BeautifulSoup 4 | import re 5 | 6 | class TableRepresenter: 7 | def __init__(self, table_path): 8 | self.table_path = table_path 9 | self.table_list = os.listdir(self.table_path) 10 | 11 | # Initialize cell representation strings 12 | self.merged_cell = '{}' 13 | self.both_merged_cell = '{}' 14 | self.cell = '{}\\t' 15 | self.line_breaking = '\\n' 16 | self.table_tag = '{}
' 17 | self.caption_tag = '{}' 18 | self.title_tag = '{}' 19 | 20 | def text_filter(self, out): 21 | """ 22 | Remove unnecessary text and HTML tags from the given string. 23 | """ 24 | out = re.sub('\\xa0', ' ', out) 25 | out = re.sub('\\u2005', ' ', out) 26 | out = re.sub('\\u2009', ' ', out) 27 | out = re.sub('\\u202f', ' ', out) 28 | out = re.sub('\\u200b', '', out) 29 | out = re.sub('', '', out) 30 | out = re.sub('', '', out) 31 | 32 | # Remove or replace specific patterns 33 | patterns = [ 34 | (r'(\(\d+\)|\d+|\[\d+\]|\d+\,\d+|\d+\,\d+\,\d+|\d+\,\d+\–\d+|\d+\D+|\(\d+\,\s*\d+\)|\(\d+\D+\))', r'\1'), 35 | (r'(\s*ref\.\s\d+.*?)', r'\1'), 36 | (r'\((\s*(ref\.\s\d+.*?)\s*)\)', r'\1'), 37 | (r'(\s*Ref\.\s\d+.*?)', r'\1'), 38 | (r'\((\s*(Ref\.\s\d+.*?)\s*)\)', r'\1'), 39 | (r'(\[\d+|\d+\])', r'\1'), 40 | (r'((.*?)et al\..*?)', r'\1'), 41 | (r'((.*?)Fig\..*?)', r'\1'), 42 | (r'(Song and Hu \(2014\))', r'\1'), 43 | (r'
', '',), 44 | (r'(mA\.cm)', r'\1'), 45 | (r'(https.*?)', r'\1'), 46 | (r'(\d+\.\d+\@\d+)', r'\1') 47 | ] 48 | 49 | for pattern, repl in patterns: 50 | out = re.sub(pattern, repl, out) 51 | 52 | return out 53 | 54 | def caption_process(self, caption): 55 | """ 56 | Process the caption text and extract key-value pairs. 57 | """ 58 | pattern = r'(\w+): (.*?)(?:;|$)' 59 | matches = re.findall(pattern, caption) 60 | result_dict = {key.strip(): value.strip() for key, value in matches} 61 | print(result_dict) 62 | 63 | def load_data(self, file_name): 64 | """ 65 | Load JSON data from the specified file. 66 | """ 67 | file_path = os.path.join(self.table_path, file_name) 68 | with open(file_path, 'r', encoding='utf-8-sig') as file: 69 | data = json.load(file) 70 | return data 71 | 72 | def process_table(self, t): 73 | """ 74 | Remove unnecessary HTML tags from the table element. 75 | """ 76 | tags_to_remove = ['img', 'em', 'i', 'p', 'span', 'strong', 'math', 'mi', 'br', 'script', 'svg', 'mrow', 'mo', 'mn', 'msub', 'msubsup', 'mtext', 'mjx-container', 'mjx-math', 'mjx-mrow', 'mjx-msub', 'mjx-mi', 'mjx-c', 'mjx-script', 'mjx-mspace', 'mjx-assistive-mml', 'mspace'] 77 | 78 | for tag in tags_to_remove: 79 | elements = t.find_all(tag) 80 | for element in elements: 81 | if tag in ['img', 'script', 'svg']: 82 | element.decompose() 83 | else: 84 | element.unwrap() 85 | 86 | return t 87 | 88 | def make_table_representer(self, table_representer, table_element, head=None): 89 | """ 90 | Create a table representation with the appropriate formatting. 91 | """ 92 | out = [['' for _ in range(self.width)] for _ in range(self.height if head is None else len(table_element.find_all('tr')))] 93 | 94 | i = 0 95 | for tr in table_element.find_all('tr'): 96 | j = 0 97 | for t in tr.find_all(re.compile('(? and tags from the table data. 
163 |         """
164 |         result = [[item.replace('<sup>', '').replace('</sup>', '') for item in inner_list] for inner_list in data]
165 |         result = [[item.replace('<sub>', '').replace('</sub>', '') for item in inner_list] for inner_list in result]
166 |         return result
167 | 
168 |     def run(self, table, save_directory):
169 |         cap_table_list = []
170 | 
171 |         final_table_representer = {}
172 |         print(table)
173 | 
174 |         data = self.load_data(table)
175 |         table_tag = data["tag"]
176 |         soup = BeautifulSoup(table_tag, 'html.parser')
177 |         thead = soup.find('thead')
178 |         tbody = soup.find('tbody')
179 | 
180 |         self.width = sum(int(t.get('colspan', 1)) for t in soup.find('tbody').find('tr').find_all(re.compile('(?\1', rows[0])
213 |                 table_list[i] = rows
214 | 
215 |         result = ''
216 | 
217 |         for table_row in table_list:
218 |             for element in table_row:
219 |                 if element == '::':
220 |                     pass
221 |                 else:
222 |                     result += self.cell.format(element)
223 |             result += self.line_breaking
224 | 
225 |         final_result = self.table_tag.format(result)
226 | 
227 |         caption = data['caption']
228 |         title = data['title']
229 | 
230 |         final_result = self.title_tag.format(title) + final_result
231 | 
232 |         for table_row in table_list:
233 |             for element in table_row:
234 |                 if "" in element:
235 |                     cap_table_list.append(table)
236 | 
237 |         cap_table_list = list(set(cap_table_list))
238 | 
239 |         if caption:
240 | 
241 |             final_result += '\n'
242 |             if isinstance(caption, dict):
243 |                 caption_str = ', '.join([f"{key}: {value}" for key, value in caption.items()])
244 |                 final_result += self.caption_tag.format(caption_str)
245 |             else:
246 |                 final_result += self.caption_tag.format(caption)
247 | 
248 |         save_path = os.path.join(save_directory, table[:-5]+'.txt')
249 |         with open(save_path, 'a', encoding='utf-8-sig') as f:
250 |             f.write(final_result)
251 | 
252 | # if __name__ == "__main__":
253 | #     table_path = 'example_json folder path'
254 | #     save_directory = 'Z:/NLP Project/table/code_upload/data/split/tsv_representation'
255 | #     table_path = 'Z:/NLP Project/table/code_upload/data/split/table_split_json/'
256 | #     table_list = os.listdir(table_path)
257 | #     table = TableRepresenter(table_path)
258 | 
259 | #     for table_element in table_list:
260 | #         print(table_element)
261 | #         table.run(table_element, save_directory)
262 | 
--------------------------------------------------------------------------------
/table_splitting/__pycache__/split_table_.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/table_splitting/__pycache__/split_table_.cpython-37.pyc
--------------------------------------------------------------------------------
/table_splitting/split_table.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import json
3 | import os
4 | from bs4 import NavigableString
5 | import copy
6 | import pickle
7 | import nltk
8 | import re
9 | 
10 | class TablePaser:
11 |     def __init__(self, json_path, table_path, pickle_path):
12 |         self.json_path = json_path
13 |         self.table_path = table_path
14 |         self.pickle_path = pickle_path
15 |         self.table_list = os.listdir(self.table_path)
16 |         self.merged_cell = '{}'
17 |         self.both_merged_cell = '{}'
18 |         self.cell = '{}\\t'
19 |         self.line_breaking = '\\n'
20 |         self.table_tag = '{}
' 21 | self.caption_tag = '{}' 22 | self.title_tag = '{}' 23 | 24 | def text_filter(self, out): 25 | """ 26 | Remove unnecessary text and HTML tags from the given string. 27 | """ 28 | out = re.sub('\\xa0', ' ', out) 29 | out = re.sub('\\u2005', ' ', out) 30 | out = re.sub('\\u2009', ' ', out) 31 | out = re.sub('\\u202f', ' ', out) 32 | out = re.sub('\\u200b', '', out) 33 | out = re.sub('', '', out) 34 | out = re.sub('', '', out) 35 | out = re.sub(r'(\(\d+\)|\d+|\[\d+\]|\d+\,\d+|\d+\,\d+\,\d+|\d+\,\d+\–\d+|\d+\D+|\(\d+\,\s*\d+\)|\(\d+\D+\))', r'\1', out) 36 | out = re.sub(r'(\s*ref\.\s\d+.*?)', r'\1', out) 37 | out = re.sub(r'\((\s*(ref\.\s\d+.*?)\s*)\)', r'\1', out) 38 | out = re.sub(r'(\s*Ref\.\s\d+.*?)', r'\1', out) 39 | out = re.sub(r'\((\s*(Ref\.\s\d+.*?)\s*)\)', r'\1', out) 40 | out = re.sub(r'(\[\d+|\d+\])', r'\1', out) 41 | out = re.sub(r'((.*?)et al\..*?)', r'\1', out) 42 | out = re.sub(r'((.*?)Fig\..*?)', r'\1', out) 43 | out = re.sub(r'(Song and Hu \(2014\))', r'\1', out) 44 | out = re.sub(r'
', '', out)
45 |         out = re.sub(r'(mA\.cm)', r'\1', out)
46 |         out = re.sub(r'(https.*?)', r'\1', out)
47 |         out = re.sub(r'(\d+\.\d+\@\d+)', r'\1', out)
48 |         out = re.sub(r'\[(\d+)\]','['+r'\1'+']', out)
49 |         return out
50 | 
51 |     def metadata(self, file_name):
52 |         file_name_parts = file_name.split('.')[0]
53 |         json_file_name = file_name_parts + '.json'
54 |         file_path = os.path.join(self.json_path, json_file_name)
55 | 
56 |         with open(file_path, 'r', encoding='utf-8') as file:
57 |             metadata = json.load(file)
58 | 
59 |         return file_name, metadata
60 | 
61 |     def process_table(self, t):
62 |         """
63 |         Remove unnecessary HTML tags from the table element.
64 |         """
65 |         tags_to_remove = ['img', 'em', 'i', 'p', 'span', 'strong', 'math', 'mi', 'script', 'svg', 'mrow', 'mo', 'mn', 'br', 'msub', 'msubsup', 'mtext', 'mjx-container', 'mjx-math', 'mjx-mrow', 'mjx-msub', 'mjx-mi', 'mjx-c', 'mjx-script', 'mjx-mspace', 'mjx-assistive-mml', 'mspace']
66 |         for tag in tags_to_remove:
67 |             for element in t.find_all(tag):
68 |                 # media and script elements are dropped entirely; formatting tags are unwrapped
69 |                 if tag in ['img', 'script', 'svg']:
70 |                     element.decompose()
71 |                 else:
72 |                     element.unwrap()
73 |         return t
74 | 
75 |     def make_table_representer(self, table_representer, table_element, head=None):
76 |         """
77 |         Create a table representation with the appropriate formatting.
78 |         """
79 |         out = []
80 |         if head == True:
81 |             head_height = len(table_element.find_all('tr'))
82 |             for i in range(head_height):
83 |                 out.append([])
84 |                 for j in range(self.width):
85 |                     out[i].append('')
86 |         elif head == False:
87 |             body_height = len(table_element.find_all('tr'))
88 |             for i in range(body_height):
89 |                 out.append([])
90 |                 for j in range(self.width):
91 |                     out[i].append('')
92 |         else:
93 |             for i in range(self.height):
94 |                 out.append([])
95 |                 for j in range(self.width):
96 |                     out[i].append('')
97 |         i = 0
98 | 
99 |         for tr in table_element.find_all('tr'):
100 |             j = 0
101 | 
102 |             for t in tr.find_all(re.compile('(? and tags from the table data. 
185 |         """
186 |         result = [[item.replace('<sup>', '').replace('</sup>', '') for item in inner_list] for inner_list in data]
187 |         result = [[item.replace('<sub>', '').replace('</sub>', '') for item in inner_list] for inner_list in result]
188 |         return result
189 | 
190 |     def run(self):
191 |         cap_table_list = []
192 |         cap_table_dict = {}
193 |         for table in self.table_list:
194 |             final_table_representer = {}
195 | 
196 |             file_name, metadata = self.metadata(table)
197 |             table_title = metadata.get('title', '')
198 |             table_caption = metadata.get('caption', '')
199 |             table_tag = metadata.get('tag', '')
200 |             soup = BeautifulSoup(table_tag, 'html.parser')
201 |             thead = soup.find('thead') if soup.find('thead') else None
202 |             tbody = soup.find('tbody') if soup.find('tbody') else None
203 | 
204 |             if len(soup.find_all()) == 0:
205 |                 continue
206 |             self.width = 0
207 |             for t in soup.find('tbody').find('tr').find_all(re.compile('(?1 :
307 |                     row_h_b_list.append('head')
308 | 
309 |                 else : pass
310 | 
311 |             else :
312 |                 td_list = []
313 |                 for td in row :
314 |                     if td.lower() == 'empty cell' :
315 |                         td = td.replace('empty cell', ' ')
316 | 
317 |                     if td == '-' :
318 |                         td = td.replace('-', '0')
319 |                     if td == '—' :
320 |                         td = td.replace('—', '0')
321 |                     if td == '–' :
322 |                         td = td.replace('–', '0')
323 |                     if td == '--' :
324 |                         td = td.replace('--', '0')
325 |                     if td == '---' :
326 |                         td = td.replace('---', '0')
327 |                     if td == '----' :
328 |                         td = td.replace('----', '0')
329 |                     if td == '' :
330 |                         td = td.replace('', '0')
331 |                     if td == ' ' :
332 |                         td = td.replace(' ', '0')
333 |                     if 'work' in td :
334 |                         td = '0 ' + td
335 |                     if 'et al' in td :
336 |                         td = '0 ' + td
337 | 
338 |                     tokens = nltk.word_tokenize(td)
339 |                     # str.strip() returns a new string, so collect the stripped tokens
340 |                     tokens = [t.strip() for t in tokens]
341 | 
342 |                     if 'ref' in tokens :
343 |                         tokens.remove('ref')
344 |                     if 'Ref' in tokens :
345 |                         tokens.remove('Ref')
346 |                     if '' in tokens :
347 |                         tokens.remove('')
348 |                     if '' in tokens :
349 |                         tokens.remove('')
350 |                     if 'mV' in tokens :
351 |                         tokens.remove('mV')
352 |                     if 'V' in tokens :
353 |                         tokens.remove('V')
354 |                     if '%' in tokens :
355 |                         tokens.remove('%')
356 |                     if 'sup' in tokens :
357 |                         tokens.remove('sup')
358 |                     if '/sup' in tokens :
359 |                         tokens.remove('/sup')
360 |                     if 'sub' in tokens :
361 |                         tokens.remove('sub')
362 |                     if '/sub' in tokens :
363 |                         tokens.remove('/sub')
364 | 
365 |                     cleaned_tokens = []
366 |                     final_tokens = []
367 | 
368 |                     for item in tokens:
369 |                         cleaned_item = ''.join(e for e in item if e.isalnum() or e.isspace())
370 |                         cleaned_tokens.append(cleaned_item)
371 | 
372 |                     for i in cleaned_tokens :
373 |                         if i != '' :
374 |                             final_tokens.append(i)
375 | 
376 |                     if final_tokens == [] :
377 |                         pass
378 | 
379 |                     else :
380 |                         try :
381 |                             token_string = final_tokens[0]
382 |                             token_lll = token_string.split()
383 | 
384 |                             float(token_lll[0])
385 |                             td_list.append('b')
386 | 
387 |                         except :
388 |                             td_list.append('h')
389 | 
390 |                 if 'b' in td_list :
391 |                     row_h_b_list.append('body')
392 | 
393 |                 else :
394 |                     row_h_b_list.append('head')
395 |         row_h_b_list[-1] = 'body'
396 |         return row_h_b_list
397 | 
398 |     def split_list_by_indexes(self, body, index_list):
399 |         '''
400 |         Split the list by the header in the double-header to create N lists. 
401 |         '''
402 |         result = []
403 |         start = 0
404 |         for index in index_list:
405 |             result.append(body[start:index])
406 |             start = index
407 |         result.append(body[start:])
408 |         return result
409 | 
410 |     def case_1(self, origin_head, origin_body, body_h_b_list) :
411 |         '''
412 |         When the header comes at the top of the body
413 |         '''
414 |         head_index = []
415 |         body_index = []
416 |         for i, decision in enumerate(body_h_b_list) :
417 |             if decision == 'head' :
418 |                 head_index.append(i)
419 |             else :
420 |                 body_index.append(i)
421 | 
422 |         head = []
423 |         body = []
424 | 
425 |         for h in head_index :
426 |             head.append(origin_body[h])
427 |         for b in body_index :
428 |             body.append(origin_body[b])
429 | 
430 |         for i, row in enumerate(head) :
431 |             formatted_list = [f'{item}' for item in row]
432 |             result_head = ' '.join(formatted_list)
433 |             result_head = ' ' + result_head + ' '
434 | 
435 |         if origin_head is None :
436 |             modified_head = '' + result_head + ''
437 | 
438 |         else :
439 |             origin_head = str(origin_head)
440 |             modified_head = origin_head.replace('', '')
441 |             modified_head = modified_head + result_head + ''
442 | 
443 |         for i, row in enumerate(body) :  # convert the body rows into HTML format
444 |             formatted_list = [f'{item}' for item in row]
445 |             result_body = ' '.join(formatted_list)
446 |             result_body = ' ' + result_body + ' '
447 |             final_body = '' + result_body + ''  # attach a tbody to each row
448 | 
449 |             padded_index = str(i + 1).zfill(2)
450 | 
451 |             html_string = f"
{str(modified_head)}{str(final_body)}
" 452 | with open(self.save_path + f"/{file_name}_{padded_index}.html", "w", encoding="utf-8") as file: 453 | file.write(html_string) 454 | 455 | 456 | def case_2(self, origin_head, origin_body, body_h_b_list) : 457 | ''' 458 | When the structure of the header repeats within the body 459 | ''' 460 | table_seperate_index = [] 461 | 462 | for i, hb in enumerate(body_h_b_list) : 463 | if hb == 'head' : 464 | table_seperate_index.append(i) 465 | 466 | table_seperate_index_result = [] 467 | current_sequence = [] 468 | 469 | for num in table_seperate_index: 470 | if not current_sequence or num == current_sequence[-1] + 1: 471 | current_sequence.append(num) 472 | else: 473 | table_seperate_index_result.append(current_sequence[0]) 474 | current_sequence = [num] 475 | 476 | if current_sequence: 477 | table_seperate_index_result.append(current_sequence[0]) 478 | 479 | table_split = self.split_list_by_indexes(origin_body, table_seperate_index_result) 480 | index_split = self.split_list_by_indexes(body_h_b_list, table_seperate_index_result) 481 | total_table = [] 482 | for table_num in range(0, len(index_split)) : 483 | 484 | if 'head' not in index_split[table_num] : 485 | body_string_list = [] 486 | for row_ in table_split[table_num] : 487 | 488 | for i, row in enumerate(row_) : # head html 형식으로 변환 489 | formatted_list = [f'{item}' for item in row_] 490 | result_ = ' '.join(formatted_list) 491 | result_ = ' ' + result_+ ' ' 492 | body_string_list.append(result_) 493 | 494 | origin_head = str(origin_head) 495 | for table_ in body_string_list : 496 | table_ = origin_head + table_ 497 | total_table.append(table_) 498 | else : 499 | table_head = [] 500 | table_body = [] 501 | 502 | for row_index in range(0, len(table_split[table_num])) : 503 | 504 | if index_split[table_num][row_index] == 'head' : 505 | 506 | formatted_list = [f'{item}' for item in table_split[table_num][row_index]] 507 | result_ = ' '.join(formatted_list) 508 | result_ = ' ' + result_ + ' ' 509 | table_head.append(result_) 510 | 511 | else : 512 | formatted_list = [f'{item}' for item in table_split[table_num][row_index]] 513 | result_ = ' '.join(formatted_list) 514 | result_ = ' ' + result_ + ' ' 515 | table_body.append(result_) 516 | 517 | table_head = ''.join(table_head) 518 | for body_row in table_body : 519 | body_row = '' + body_row + '' 520 | f_table = '' + table_head + '' + body_row 521 | total_table.append(f_table) 522 | 523 | for i, row in enumerate(total_table) : 524 | 525 | padded_index = str(i + 1).zfill(2) 526 | 527 | html_string = f"
{str(row)}
" 528 | with open(self.save_path + f"/{file_name}_{padded_index}.html", "w", encoding="utf-8") as file: 529 | file.write(html_string) 530 | 531 | def case_3(self, origin_head, origin_body, body_h_b_list) : 532 | ''' 533 | When the common header comes in the header and the sub-header comes in the body 534 | ''' 535 | table_seperate_index = [] 536 | 537 | for i, hb in enumerate(body_h_b_list) : 538 | if hb == 'head' : 539 | table_seperate_index.append(i) 540 | 541 | table_seperate_index_result = [] 542 | current_sequence = [] 543 | 544 | for num in table_seperate_index: 545 | if not current_sequence or num == current_sequence[-1] + 1: 546 | current_sequence.append(num) 547 | else: 548 | table_seperate_index_result.append(current_sequence[0]) 549 | current_sequence = [num] 550 | 551 | if current_sequence: 552 | table_seperate_index_result.append(current_sequence[0]) 553 | 554 | table_seperate_index_result.pop(0) 555 | table_split = self.split_list_by_indexes(origin_body, table_seperate_index_result) 556 | index_split = self.split_list_by_indexes(body_h_b_list, table_seperate_index_result) 557 | 558 | table_final = [] 559 | for split_index in range(0, len(table_split)) : 560 | table_ =[] 561 | for row_index in range(0, len(table_split[split_index])) : 562 | table_head = [] 563 | table_body = [] 564 | 565 | if index_split[split_index][row_index] == 'head' : 566 | 567 | formatted_list = [f'{item}' for item in table_split[split_index][row_index]] 568 | result_ = ' '.join(formatted_list) 569 | result_ = ' ' + result_ + ' ' 570 | table_head.append(result_) 571 | 572 | else : 573 | formatted_list = [f'{item}' for item in table_split[split_index][row_index]] 574 | result_ = ' '.join(formatted_list) 575 | result_ = ' ' + result_ + ' ' 576 | table_body.append(result_) 577 | 578 | if table_head != [] : 579 | table_.append(table_head) 580 | else : 581 | table_.append(table_body) 582 | 583 | head_index = list(filter(lambda x: index_split[split_index][x] == 'head', range(len([split_index])))) 584 | body_index = [] 585 | for i in range(0, len(index_split[split_index])) : 586 | if i not in head_index : 587 | body_index.append(i) 588 | 589 | head_string = '' 590 | body_string_list = [] 591 | for head in head_index : 592 | head_string += table_[head][0] 593 | 594 | if origin_head == None : 595 | head_string = '' + str(head_string) + '' 596 | else : 597 | origin_head = str(origin_head) 598 | origin_head = origin_head.replace("", " ") 599 | head_string = origin_head + head_string + '' 600 | 601 | for body in body_index : 602 | body_string_list.append('' + table_[body][0] + '') 603 | 604 | for fi in body_string_list : 605 | table_final.append(head_string + fi) 606 | 607 | 608 | for i, row in enumerate(table_final) : 609 | padded_index = str(i + 1).zfill(2) 610 | 611 | html_string = f"
{str(row)}
" 612 | with open(self.save_path + f"/{file_name}_{padded_index}.html", "w", encoding="utf-8") as file: 613 | file.write(html_string) 614 | 615 | 616 | def normal(self, origin_head, origin_body) : 617 | ''' 618 | Normal table 619 | ''' 620 | table_final = [] 621 | f_body = [] 622 | origin_head = str(origin_head) 623 | for row in origin_body : 624 | formatted_list = [f'{item}' for item in row] 625 | result_ = ' '.join(formatted_list) 626 | result_ = ' ' + result_ + ' ' 627 | f_body.append(result_) 628 | 629 | for f in f_body : 630 | f = origin_head + f 631 | table_final.append(f) 632 | 633 | 634 | for i, row in enumerate(table_final) : 635 | padded_index = str(i + 1).zfill(2) 636 | 637 | html_string = f"
{str(row)}
" 638 | with open(self.save_path + f"/{file_name}_{padded_index}.html", "w", encoding="utf-8") as file: 639 | file.write(html_string) 640 | 641 | def run(self) : 642 | ''' 643 | Determining the type of table, 644 | then splitting it according to the rule. 645 | ''' 646 | f_name = self.html_path.split('/')[-1] 647 | print(f_name) 648 | 649 | origin_body = self.load_pickle() 650 | origin_head = self.head_tag() 651 | row_h_b_list = self.head_body_decision_making() 652 | print(origin_body) 653 | if all(element == 'body' for element in row_h_b_list) : 654 | ######### normal ######### 655 | self.normal(origin_head, origin_body) 656 | 657 | elif row_h_b_list.count('head') > 6 : 658 | print('outlier') 659 | 660 | else : 661 | if row_h_b_list[0] == 'head' : 662 | last_head = len(row_h_b_list) - row_h_b_list[::-1].index('head') - 1 663 | 664 | if 'body' not in row_h_b_list : 665 | print('outlier') 666 | 667 | else : 668 | if last_head == row_h_b_list.count('head') - 1: 669 | ######### case 1 ######### 670 | self.case_1(origin_head, origin_body, row_h_b_list) 671 | 672 | else : 673 | ######### case 2 ######### 674 | if origin_head == None : 675 | self.case_2(origin_head, origin_body, row_h_b_list) 676 | ######### case 3 ######### 677 | else : 678 | self.case_3(origin_head, origin_body, row_h_b_list) 679 | else : 680 | ######### case 2 ######### 681 | self.case_2(origin_head, origin_body, row_h_b_list) 682 | --------------------------------------------------------------------------------