├── GPT_models
│   ├── follow_up_q.py
│   └── models.py
├── README.md
├── data
│   ├── non_split
│   │   ├── table_html
│   │   │   └── example_tbl01.html
│   │   └── table_json
│   │       └── example_tbl01.json
│   ├── pickle_folder
│   │   └── example_tbl01.pickle
│   └── split
│       ├── json_representation
│       │   ├── example_tbl01_01.json
│       │   ├── example_tbl01_02.json
│       │   ├── example_tbl01_03.json
│       │   ├── example_tbl01_04.json
│       │   ├── example_tbl01_05.json
│       │   ├── example_tbl01_06.json
│       │   ├── example_tbl01_07.json
│       │   ├── example_tbl01_08.json
│       │   ├── example_tbl01_09.json
│       │   ├── example_tbl01_10.json
│       │   ├── example_tbl01_11.json
│       │   ├── example_tbl01_12.json
│       │   └── example_tbl01_13.json
│       ├── table_split_html
│       │   ├── example_tbl01_01.html
│       │   ├── example_tbl01_02.html
│       │   ├── example_tbl01_03.html
│       │   ├── example_tbl01_04.html
│       │   ├── example_tbl01_05.html
│       │   ├── example_tbl01_06.html
│       │   ├── example_tbl01_07.html
│       │   ├── example_tbl01_08.html
│       │   ├── example_tbl01_09.html
│       │   ├── example_tbl01_10.html
│       │   ├── example_tbl01_11.html
│       │   ├── example_tbl01_12.html
│       │   └── example_tbl01_13.html
│       ├── table_split_json
│       │   ├── example_tbl01_01.json
│       │   ├── example_tbl01_02.json
│       │   ├── example_tbl01_03.json
│       │   ├── example_tbl01_04.json
│       │   ├── example_tbl01_05.json
│       │   ├── example_tbl01_06.json
│       │   ├── example_tbl01_07.json
│       │   ├── example_tbl01_08.json
│       │   ├── example_tbl01_09.json
│       │   ├── example_tbl01_10.json
│       │   ├── example_tbl01_11.json
│       │   ├── example_tbl01_12.json
│       │   └── example_tbl01_13.json
│       └── tsv_representation
│           ├── example_tbl01_01.txt
│           ├── example_tbl01_02.txt
│           ├── example_tbl01_03.txt
│           ├── example_tbl01_04.txt
│           ├── example_tbl01_05.txt
│           ├── example_tbl01_06.txt
│           ├── example_tbl01_07.txt
│           ├── example_tbl01_08.txt
│           ├── example_tbl01_09.txt
│           ├── example_tbl01_10.txt
│           ├── example_tbl01_11.txt
│           ├── example_tbl01_12.txt
│           └── example_tbl01_13.txt
├── input_generation_script.json
├── model_evaluation
│   ├── evaluation.py
│   └── utils
│       ├── __pycache__
│       │   ├── functions.cpython-37.pyc
│       │   ├── get_keys.cpython-37.pyc
│       │   ├── get_keys_function.cpython-37.pyc
│       │   └── get_keys_function.cpython-38.pyc
│       └── functions.py
├── model_script.json
├── requirements_conda.txt
├── requirements_pip.txt
├── run.py
├── table_representation
│   ├── __pycache__
│   │   ├── table2json_upload.cpython-37.pyc
│   │   └── table_representer_upload.cpython-37.pyc
│   ├── table2json.py
│   └── table_representer.py
└── table_splitting
    ├── __pycache__
    │   └── split_table_.cpython-37.pyc
    └── split_table.py
-------------------------------------------------------------------------------- /GPT_models/follow_up_q.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import openai 4 | import pandas as pd 5 | 6 | openai.api_key = "your api key" 7 | 8 | class FollowQ: 9 | def __init__(self, json_path, representation_path, save_path): 10 | self.json_path = json_path 11 | self.representation_path = representation_path 12 | self.save_path = save_path 13 | 14 | # Questions related to catalysts 15 | self.questions_for_catalyst = { 16 | "1_list": ["representation", "Question 1. Please tell me the names of catalysts in the contents within the tags of input representation into a Python list. Give me the names of catalysts ONLY. Only output the Python list(like string).\n\n"], 17 | "2_list": ["json", "Question 2. Please tell me the names of catalysts from the input json provided by me into a Python list. Only output the Python list(like string).\n\n"], 18 | "3_list": ["None", "Question 3. Based on the answer to Question 1, modify or remove any catalysts from the answer to Question 2 and provide the updated list in Python. Give me the names of catalysts ONLY.
Only output the Python list.(like string)\n\n"] 19 | } 20 | 21 | # Questions related to performance 22 | self.questions_for_performance = { 23 | "1_list": ["json", "Question 1. Inform me about what performance type does {catalyst_name} have in the input json I provided? Only output the Python list.(like string)\n\n"] 24 | } 25 | 26 | # Questions related to properties 27 | self.questions_for_property = { 28 | "1_both": ["json", "Question 1. Provide detailed information about all sublayers of the {element} of {catalyst_name} in input json. Remove keys from the dictionary that do not have a value. Present it in either Python list or JSON format. If the {element} is not 'loading', strictly provide it in Python list or JSON format.(like string not ```python and not ```json)\n\n"], 29 | "2_str": ["None", "Question 2. If there is any occurrence of 'NA', 'na', 'unknown', or similar content in your recent response, Respond with yes or no. You must answer strictly with yes or no.\n\n"], 30 | "3_dict": ["None", "Question 3. In the answer to question 1, remove any parts corresponding to 'NA', 'na', 'unknown', or similar contents. Show the modified JSON. Only display the JSON. (like string not ```json)\n\n"], 31 | "4_list": ["representation", "Question 4. Based on the input representation, provide values of the {element} of the {catalyst_name} as a Python list. If there is a unit, please provide strictly the value including the unit. The elements of a Python list must be composed of value plus unit. Only output the Python list.(like string not ```python)\n\n"], 32 | "5_list": ["None", "Question 5. Based on the answer to question 3, provide values of the '''value''' key of the sublayers of the {element} as a Python list. Only output the Python list.(like string not ```python)\n\n"], 33 | "6_list": ["None", "Question 6. Based on only numerical values, provide a list of elements that exist in the answer to Question 5 but are not present in the the answer to Question 4. Note that unit differences can be ignored if the numbers match. Only output the Python list.(like string not ```python)\n\n"], 34 | "7_dict": ["json", "Question 7. If elements included in the list that is the answer to question 6 are in the answer to question 1, remove the sub-dictionary containing those elements from the json I provided. If the answer to 6 is a list containing elements, be sure to delete it from json. Show the modified JSON after removal. Only display the JSON. (like string not ```json)\n\n"], 35 | "8_dict": ["json", "Question 8. Please tell me the final modified json of {catalyst_name} by reflecting the answer to question 7 in the json I provided. Only output the JSON of {catalyst_name}. the catalyst_name is {catalyst_name}. The first key of the dictionary should be {catalyst_name}. Remove keys from the dictionary that do not have a value. (like string not ```json)"] 36 | } 37 | 38 | # Questions related to electrolyte, reaction_type, substrate 39 | self.questions_for_representation = { 40 | "1_str": ["representation_title", "Question 1. If there is any occurrence of 'OER', 'HER', 'oxygen evolution reaction', 'hydrogen evolution reaction' or similar contents in input representation, respond with yes or no. Please answer with either yes or no.\n\n"], 41 | "2_str": ["representation_table_caption", "Question 2. If there is any occurrence of 'OER', 'HER', 'oxygen evolution reaction', 'hydrogen evolution reaction' or similar contents in input representation, respond with yes or no. 
Please answer with either yes or no.\n\n"], 42 | "3_str": ["representation_title", "Question 3. Does the input representation contain information corresponding to substrate? Please answer with either yes or no\n\n"], 43 | "4_str": ["representation_table_caption", "Question 4. Does the input representation contain information corresponding to substrate? Please answer with either yes or no\n\n"], 44 | "5_str": ["representation_title", "Question 5. Does the input representation contain information corresponding to electrolyte? Please answer with either yes or no\n\n"], 45 | "6_str": ["representation_table_caption", "Question 6. Does the input representation contain information corresponding to electrolyte? Please answer with either yes or no\n\n"] 46 | } 47 | 48 | # system prompt 49 | self.system_prompt = {"role": "system", "content": "You need to modify the JSON representing the table presenter.\n\n JSON templete : {'catalyst_name' : {'performance_name' : \{property templete\}}}\n property templete : {'electrolyte': '', 'reaction_type': '', 'value': '', 'current_density': '', 'overpotential': '', 'potential': '', 'substrate': '', 'versus': '', 'condition': ''}\n performance_list = [overpotential, tafel_slope, Rct, stability, Cdl, onset_potential, potential, TOF, ECSA, water_splitting_potential, mass_activity, exchange_current_density, Rs, specific_activity, onset_overpotential, BET, surface_area, loading, apparent_activation_energy]\n In the JSON template, 'catalyst_name' and 'performance_name' should be replaced with the actual names present in the input representation."} 50 | 51 | def input_prompt(self, representation, json, want_type): 52 | """ 53 | Generates a formatted input string based on the type of content requested. 54 | 55 | Parameters: 56 | representation (str): The input representation string containing HTML content. 57 | json (str): The JSON string to be included in the input. 58 | want_type (str): The type of content required ('both', 'representation', 'json', 'representation_title', 'representation_table_caption'). 59 | 60 | Returns: 61 | str: The formatted input string. 62 | """ 63 | 64 | # Split the representation string at each occurrence of '
</title>' 65 | splitted_strings = representation.split('</title>') 66 | # Determine the format based on the requested type 67 | if want_type == 'both': 68 | format_for_input = "\n" + str(representation) + "\n\n" + str(json) + "\n\n" 69 | elif want_type == 'representation': 70 | format_for_input = "\n" + str(representation) + "\n\n" 71 | elif want_type == 'json': 72 | format_for_input = "\n" + str(json) + "\n\n" 73 | elif want_type == 'representation_title': 74 | title_string = splitted_strings[0] + '</title>' 75 | format_for_input = "\n" + str(title_string) + "\n\n" 76 | elif want_type == 'representation_table_caption': 77 | table_caption_string = splitted_strings[1] 78 | format_for_input = "\n" + str(table_caption_string) + "\n\n" 79 | 80 | return format_for_input 81 | 82 | def load_file(self, file_type, file_path, file_name): 83 | """ 84 | Loads the content of a file based on the specified type and path. 85 | 86 | Parameters: 87 | file_type (str): The type of the file (json, html, txt, etc.) 88 | file_path (str): The path to the file 89 | file_name (str): The name of the file 90 | 91 | Returns: 92 | output: The content of the file. The type of the content depends on the file_type. 93 | 94 | Raises: 95 | ValueError: If the file type is unsupported. 96 | """ 97 | with open(file_path + file_name, 'r', encoding='utf-8-sig') as f: 98 | # Load JSON files 99 | if file_type == 'json': 100 | output = json.load(f) 101 | # Read text or HTML files 102 | elif file_type in ['html', 'txt']: 103 | output = f.read() 104 | # Raise an error for unsupported file formats 105 | else: 106 | raise ValueError("Unsupported file format.") 107 | 108 | return output 109 | 110 | def formatting_type(self, key, answer): 111 | """ 112 | Formats the answer based on the specified key type. 113 | 114 | Parameters: 115 | key (str): The key indicating the type of format required (e.g., '1_list', '2_dict'). 116 | answer (str): The answer to be formatted. 117 | 118 | Returns: 119 | The formatted answer, which can be a list or a dictionary based on the key type. 120 | """ 121 | # Determine the desired format type from the key 122 | want_type = key.split('_')[1] 123 | 124 | # Handle formatting if the desired type is a list 125 | if want_type == "list": 126 | if answer[0] == '"' and answer[-1] == '"': 127 | answer = answer.strip('"') # Remove surrounding quotes if present 128 | answer = eval(answer) # Evaluate the string as a Python expression (e.g., convert to list) 129 | 130 | # Handle formatting if the desired type is a dictionary 131 | elif want_type == "dict": 132 | if "```" in answer: 133 | answer = answer.replace("```json", "").replace("```", "") # Remove markdown code block formatting 134 | answer = json.loads(answer) # Parse the string as JSON 135 | 136 | return answer 137 | 138 | def check_type(self, key, answer): 139 | """ 140 | Checks if the type of the answer matches the expected type based on the key. 141 | 142 | Parameters: 143 | key (str): The key indicating the expected type (e.g., '1_list', '2_dict'). 144 | answer (any): The answer whose type needs to be checked. 145 | 146 | Returns: 147 | tuple: A tuple containing the expected type (str) and a boolean indicating whether the types match.
148 | """ 149 | # Extract the question number and expected type from the key 150 | question_number = key.split('_')[0] 151 | want_type = key.split('_')[1] 152 | 153 | # Determine the actual type of the answer 154 | answer_type = type(answer).__name__ 155 | 156 | # Check if the expected type matches the actual type 157 | type_bool = want_type == answer_type 158 | 159 | return want_type, type_bool 160 | 161 | def prompt(self, Q): 162 | """ 163 | Sends a list of messages to the OpenAI ChatCompletion API and returns the response. 164 | 165 | Parameters: 166 | Q (list): A list of message dictionaries to be sent to the API. 167 | 168 | Returns: 169 | tuple: A tuple containing the original list of messages (Q) and the response content. 170 | """ 171 | while True: 172 | try: 173 | # Send a request to the OpenAI ChatCompletion API 174 | response = openai.ChatCompletion.create( 175 | model="gpt-4-0125-preview", 176 | messages=Q, 177 | temperature=0, 178 | frequency_penalty=0, 179 | presence_penalty=0 180 | ) 181 | break # Exit the loop if the request is successful 182 | except Exception as e: 183 | # Print the error message and retry 184 | print("An error occurred:", e) 185 | 186 | # Return the original messages and the content of the response 187 | return Q, response['choices'][0]['message']['content'] 188 | 189 | def run(self, input_type, split_mode): 190 | """ 191 | Executes the main process for modifying JSON files based on given questions. 192 | 193 | Parameters: 194 | input_type (str): The type of input files (e.g., 'json', 'txt'). 195 | split_mode (str): The mode for handling splits (e.g., 'split', 'no-split'). 196 | 197 | Returns: 198 | None 199 | """ 200 | json_name_list = os.listdir(self.json_path) 201 | error_file_name = [] 202 | transpose_bool = False 203 | 204 | for json_name in json_name_list: 205 | # try: 206 | txt_name = json_name.split('.')[0] 207 | # Load input files 208 | input_json = self.load_file('json', self.json_path, json_name) 209 | input_representation = self.load_file(input_type, self.representation_path, txt_name + '.txt') 210 | 211 | # Initialize messages with system prompt 212 | messages = [self.system_prompt] 213 | 214 | # Catalyst check 215 | catalyst_result = {} 216 | question_list = [] 217 | answer_list = [] 218 | question_token_list = [] 219 | answer_token_list = [] 220 | final_result = [] 221 | final_json = {"catalysts": []} 222 | 223 | # Iterate through catalyst questions 224 | for key, value in self.questions_for_catalyst.items(): 225 | if value[0] != "None": 226 | messages.append({"role": "user", "content": self.input_prompt(input_representation, input_json, value[0])}) 227 | messages.append({"role": "user", "content": value[1]}) 228 | messages, answer = self.prompt(messages) 229 | question_token_list.append(messages) 230 | answer_token_list.append(answer) 231 | 232 | try: 233 | mod_answer = self.formatting_type(key, answer) 234 | except Exception as e: 235 | try: 236 | messages, answer = self.prompt(messages) 237 | question_token_list.append(messages) 238 | answer_token_list.append(answer) 239 | question_list.append("Throwing the same message once again") 240 | answer_list.append("Throwing the same message once again") 241 | except Exception as e: 242 | error_file_name.append(json_name) 243 | print(json_name) 244 | break 245 | 246 | catalyst_result[key[0]] = mod_answer 247 | question_list.append(value[1]) 248 | answer_list.append(mod_answer) 249 | 250 | print('--------------------------') 251 | print(self.questions_for_catalyst[key]) 252 | print(mod_answer) 
253 | print(catalyst_result) 254 | print('--------------------------') 255 | 256 | messages.append({"role": "assistant", "content": answer}) 257 | if key[0] == '2' and catalyst_result["1"] == catalyst_result["2"]: 258 | print("GO TO THE PERFORMANCE STAGE !!") 259 | break 260 | 261 | # Performance check 262 | performance_result = {} 263 | mod_catalyst_list = answer_list[-1] 264 | 265 | if len(mod_catalyst_list) > 1: 266 | transpose_bool = True 267 | 268 | for i in range(len(mod_catalyst_list)): 269 | messages = [self.system_prompt] 270 | for key, value in self.questions_for_performance.items(): 271 | if value[0] != "None": 272 | messages.append({"role": "user", "content": self.input_prompt(input_representation, input_json, value[0])}) 273 | question = value[1].format(catalyst_name = '"""'+ mod_catalyst_list[i] +'"""') 274 | messages.append({"role": "user", "content": question}) 275 | messages, answer = self.prompt(messages) 276 | question_token_list.append(messages) 277 | answer_token_list.append(answer) 278 | 279 | try: 280 | mod_answer = self.formatting_type(key, answer) 281 | except: 282 | try: 283 | messages, answer = self.prompt(messages) 284 | question_token_list.append(messages) 285 | answer_token_list.append(answer) 286 | except: 287 | error_file_name.append(json_name) 288 | 289 | break 290 | 291 | performance_result[f'{mod_catalyst_list[i]}_{key[0]}'] = mod_answer 292 | question_list.append(question) 293 | answer_list.append(mod_answer) 294 | 295 | messages.append({"role": "assistant", "content": answer}) 296 | if key[0] == '3' and performance_result[f'{mod_catalyst_list[i]}_3'] == []: 297 | print("GO TO THE PROPERTY STAGE !!") 298 | break 299 | 300 | if isinstance(answer_list[-1], list): 301 | mod_performance_list = performance_result[f'{mod_catalyst_list[i]}_1'] 302 | else: 303 | while not isinstance(answer_list[-1], list): 304 | messages.pop() 305 | messages, answer = self.prompt(messages) 306 | question_token_list.append(messages) 307 | answer_token_list.append(answer) 308 | mod_answer = self.formatting_type("1_list", answer) 309 | question_list.append(question) 310 | answer_list.append(mod_answer) 311 | mod_performance_list = answer 312 | 313 | # Property check 314 | property_result = {} 315 | skip_questions = False 316 | print("#####################") 317 | print(mod_performance_list) 318 | 319 | if mod_performance_list == []: 320 | mod_answer = {str(mod_catalyst_list[i]): {}} 321 | else: 322 | for j in range(len(mod_performance_list)): 323 | messages = [] 324 | messages.append(self.system_prompt) 325 | print("@@@@@@@@@@@@") 326 | print(mod_performance_list[j]) 327 | # Feed in the system prompt, the input representation, and the input JSON 328 | # messages = [] 329 | # messages.append(self.system_prompt) 330 | 331 | skip_questions = False 332 | anwer3_no = False 333 | for key, value in self.questions_for_property.items(): 334 | # Condition for skipping questions 3 and 7 when the answer to question 2 is "no" 335 | if key[0] in ['3','7'] and skip_questions: 336 | print("SKIP THE NEXT QUESTIONS") 337 | question_list.append("SKIP THE NEXT QUESTIONS") 338 | answer_list.append("SKIP THE NEXT QUESTIONS") 339 | if key[0] in ['3','7']: 340 | skip_questions = False 341 | continue # skip questions 3 and 7 342 | 343 | if value[0] != "None": 344 | messages.append({"role": "user", "content": self.input_prompt(input_representation, input_json, value[0])}) 345 | 346 | question = value[1].format(catalyst_name = '"""'+ mod_catalyst_list[i] +'"""', element = '"""'+ mod_performance_list[j] +'"""') 347 | messages.append({"role": "user", "content": question}) 348 |
messages, answer = self.prompt(messages) 349 | question_token_list.append(messages) 350 | answer_token_list.append(answer) 351 | try: 352 | mod_answer = self.formatting_type(key, answer) 353 | except: 354 | try: 355 | messages, answer = self.prompt(messages) 356 | question_token_list.append(messages) 357 | answer_token_list.append(answer) 358 | except: 359 | print(json_name) 360 | error_file_name.append(json_name) 361 | break 362 | 363 | property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_' + key[0]] = answer 364 | question_list.append(question) 365 | answer_list.append(mod_answer) 366 | 367 | print('--------------------------') 368 | print(question) 369 | print(mod_answer) 370 | print('--------------------------') 371 | 372 | messages.append({"role": "assistant", "content": answer}) 373 | 374 | if key[0] == '2' and mod_answer.lower() == "no": 375 | question_list.append("Question 3. Based on the answer to question 2, remove any parts corresponding to 'NA', 'na', 'unknown', or similar content from the answer to question 1. Show the modified JSON. Only display the JSON. (like string not ```json)") 376 | answer_list.append(str(property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_1'])) 377 | messages.append({"role": "user", "content": "Question 3. Based on the answer to question 2, remove any parts corresponding to 'NA', 'na', 'unknown', or similar content from the answer to question 1. Show the modified JSON. Only display the JSON. (like string not ```json)"}) 378 | messages.append({"role": "assistant", "content": property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_1']}) 379 | skip_questions = True 380 | anwer3_no = True 381 | 382 | if key[0] == '6' and mod_answer == []: 383 | if anwer3_no: 384 | question_list.append("Question 7. If the answer to question 6 is an empty list, just provide the answer to question 1 as it is.") 385 | answer_list.append(str(property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_1'])) 386 | messages.append({"role": "user", "content": "Question 7. If the answer to question 6 is an empty list, just provide the answer to question 1 as it is."}) 387 | messages.append({"role": "assistant", "content": property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_1']}) 388 | skip_questions = True 389 | else: 390 | question_list.append("Question 7. If the answer to question 6 is an empty list, just provide the answer to question 3 as it is.") 391 | answer_list.append(str(property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_3'])) 392 | messages.append({"role": "user", "content": "Question 7. 
If the answer to question 6 is an empty list, just provide the answer to question 3 as it is."}) 393 | messages.append({"role": "assistant", "content": property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_3']}) 394 | skip_questions = True 395 | 396 | if isinstance(mod_answer, dict): 397 | input_json = mod_answer 398 | else: 399 | count = 0 400 | while not isinstance(mod_answer, dict) and count < 3: 401 | messages.pop() 402 | messages.append({"role": "user", "content": self.input_prompt(input_representation, input_json, 'json')}) 403 | messages, answer = self.prompt(messages) 404 | question_token_list.append(messages) 405 | answer_token_list.append(answer) 406 | try: 407 | mod_answer = self.formatting_type(key, answer) 408 | except: 409 | try: 410 | messages, answer = self.prompt(messages) 411 | question_token_list.append(messages) 412 | answer_token_list.append(answer) 413 | except: 414 | print('#$@%@#%@#%#@%@#%@#') 415 | print(json_name) 416 | print('#$@%@#%@#%#@%@#%@#') 417 | error_file_name.append(json_name) 418 | break 419 | 420 | count += 1 421 | 422 | property_result[mod_catalyst_list[i] + '_' + mod_performance_list[j] + '_' + key[0]] = answer 423 | question_list.append(question) 424 | answer_list.append(mod_answer) 425 | 426 | print('--------------------------') 427 | print(question) 428 | print(mod_answer) 429 | print('--------------------------') 430 | 431 | messages.append({"role": "assistant", "content": answer}) 432 | 433 | if len(mod_catalyst_list) == 1 and split_mode == 'split': 434 | final_result.append(mod_answer) 435 | else: 436 | input_json = self.load_file(input_type, self.json_path, json_name) 437 | final_json["catalysts"].append(mod_answer) 438 | 439 | if transpose_bool: 440 | final_result.append(final_json) 441 | 442 | # Final JSON after property modifications 443 | if final_result[0] == []: 444 | input_json = self.load_file('json', self.json_path, json_name) 445 | new_json = input_json 446 | else: 447 | new_json = final_result[0] 448 | 449 | # Handle representation questions 450 | remove_elements = [] 451 | representation_result = {} 452 | for key, value in self.questions_for_representation.items(): 453 | messages = [] 454 | if value[0] == "None": 455 | pass 456 | else: 457 | messages.append({"role": "user", "content": self.input_prompt(input_representation, new_json, value[0])}) 458 | 459 | question = value[1] 460 | messages.append({"role": "user", "content": question}) 461 | messages, answer = self.prompt(messages) 462 | question_token_list.append(messages) 463 | answer_token_list.append(answer) 464 | 465 | mod_answer = self.formatting_type(key, answer) 466 | 467 | question_list.append(question) 468 | answer_list.append(mod_answer) 469 | 470 | print('--------------------------') 471 | print(question) 472 | print(mod_answer) 473 | print('--------------------------') 474 | 475 | messages.append({"role": "assistant", "content": answer}) 476 | representation_result[key[0]] = mod_answer.replace(".", "").lower() 477 | 478 | if representation_result['1'] == 'no' and representation_result['2'] == 'no': 479 | remove_elements.append('reaction_type') 480 | if representation_result['3'] == 'no' and representation_result['4'] == 'no': 481 | remove_elements.append('substrate') 482 | if representation_result['5'] == 'no' and representation_result['6'] == 'no': 483 | remove_elements.append('electrolyte') 484 | 485 | remove_elements = list(set(remove_elements)) 486 | 487 | if remove_elements != []: 488 | for delete_element in remove_elements: 489 | messages = [] 
490 | messages.append({"role": "user", "content": self.input_prompt(input_representation, new_json, 'json')}) 491 | question = "Remove all elements with the key name {delete_element} from the input JSON and display it in only JSON format. Other explanation is not allowed. Show me only JSON result. Only display the JSON. (like string not ```json)".format(delete_element="'''"+delete_element+"'''") 492 | messages.append({"role": "user", "content": question}) 493 | messages, answer = self.prompt(messages) 494 | question_token_list.append(messages) 495 | answer_token_list.append(answer) 496 | try: 497 | mod_answer = self.formatting_type('1_dict', answer) 498 | except: 499 | try: 500 | messages, answer = self.prompt(messages) 501 | question_token_list.append(messages) 502 | answer_token_list.append(answer) 503 | except: 504 | print('#$@%@#%@#%#@%@#%@#') 505 | print(json_name) 506 | print('#$@%@#%@#%#@%@#%@#') 507 | error_file_name.append(json_name) 508 | break 509 | 510 | question_list.append(question) 511 | answer_list.append(mod_answer) 512 | print('--------------------------') 513 | print(question) 514 | print('answer') 515 | print(answer) 516 | print('mod answer') 517 | print(mod_answer) 518 | print('--------------------------') 519 | 520 | if not isinstance(mod_answer, dict): 521 | count = 0 522 | while not isinstance(mod_answer, dict) and count < 3: 523 | # Re-ask after dropping the previously asked question 524 | messages, answer = self.prompt(messages) 525 | question_token_list.append(messages) 526 | answer_token_list.append(answer) 527 | try: 528 | mod_answer = self.formatting_type('1_dict', answer) 529 | except: 530 | try: 531 | messages, answer = self.prompt(messages) 532 | question_token_list.append(messages) 533 | answer_token_list.append(answer) 534 | except: 535 | print('#$@%@#%@#%#@%@#%@#') 536 | print(json_name) 537 | print('#$@%@#%@#%#@%@#%@#') 538 | error_file_name.append(json_name) 539 | break 540 | 541 | count += 1 542 | question_list.append(question) 543 | answer_list.append(mod_answer) 544 | 545 | print('--------------------------') 546 | print(question) 547 | print(mod_answer) 548 | print('--------------------------') 549 | 550 | if not isinstance(mod_answer, dict): 551 | new_json = new_json 552 | else: 553 | new_json = mod_answer 554 | else: 555 | mod_answer = new_json 556 | 557 | # Ensure the necessary directories exist 558 | os.makedirs(os.path.join(self.save_path, 'log'), exist_ok=True) 559 | os.makedirs(os.path.join(self.save_path, 'token'), exist_ok=True) 560 | os.makedirs(os.path.join(self.save_path, 'json'), exist_ok=True) 561 | 562 | # Save the modified JSON and log 563 | if json_name not in error_file_name: 564 | log_path = self.save_path + 'log/'+ txt_name 565 | df = pd.DataFrame({'Question': question_list, 'GPT answer': answer_list}) 566 | df.to_csv(log_path+'.csv', index=False) 567 | 568 | token_path = self.save_path + 'token/'+ txt_name 569 | token_df = pd.DataFrame({'Question': question_token_list, 'GPT answer': answer_token_list}) 570 | token_df.to_csv(token_path+'.csv', index=False) 571 | 572 | new_json_path = self.save_path + 'json/'+ json_name 573 | if mod_answer == [] : 574 | input_json = self.load_file('json', self.json_path, json_name) 575 | with open(new_json_path, "w") as json_file: 576 | json.dump(input_json, json_file, indent=4) 577 | else: 578 | if isinstance(mod_answer, list): 579 | with open(new_json_path, "w") as json_file: 580 | json.dump(new_json, json_file, indent=4) 581 | 582 | elif isinstance(mod_answer, str): 583 | with open(new_json_path, "w") as json_file: 584 |
json.dump(new_json, json_file, indent=4) 585 | else: 586 | with open(new_json_path, "w") as json_file: 587 | json.dump(mod_answer, json_file, indent=4) 588 | 589 | 590 | if __name__ == "__main__": 591 | representation_path = 'table representer path' 592 | json_path = 'gpt prediction' 593 | save_path = 'save path' 594 | 595 | assistant = FollowQ(json_path, representation_path, save_path) 596 | assistant.run('txt', 'split') 597 | 598 | 599 | -------------------------------------------------------------------------------- /GPT_models/models.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import openai 4 | from openai import OpenAI  # used by few_shot() and zero_shot() below 5 | from ast import literal_eval 6 | from copy import copy 7 | 8 | def fine_tuning(input_path, output_path): 9 | """ 10 | Tests the fine-tuned model and generates the prediction of table data extraction in JSON format. 11 | You can use the OPENAI PLAYGROUND for training the model. 12 | 13 | Parameters: 14 | input_path : TSV or JSON table representation path 15 | output_path : The path where the prediction is saved 16 | 17 | Returns: 18 | json : prediction of table data extraction 19 | """ 20 | 21 | openai.api_key = "YOUR OPENAI KEY" 22 | file_list = os.listdir(input_path) 23 | 24 | response = [] 25 | for file in file_list: 26 | file_path = ''.join([output_path, file]) 27 | file_name = os.path.basename(file_path) 28 | with open(input_path + file, 'r', encoding='utf-8-sig') as f: 29 | loaded_data_string = json.load(f) 30 | 31 | completion = openai.ChatCompletion.create( 32 | model="FINE TUNED MODEL", 33 | temperature=0, 34 | messages=[ 35 | {"role": "system", "content": "this task is to take a string as input and convert it to json format. I want to extract the performance below. [reaction_type, versus, overpotential, substrate, loading, tafel_slope, onset_potential, current_density, BET, specific_activity, mass_activity, surface_area, ECSA, apparent_activation_energy, water_splitting_potential, potential, Rs, Rct, Cdl, TOF, stability, electrolyte, exchange_current_density, onset_overpotential].
If there is information about overpotential and Tafel slope in the input, the output should be generated as follows.\n\n{\n\u201dcatalyst_name\": {\n\"overpotential\": {\n\"electrolyte\": \"1.0 M KOH\",\n\"reaction_type\": \"OER\",\n\"value\": \"230 mV\",\n\"current_density\": \"50 mA/cm2\"\n},\n\"tafel_slope\": {\n\"electrolyte\": \"1.0 M KOH\",\n\"reaction_type\": \"OER\",\n\"value\": \"54 mV/dec\"\n}\n\n}\n\n}\n\nIf information regarding the reaction_type or electrolyte cannot be found in the input, they should not be included in the output as follows.\n\n{\n\u201dcatalyst_name\": {\n\"overpotential\": {\n\"value\": \"230 mV\",\n\"current_density\": \"50 mA/cm2\"\n},\n\"tafel_slope\": {\n\"value\": \"54 mV/dec\"\n}\n\n}\n\n}\n\nIf the string is missing certain information such as 'mass_activity', ‘reaction_type’, ‘value’ or 'current_density', your output should not include those keys.\n\nIf there are no values corresponding to the mentioned performance metrics in the input, simply extract the catalyst name as shown below.\n\n{\n\u201dcatalyst_name\": {}\n}\n\nNote: The output should be a JSON object with keys for only the present metrics."}, 36 | {"role": "user", "content": str(loaded_data_string)} 37 | ] 38 | ) 39 | result = completion.choices[0].message['content'] 40 | response.append(result) 41 | try: 42 | dict_1 = literal_eval(result) 43 | json_file_path = os.path.join(output_path, file_name) 44 | with open(file_path[:-5]+'.json', "w", encoding="utf-8-sig") as json_file: 45 | json.dump(dict_1, json_file, indent=4) 46 | except: 47 | with open(file_path[:-5]+'.txt', "w", encoding="utf-8-sig") as txt_file: 48 | txt_file.write(result) 49 | 50 | 51 | 52 | def few_shot(input_path, output_path) : 53 | """ 54 | Tests the few-shot model and generates the prediction of table data extraction in JSON format. 55 | You need to give several I/O pairs. 56 | 57 | Parameters: 58 | input_path : TSV or JSON table representation path 59 | output_path : The path where the prediction is saved 60 | 61 | Returns: 62 | json : prediction of table data extraction 63 | """ 64 | client = OpenAI(api_key= "YOUR OPENAI KEY") 65 | file_list = os.listdir(input_path) 66 | for file in file_list : 67 | with open(input_path + file, 'r', encoding='utf-8') as f: 68 | text = f.read() 69 | response = client.chat.completions.create( 70 | model="gpt-4-1106-preview", 71 | temperature=0, 72 | frequency_penalty=0, 73 | presence_penalty=0, 74 | messages=[ 75 | {"role": "system", "content": "I will extract the performance information of the catalyst from the table and create a JSON format. The types of performance to be extracted will be given as a list: performance_list = [overpotential, tafel_slope, Rct, stability, Cdl, onset_potential, current_density, potential, TOF, ECSA, water_splitting_potential, mass_activity, exchange_current_density, Rs, specific_activity, onset_overpotential, BET, surface_area, loading, apparent_activation_energy]. You can only use the names as they are in the performance_list, and any changes to the names in the performance_list, no matter how slight, are not allowed. The JSON format will have performance within the catalyst, and each performance will include elements present in the table: reaction type, value, electrolyte, condition, current density, versus(ex: RHE) and substrate. Other elements must not be included in performance. Be very strict. The output must contain only json dictionary.
Other sentences or opinion must not be in output."}, 76 | 77 | # X I/O PAIRS 78 | {"role": "user", "content":''}, 79 | {"role": "assistant", "content": ''}, 80 | 81 | {"role": "user", "content": text} 82 | ] 83 | ) 84 | prediction = response.choices[0].message.content.strip() 85 | output_filename = output_path + '/' + file.split('.')[0] 86 | try : 87 | json_data = json.loads(prediction) 88 | with open(output_filename + '.json', 'w', encoding='utf-8-sig') as json_file: 89 | json.dump(json_data, json_file, ensure_ascii = False, indent = 4) 90 | 91 | except : 92 | json_data = prediction 93 | with open(output_filename + '.txt', "w", encoding="utf-8-sig") as txt_file: 94 | txt_file.write(json_data) 95 | 96 | 97 | def prompt(messages) : 98 | response = client.chat.completions.create( 99 | model="gpt-4-1106-preview", 100 | temperature=0, 101 | frequency_penalty=0, 102 | presence_penalty=0, 103 | messages= messages) 104 | 105 | return messages, response.choices[0].message.content 106 | 107 | 108 | def zero_shot(input_path, output_path) : 109 | """ 110 | Tests the zero-shot model and generates the prediction of table data extraction in JSON format. 111 | 112 | Parameters: 113 | input_path : TSV or JSON table representation path 114 | output_path : The path where the prediction is saved 115 | 116 | Returns: 117 | json : prediction of table data extraction 118 | """ 119 | 120 | global client; client = OpenAI(api_key = 'YOUR OPENAI KEY') # module-level so prompt() above can reach it 121 | file_list = os.listdir(input_path) 122 | 123 | for file in file_list : 124 | data = {'question': [], 'answer': []} 125 | 126 | with open(input_path + file, 'r', encoding='utf-8') as f: 127 | table_representer = f.read() 128 | 129 | table_name = file.split('.')[0] 130 | 131 | instruction = "I'm going to convert the information in the table representer into JSON format.\n CATALYST_TEMPLATE = {'catalyst_name' : {'performance_name' : {PROPERTY_TEMPLATE}}\n PROPERTY_TEMPLATE = {'electrolyte': '', 'reaction_type': '', 'value': '', 'current_density': '', 'overpotential': '', 'potential': '','substrate': '', 'versus':''}\n performance_list = [overpotential, tafel_slope, Rct, stability, Cdl, onset_potential, current_density, potential, TOF, ECSA, water_splitting_potential, mass_activity, exchange_current_density, Rs, specific_activity, onset_overpotential, BET, surface_area, loading, apparent_activation_energy]\n. Table representer is in below \n\n " 132 | result = {"catalysts":[]} 133 | 134 | message_ = [{"role": "system", "content": instruction + table_representer}] 135 | 136 | catalyst_q = "Show the catalysts present in the table representer as a Python list. Answer must be ONLY python list. Not like '''python ''' Be very very very strict. Other sentences or explanation is not allowed.\n" 137 | question = catalyst_q 138 | message_.append({"role": "user", "content": question}) 139 | _, cata_answer = prompt(message_) 140 | catalyst_list = literal_eval(cata_answer) # safer than eval() for parsing the model's list 141 | data['question'].append(copy(message_)) 142 | data['answer'].append(cata_answer) 143 | 144 | message_.append({"role": "assistant", "content": cata_answer}) # keep the previous answer in context for the next prompt 145 | 146 | for catalyst in catalyst_list : 147 | 148 | performance_template_q = "Create a CATALYST_TEMPLATE filling in the performance of {catalyst} from the table representer, strictly adhering to the following 3 rules:\n\n Rule 1: Only include the actual existing performances from the Performance_list in the CATALYST_TEMPLATE.\n Rule 2: Set all values of the keys in PROPERTY_TEMPLATE to be \" \". DO NOT INSERT ANY VALUE.
BE VERY STRICT.\n Rule 3: Answer must be ONLY json format. Only display the JSON (like string not ```json). Other sentences or explanation is not allowed.".format(catalyst="'''"+catalyst+"'''") 149 | question = performance_template_q 150 | message_.append({"role": "user", "content": question}) 151 | _, perfo_answer = prompt(message_) 152 | 153 | data['question'].append(copy(message_)) 154 | data['answer'].append(perfo_answer) 155 | 156 | message_.append({"role": "assistant", "content": perfo_answer}) 157 | property_q = 'In PROPERTY_TEMPLATE, maintain all keys, and fill in values that exist in the table representer. If there are more than two "values" for the same performance, fill in each "value" with the property template and make it into a list. If there is unit information, never create or modify additional keys, but reflect the units in the value.' 158 | question = property_q 159 | message_.append({"role": "user", "content": question}) 160 | _, property_answer1 = prompt(message_) 161 | 162 | data['question'].append(copy(message_)) 163 | data['answer'].append(property_answer1) 164 | 165 | message_.append({"role": "assistant", "content": property_answer1}) 166 | property_title_caption_q = "Modify the previous version of CATALYST_TEMPLATE based solely on the title, caption according to the rules below. Only refer to the title and caption part in table representer. Strictly adhere to the following rules. \n Rule 1: If there is reaction type information in title or caption, reflect the reaction type in previous version of CATALYST_TEMPLATE accordingly. But if there isn't reaction type information in title or caption part, leave CATALYST_TEMPLATE as previous version. Be strict. \n Rule 2: If there is electrolyte information in title or caption part, reflect the electrolyte in previous version of CATALYST_TEMPLATE. But if there isn't electrolyte information in title or caption part, leave CATALYST_TEMPLATE as previous version. Be strict. \n Rule 3: Never modify the keys. \n Rule 4: Never fill in values for any other keys except reaction_type, electrolyte. Never delete any other keys or value." 167 | question = property_title_caption_q 168 | message_.append({"role": "user", "content": question}) 169 | _, property_answer2 = prompt(message_) 170 | 171 | data['question'].append(copy(message_)) 172 | data['answer'].append(property_answer2) 173 | 174 | message_.append({"role": "assistant", "content": property_answer2}) # was property_answer1, which discarded the title/caption update 175 | delete_q = 'Remove keys with no values from previous version of CATALYST_TEMPLATE.'
176 | question = delete_q 177 | message_.append({"role": "user", "content": question}) 178 | _, delete_answer = prompt(message_) 179 | 180 | data['question'].append(copy(message_)) 181 | data['answer'].append(delete_answer) 182 | 183 | catalyst_template = json.loads(delete_answer) 184 | result["catalysts"].append(catalyst_template) 185 | 186 | message_ = [{"role": "system", "content": instruction + table_representer}] 187 | message_.append({"role": "user", "content": catalyst_q}) 188 | message_.append({"role": "assistant", "content": cata_answer}) 189 | 190 | if len(result["catalysts"]) == 1 : 191 | final_result = result["catalysts"][0] 192 | 193 | else : # also keeps final_result defined when no catalyst was found 194 | final_result = result 195 | try : 196 | with open(output_path + table_name + ".json", "w") as json_file: 197 | json.dump(final_result, json_file, indent = 4) 198 | except : 199 | with open(output_path + table_name + ".txt", "w", encoding="utf-8-sig") as txt_file: 200 | txt_file.write(str(final_result)) 201 | 202 | 203 | 204 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MaTableGPT: GPT-based Table Data Extractor from Materials Science Literature 2 | 3 | ## Introduction 4 | ### 1) Overall workflow of MaTableGPT 5 | 1. Generate customized TSV and JSON representations from the HTML tables and split the tables. 6 | 2. Test the three GPT models (fine-tuning, few-shot, zero-shot) and run the follow-up question process. 7 | ![1](https://github.com/KIST-CSRC/CO2RR_NER/assets/171128050/7ca70729-84cc-4b4e-a93d-225f60f424a8) 8 | 9 | ### 2) GPT modeling process 10 | ![5](https://github.com/KIST-CSRC/MaTableGPT/assets/171128050/1bb4729f-bca3-4f82-9ab1-c0d93909c37a) 11 | 12 | ## User Manual 13 | ### 1) Installation 14 | 15 | **Using conda** 16 | ```bash 17 | conda env create -f requirements_conda.txt 18 | ``` 19 | **Using pip** 20 | ```bash 21 | pip install -r requirements_pip.txt 22 | ``` 23 | ### 2) Download data files 24 | ``` 25 | git clone https://github.com/KIST-CSRC/MaTableGPT.git 26 | git lfs pull 27 | ``` 28 | ### 3) Script architecture 29 | ``` 30 | MaTableGPT 31 | ├── data 32 | │ └── non_split 33 | │ └── split 34 | │ └── pickle_folder 35 | │ └── result 36 | ├── GPT_models 37 | │ └── models.py 38 | │ └── follow_up_q.py 39 | ├── model_evaluation 40 | │ └── utils 41 | │ └── evaluation.py 42 | ├── table_representation 43 | │ └── table_representer.py 44 | │ └── table2json.py 45 | ├── table_splitting 46 | │ └── split_table.py 47 | │ 48 | └── run.py 49 | ``` 50 | ### 4) Code usage (run.py) 51 | **Example : Input generation (split, TSV)** 52 | > ```python 53 | > input_generation("split", "TSV") 54 | > ``` 55 | 56 | **Example : Data extraction (few-shot, follow-up questions)** 57 | > ```python 58 | > model_test("few_shot", True) 59 | > ``` 60 | ## Benefit 61 | Using MaTableGPT, we achieved a table data extraction accuracy of 96.8% and, through a Pareto-front analysis, proposed the optimal solution for each situation. 62 | ## Reference 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /data/non_split/table_html/example_tbl01.html: -------------------------------------------------------------------------------- 1 | 2 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Co-BPDC/Co-BDC heterojunction</td><td>1 M KOH</td><td>335</td><td>72.1</td><td>80</td><td>this work</td></tr><tr><td>Co-BDC</td><td>1 M KOH</td><td>392</td><td>77.2</td><td>28</td><td>this work</td></tr><tr><td>Co-BPDC</td><td>1 M KOH</td><td>428</td><td>78.8</td><td>28</td><td>this work</td></tr><tr><td>Co/MIL-101(Cr)-O</td><td>0.1 M KOH</td><td>570</td><td>17</td><td>–</td><td>(40)</td></tr><tr><td>Fe2Ni-BPTC/CC</td><td>0.1 M KOH</td><td>365</td><td>77.2</td><td>15</td><td>(41)</td></tr><tr><td>UTSA-16</td><td>1 M KOH</td><td>408</td><td>77</td><td>7</td><td>(42)</td></tr><tr><td>2D Co–MOF UNS</td><td>1 M KOH</td><td>263</td><td>74</td><td>3.3</td><td>(18)</td></tr><tr><td>Co-OBA/C</td><td>0.1 M KOH</td><td>590</td><td>85.7</td><td>–</td><td>(43)</td></tr><tr><td>Co2(μ–OH)2(bbta)</td><td>1 M KOH</td><td>387</td><td>60</td><td>24</td><td>(44)</td></tr><tr><td>3D Gr/Ni-MOF</td><td>0.1 M KOH</td><td>370</td><td>91</td><td>20</td><td>(25)</td></tr><tr><td>Co0.6Fe0.4-MOF-74</td><td>1 M KOH</td><td>280</td><td>56</td><td>12</td><td>(3)</td></tr><tr><td>Ti3C2Tx-CoBDC</td><td>0.1 M KOH</td><td>410</td><td>48.2</td><td>2.8</td><td>(23)</td></tr><tr><td>Co-MOF</td><td>1 M KOH</td><td>360</td><td>89</td><td>–</td><td>(45)</td></tr></table>
" 3 | -------------------------------------------------------------------------------- /data/non_split/table_json/example_tbl01.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Co-BPDC/Co-BDC heterojunction</td><td>1 M KOH</td><td>335</td><td>72.1</td><td>80</td><td>this work</td></tr><tr><td>Co-BDC</td><td>1 M KOH</td><td>392</td><td>77.2</td><td>28</td><td>this work</td></tr><tr><td>Co-BPDC</td><td>1 M KOH</td><td>428</td><td>78.8</td><td>28</td><td>this work</td></tr><tr><td>Co/MIL-101(Cr)-O</td><td>0.1 M KOH</td><td>570</td><td>17</td><td>–</td><td>(40)</td></tr><tr><td>Fe2Ni-BPTC/CC</td><td>0.1 M KOH</td><td>365</td><td>77.2</td><td>15</td><td>(41)</td></tr><tr><td>UTSA-16</td><td>1 M KOH</td><td>408</td><td>77</td><td>7</td><td>(42)</td></tr><tr><td>2D Co–MOF UNS</td><td>1 M KOH</td><td>263</td><td>74</td><td>3.3</td><td>(18)</td></tr><tr><td>Co-OBA/C</td><td>0.1 M KOH</td><td>590</td><td>85.7</td><td>–</td><td>(43)</td></tr><tr><td>Co2(μ–OH)2(bbta)</td><td>1 M KOH</td><td>387</td><td>60</td><td>24</td><td>(44)</td></tr><tr><td>3D Gr/Ni-MOF</td><td>0.1 M KOH</td><td>370</td><td>91</td><td>20</td><td>(25)</td></tr><tr><td>Co0.6Fe0.4-MOF-74</td><td>1 M KOH</td><td>280</td><td>56</td><td>12</td><td>(3)</td></tr><tr><td>Ti3C2Tx-CoBDC</td><td>0.1 M KOH</td><td>410</td><td>48.2</td><td>2.8</td><td>(23)</td></tr><tr><td>Co-MOF</td><td>1 M KOH</td><td>360</td><td>89</td><td>–</td><td>(45)</td></tr></table>
" 5 | } -------------------------------------------------------------------------------- /data/pickle_folder/example_tbl01.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/data/pickle_folder/example_tbl01.pickle -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_01.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co-BPDC/Co-BDC heterojunction" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 335 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 72.1 14 | ], 15 | "durability (h)": [ 16 | 80 17 | ], 18 | "ref": [ 19 | "this work" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_02.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co-BDC" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 392 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 77.2 14 | ], 15 | "durability (h)": [ 16 | 28 17 | ], 18 | "ref": [ 19 | "this work" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_03.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co-BPDC" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 428 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 78.8 14 | ], 15 | "durability (h)": [ 16 | 28 17 | ], 18 | "ref": [ 19 | "this work" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_04.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co/MIL-101(Cr)-O" 5 | ], 6 | "electrolyte": [ 7 | "0.1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 570 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 17 14 | ], 15 | "durability (h)": [ 16 | "\u2013" 17 | ], 18 | "ref": [ 19 | "(40)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_05.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. 
Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Fe2Ni-BPTC/CC" 5 | ], 6 | "electrolyte": [ 7 | "0.1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 365 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 77.2 14 | ], 15 | "durability (h)": [ 16 | 15 17 | ], 18 | "ref": [ 19 | "(41)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_06.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "UTSA-16" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 408 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 77 14 | ], 15 | "durability (h)": [ 16 | 7 17 | ], 18 | "ref": [ 19 | "(42)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_07.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "2D Co\u2013MOF UNS" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 263 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 74 14 | ], 15 | "durability (h)": [ 16 | 3.3 17 | ], 18 | "ref": [ 19 | "(18)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_08.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co-OBA/C" 5 | ], 6 | "electrolyte": [ 7 | "0.1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 590 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 85.7 14 | ], 15 | "durability (h)": [ 16 | "\u2013" 17 | ], 18 | "ref": [ 19 | "(43)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_09.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co2(\u03bc\u2013OH)2(bbta)" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 387 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 60 14 | ], 15 | "durability (h)": [ 16 | 24 17 | ], 18 | "ref": [ 19 | "(44)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_10.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. 
Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "3D Gr/Ni-MOF" 5 | ], 6 | "electrolyte": [ 7 | "0.1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 370 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 91 14 | ], 15 | "durability (h)": [ 16 | 20 17 | ], 18 | "ref": [ 19 | "(25)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_11.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co0.6Fe0.4-MOF-74" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 280 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 56 14 | ], 15 | "durability (h)": [ 16 | 12 17 | ], 18 | "ref": [ 19 | "(3)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_12.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Ti3C2Tx-CoBDC" 5 | ], 6 | "electrolyte": [ 7 | "0.1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 410 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 48.2 14 | ], 15 | "durability (h)": [ 16 | 2.8 17 | ], 18 | "ref": [ 19 | "(23)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/json_representation/example_tbl01_13.json: -------------------------------------------------------------------------------- 1 | { 2 | "Title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "catalyst": [ 4 | "Co-MOF" 5 | ], 6 | "electrolyte": [ 7 | "1 M KOH" 8 | ], 9 | "\u03b7 at 10 mA\u00a0cm\u20132 (mV)": [ 10 | 360 11 | ], 12 | "Tafel slope (mV/decade)": [ 13 | 89 14 | ], 15 | "durability (h)": [ 16 | "\u2013" 17 | ], 18 | "ref": [ 19 | "(45)" 20 | ], 21 | "caption": "" 22 | } -------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_01.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Co-BPDC/Co-BDC heterojunction</td><td>1 M KOH</td><td>335</td><td>72.1</td><td>80</td><td>this work</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_02.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Co-BDC</td><td>1 M KOH</td><td>392</td><td>77.2</td><td>28</td><td>this work</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_03.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Co-BPDC</td><td>1 M KOH</td><td>428</td><td>78.8</td><td>28</td><td>this work</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_04.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Co/MIL-101(Cr)-O</td><td>0.1 M KOH</td><td>570</td><td>17</td><td>–</td><td>(40)</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_05.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Fe2Ni-BPTC/CC</td><td>0.1 M KOH</td><td>365</td><td>77.2</td><td>15</td><td>(41)</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_06.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>UTSA-16</td><td>1 M KOH</td><td>408</td><td>77</td><td>7</td><td>(42)</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_07.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>2D Co–MOF UNS</td><td>1 M KOH</td><td>263</td><td>74</td><td>3.3</td><td>(18)</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_08.html: -------------------------------------------------------------------------------- 1 |
<table><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr><tr><td>Co-OBA/C</td><td>0.1 M KOH</td><td>590</td><td>85.7</td><td>–</td><td>(43)</td></tr></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_09.html: -------------------------------------------------------------------------------- 1 |
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co2(μ–OH)2(bbta)</td><td>1 M KOH</td><td>387</td><td>60</td><td>24</td><td>(44)</td></tr></tbody></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_10.html: -------------------------------------------------------------------------------- 1 |
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>3D Gr/Ni-MOF</td><td>0.1 M KOH</td><td>370</td><td>91</td><td>20</td><td>(25)</td></tr></tbody></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_11.html: -------------------------------------------------------------------------------- 1 |
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co0.6Fe0.4-MOF-74</td><td>1 M KOH</td><td>280</td><td>56</td><td>12</td><td>(3)</td></tr></tbody></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_12.html: -------------------------------------------------------------------------------- 1 |
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Ti3C2Tx-CoBDC</td><td>0.1 M KOH</td><td>410</td><td>48.2</td><td>2.8</td><td>(23)</td></tr></tbody></table>
-------------------------------------------------------------------------------- /data/split/table_split_html/example_tbl01_13.html: -------------------------------------------------------------------------------- 1 |
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co-MOF</td><td>1 M KOH</td><td>360</td><td>89</td><td>–</td><td>(45)</td></tr></tbody></table>
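Each of the thirteen table_split_html files above holds a one-row copy of the parent table under the full header, so every catalyst row can be processed on its own. A minimal sketch (not part of the repository; the file path and pandas usage are illustrative assumptions, pandas itself is listed in the requirements files below) of reading one split file back into a DataFrame:

# sketch: load one split HTML table into a DataFrame
from io import StringIO
import pandas as pd

with open("data/split/table_split_html/example_tbl01_01.html", encoding="utf-8") as f:
    html = f.read()

# read_html returns one DataFrame per <table>; each split file contains exactly one
df = pd.read_html(StringIO(html))[0]
print(df.loc[0, "catalyst"])  # -> Co-BPDC/Co-BDC heterojunction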
-------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_01.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co-BPDC/Co-BDC heterojunction</td><td>1 M KOH</td><td>335</td><td>72.1</td><td>80</td><td>this work</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_02.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co-BDC</td><td>1 M KOH</td><td>392</td><td>77.2</td><td>28</td><td>this work</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_03.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co-BPDC</td><td>1 M KOH</td><td>428</td><td>78.8</td><td>28</td><td>this work</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_04.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co/MIL-101(Cr)-O</td><td>0.1 M KOH</td><td>570</td><td>17</td><td>–</td><td>(40)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_05.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Fe2Ni-BPTC/CC</td><td>0.1 M KOH</td><td>365</td><td>77.2</td><td>15</td><td>(41)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_06.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>UTSA-16</td><td>1 M KOH</td><td>408</td><td>77</td><td>7</td><td>(42)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_07.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>2D Co–MOF UNS</td><td>1 M KOH</td><td>263</td><td>74</td><td>3.3</td><td>(18)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_08.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co-OBA/C</td><td>0.1 M KOH</td><td>590</td><td>85.7</td><td>–</td><td>(43)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_09.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co2(μ–OH)2(bbta)</td><td>1 M KOH</td><td>387</td><td>60</td><td>24</td><td>(44)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_10.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>3D Gr/Ni-MOF</td><td>0.1 M KOH</td><td>370</td><td>91</td><td>20</td><td>(25)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_11.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co0.6Fe0.4-MOF-74</td><td>1 M KOH</td><td>280</td><td>56</td><td>12</td><td>(3)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_12.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Ti3C2Tx-CoBDC</td><td>0.1 M KOH</td><td>410</td><td>48.2</td><td>2.8</td><td>(23)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/table_split_json/example_tbl01_13.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Table 1. Comparison of OER Performance for Several MOFs Electrocatalysts", 3 | "caption": "", 4 | "tag": "
<table><thead><tr><th>catalyst</th><th>electrolyte</th><th>η at 10 mA cm–2 (mV)</th><th>Tafel slope (mV/decade)</th><th>durability (h)</th><th>ref</th></tr></thead>
<tbody><tr><td>Co-MOF</td><td>1 M KOH</td><td>360</td><td>89</td><td>–</td><td>(45)</td></tr></tbody></table>
" 5 | } -------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_01.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo-BPDC/Co-BDC heterojunction\t1 M KOH\t335\t72.1\t80\tthis work\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_02.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo-BDC\t1 M KOH\t392\t77.2\t28\tthis work\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_03.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo-BPDC\t1 M KOH\t428\t78.8\t28\tthis work\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_04.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo/MIL-101(Cr)-O\t0.1 M KOH\t570\t17\t–\t (40)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_05.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nFe2Ni-BPTC/CC\t0.1 M KOH\t365\t77.2\t15\t (41)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_06.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nUTSA-16\t1 M KOH\t408\t77\t7\t (42)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_07.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\n2D Co–MOF UNS\t1 M KOH\t263\t74\t3.3\t (18)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_08.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo-OBA/C\t0.1 M KOH\t590\t85.7\t–\t (43)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_09.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo2(μ–OH)2(bbta)\t1 M KOH\t387\t60\t24\t (44)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_10.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\n3D Gr/Ni-MOF\t0.1 M KOH\t370\t91\t20\t (25)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_11.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo0.6Fe0.4-MOF-74\t1 M KOH\t280\t56\t12\t (3)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_12.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nTi3C2Tx-CoBDC\t0.1 M KOH\t410\t48.2\t2.8\t (23)\t\n
-------------------------------------------------------------------------------- /data/split/tsv_representation/example_tbl01_13.txt: -------------------------------------------------------------------------------- 1 | Table 1. Comparison of OER Performance for Several MOFs Electrocatalystscatalyst\telectrolyte\tη at 10 mA cm–2 (mV)\tTafel slope (mV/decade)\tdurability (h)\tref\t\nCo-MOF\t1 M KOH\t360\t89\t–\t (45)\t\n
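The tsv_representation files above serialize each split row on a single line: the table title followed by the header and data cells, with tabs and newlines written out as the two-character escapes \t and \n (compact for a single-line GPT prompt). A minimal sketch (not part of the repository; that the escapes are stored literally is an assumption inferred from the files above) of recovering the cells:

# sketch: turn a split TSV representation back into cells
with open("data/split/tsv_representation/example_tbl01_01.txt", encoding="utf-8") as f:
    raw = f.read()

realised = raw.replace("\\n", "\n").replace("\\t", "\t")  # realise the literal escapes
lines = [ln for ln in realised.splitlines() if ln.strip()]
cells = lines[1].split("\t")  # lines[0] is the title fused with the header row
print(cells[:3])  # -> ['Co-BPDC/Co-BDC heterojunction', '1 M KOH', '335']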
-------------------------------------------------------------------------------- /input_generation_script.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_generator": { 3 | "splitting_HTML": { 4 | "split": [ 5 | { 6 | "input_JSON_path": "./data/table_json", 7 | "input_pickle_path": "./data/pickle_folder", 8 | "input_HTML_path": "./data/table_html", 9 | "output_HTML_path": "./data/html_table_split_result" 10 | }, 11 | { 12 | "table_representation": { 13 | "JSON": { 14 | "input_path": "./data/split/table_split_json/", 15 | "output_path": "./data/split/json_representation" 16 | }, 17 | "TSV": { 18 | "input_path": "./data/split/table_split_json/", 19 | "output_path": "./data/split/tsv_representation" 20 | } 21 | } 22 | } 23 | ], 24 | "non_split": { 25 | "table_representation": { 26 | "JSON": { 27 | "input_path": "./data/non_split/table_json/", 28 | "output_path": "./data/non_split/json_representation" 29 | }, 30 | "TSV": { 31 | "input_path": "./data/non_split/table_json/", 32 | "output_path": "./data/non_split/tsv_representation" 33 | } 34 | } 35 | } 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /model_evaluation/evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | import re 5 | import unicodedata 6 | from utils.functions import * 7 | 8 | class evaluation : 9 | def __init__(self, prediction_path, groundTruth_path) : 10 | self.prediction_path = prediction_path 11 | self.groundTruth_path = groundTruth_path 12 | 13 | 14 | def remove_whitespace_from_keys(self, data): 15 | ''' 16 | remove whitespace from keys 17 | ''' 18 | if isinstance(data, dict): 19 | data = {key.replace("_", ""): self.remove_whitespace_from_keys(value) for key, value in data.items()} 20 | if isinstance(data, dict): 21 | return {key.replace(" ", ""): self.remove_whitespace_from_keys(value) for key, value in data.items()} 22 | elif isinstance(data, list): 23 | return [self.remove_whitespace_from_keys(item) for item in data] 24 | else: 25 | return data 26 | 27 | def remove_unicode(self, text): 28 | ''' 29 | remove unicode 30 | ''' 31 | 32 | return ''.join(char for char in unicodedata.normalize('NFKD', text) if not unicodedata.combining(char)) 33 | 34 | 35 | def remove_unicode_version(self,data) : 36 | processed_data = {} 37 | for key, value in data.items(): 38 | key = self.remove_unicode(key) 39 | if isinstance(value, str): 40 | processed_data[key] = self.remove_unicode(value) 41 | else: 42 | processed_data[key] = value 43 | 44 | return processed_data 45 | 46 | def unicode_to_str(self, match): 47 | unicode_str = match.group() 48 | return bytes(unicode_str, 'utf-8').decode('unicode_escape') 49 | 50 | def load_data(self) : 51 | ''' 52 | load prediction and ground truth 53 | ''' 54 | with open(self.prediction_path, 'r', encoding='utf-8-sig') as file: 55 | prediction = json.load(file) 56 | 57 | with open(self.groundTruth_path, 'r', encoding='utf-8-sig') as file: 58 | ground_truth = json.load(file) 59 | 60 | prediction = json.dumps(prediction, ensure_ascii=False) 61 | 62 | cleaned_string = re.sub(r'–', '-', prediction) 63 | cleaned_string = re.sub(r'−', '-', cleaned_string) 64 | cleaned_string = re.sub(r'', '', cleaned_string) 65 | cleaned_string = re.sub(r'', '', cleaned_string) 66 | cleaned_string = re.sub(r'\\u[0-9a-fA-F]{4}', self.unicode_to_str, cleaned_string) 67 | cleaned_string = re.sub(r'�', '±', cleaned_string) 68 | 
cleaned_string = re.sub(r'm2 g−1', 'm2/g', cleaned_string) 69 | cleaned_string = re.sub(r'μF cm−2', 'μF/cm2', cleaned_string) 70 | cleaned_string = re.sub(r'mA mg−2', 'mA/mg2', cleaned_string) 71 | cleaned_string = re.sub(r'mA mg−1', 'mA/mg', cleaned_string) 72 | cleaned_string = re.sub(r'mA cm−2', 'mA/cm2', cleaned_string) 73 | cleaned_string = re.sub(r'ohm', 'ω', cleaned_string) 74 | cleaned_string = re.sub(r'~', '∼', cleaned_string) 75 | cleaned_string = re.sub(r'', '', cleaned_string) 76 | cleaned_string = re.sub(r'²', '2', cleaned_string) 77 | cleaned_string = re.sub(r'\u2005', ' ', cleaned_string) 78 | cleaned_string = re.sub(r'\u2006', ' ', cleaned_string) 79 | cleaned_string = re.sub(r'\u2009', ' ', cleaned_string) 80 | cleaned_string = re.sub(r'\u200b', ' ', cleaned_string) 81 | cleaned_string = re.sub(r'\u202f', ' ', cleaned_string) 82 | cleaned_string = re.sub(r'\u200e', ' ', cleaned_string) 83 | cleaned_string = re.sub(r'0\s+0', '0,0', cleaned_string) 84 | cleaned_string = re.sub(r'(\d+)∼(\d+)', r'\1-\2', cleaned_string) 85 | cleaned_string = re.sub(r' ', '', cleaned_string) 86 | cleaned_string = re.sub(r'·', '', cleaned_string) 87 | cleaned_string = re.sub(r'fcm−2', 'f/cm2', cleaned_string) 88 | cleaned_string = cleaned_string.lower() 89 | cleaned_string = re.sub(r'jecsa', 'ecsa', cleaned_string) 90 | cleaned_string = re.sub(r'j-ecsa', 'ecsa', cleaned_string) 91 | cleaned_string = re.sub(r'ag-1', 'a/g', cleaned_string) 92 | cleaned_string = re.sub(r'ours', 'thiswork', cleaned_string) 93 | cleaned_string = re.sub(r'0\.m', '0m', cleaned_string) 94 | cleaned_string = re.sub(r';', '', cleaned_string) 95 | cleaned_string = re.sub(r'\.$', '', cleaned_string) 96 | prediction = json.loads(cleaned_string) 97 | 98 | ground_truth = json.dumps(ground_truth, ensure_ascii=False) 99 | cleaned_string = re.sub(r'–', '-', ground_truth) 100 | cleaned_string = re.sub(r'−', '-', cleaned_string) 101 | cleaned_string = re.sub(r'', '', cleaned_string) 102 | cleaned_string = re.sub(r'', '', cleaned_string) 103 | cleaned_string = re.sub(r'\\u[0-9a-fA-F]{4}', self.unicode_to_str, cleaned_string) 104 | cleaned_string = re.sub(r'�', '±', cleaned_string) 105 | cleaned_string = re.sub(r'm2 g−1', 'm2/g', cleaned_string) 106 | cleaned_string = re.sub(r'μF cm−2', 'μF/cm2', cleaned_string) 107 | cleaned_string = re.sub(r'mA mg−2', 'mA/mg2', cleaned_string) 108 | cleaned_string = re.sub(r'mA mg−1', 'mA/mg', cleaned_string) 109 | cleaned_string = re.sub(r'mA cm−2', 'mA/cm2', cleaned_string) 110 | cleaned_string = re.sub(r'ohm', 'ω', cleaned_string) 111 | cleaned_string = re.sub(r'~', '∼', cleaned_string) 112 | cleaned_string = re.sub(r'²', '2', cleaned_string) 113 | cleaned_string = re.sub(r'\u2005', ' ', cleaned_string) 114 | cleaned_string = re.sub(r'\u2006', ' ', cleaned_string) 115 | cleaned_string = re.sub(r'\u2009', ' ', cleaned_string) 116 | cleaned_string = re.sub(r'\u200b', ' ', cleaned_string) 117 | cleaned_string = re.sub(r'\u202f', ' ', cleaned_string) 118 | cleaned_string = re.sub(r'\u200e', ' ', cleaned_string) 119 | cleaned_string = re.sub(r'0\s+0', '0,0', cleaned_string) 120 | cleaned_string = re.sub(r'(\d+)∼(\d+)', r'\1-\2', cleaned_string) 121 | cleaned_string = re.sub(r' ', '', cleaned_string) 122 | cleaned_string = re.sub(r'·', '', cleaned_string) 123 | cleaned_string = re.sub(r'fcm−2', 'f/cm2', cleaned_string) 124 | cleaned_string = cleaned_string.lower() 125 | cleaned_string = re.sub(r'jecsa', 'ecsa', cleaned_string) 126 | cleaned_string = re.sub(r'j-ecsa', 'ecsa', cleaned_string) 127 | 
cleaned_string = re.sub(r'ag-1', 'a/g', cleaned_string) 128 | cleaned_string = re.sub(r'ours', 'thiswork', cleaned_string) 129 | cleaned_string = re.sub(r'0\.m', '0m', cleaned_string) 130 | cleaned_string = re.sub(r';', '', cleaned_string) 131 | cleaned_string = re.sub(r'\.$', '', cleaned_string) 132 | 133 | ground_truth = json.loads(cleaned_string) 134 | prediction = self.remove_unicode_version(prediction) 135 | ground_truth = self.remove_unicode_version(ground_truth) 136 | 137 | prediction = self.remove_whitespace_from_keys(prediction) 138 | ground_truth = self.remove_whitespace_from_keys(ground_truth) 139 | 140 | return prediction, ground_truth 141 | 142 | def get_key_list_with_value(self) : 143 | ''' 144 | return lists that contain key, value sets 145 | ''' 146 | prediction, ground_truth = self.load_data() 147 | pr_list = get_keys(prediction, parent_key = '', sep = '//') 148 | gt_list = get_keys(ground_truth, parent_key = '', sep = '//') 149 | 150 | return pr_list, gt_list 151 | 152 | def merging(self) : 153 | ''' 154 | combine dicts that have the same catalyst name 155 | ''' 156 | pr_list, gt_list = self.get_key_list_with_value() 157 | prediction, ground_truth = self.load_data() 158 | first_key_value_p = prediction[next(iter(prediction))] 159 | 160 | if isinstance(first_key_value_p, list): 161 | 162 | first_dict_p = first_key_value_p[0] 163 | first_value_p = self.first_key(first_dict_p) 164 | 165 | if first_value_p != { } : 166 | try : 167 | if dupl_catalyst(pr_list) : 168 | prediction_ = merging_result(first_key_value_p, pr_list) 169 | prediction = {} 170 | prediction["catalysts"] = prediction_ 171 | except : 172 | pass 173 | else : 174 | pass 175 | else : 176 | pass 177 | 178 | first_key_value_g = ground_truth[next(iter(ground_truth))] 179 | 180 | if isinstance(first_key_value_g, list): 181 | 182 | first_dict_g = first_key_value_g[0] 183 | first_value_g = self.first_key(first_dict_g) 184 | if first_value_g != { } : 185 | 186 | try : 187 | if dupl_catalyst(gt_list) : 188 | 189 | ground_truth_ = merging_result(first_key_value_g, gt_list) 190 | ground_truth = {} 191 | ground_truth["catalysts"] = ground_truth_ 192 | 193 | 194 | except : 195 | pass 196 | else : 197 | pass 198 | else : 199 | pass 200 | 201 | return prediction, ground_truth 202 | 203 | 204 | def first_key(self, d) : 205 | ''' 206 | return the value stored under the first key of a dict 207 | ''' 208 | first_key = next(iter(d)) 209 | 210 | first_value = d[first_key] 211 | return first_value 212 | 213 | def get_key_list_with_value_for_structure(self) : 214 | ''' 215 | return key lists after duplicated catalysts have been merged 216 | ''' 217 | prediction, ground_truth = self.merging() 218 | 219 | pr_list = get_keys(prediction, parent_key = '', sep = '//') 220 | gt_list = get_keys(ground_truth, parent_key = '', sep = '//') 221 | 222 | return pr_list, gt_list 223 | 224 | def run(self) : 225 | ''' 226 | calculate structure F1 score 227 | ''' 228 | TP = [] 229 | FP = [] 230 | FN = [] 231 | corrected = [] 232 | incorrected = [] 233 | 234 | pr_list, gt_list = self.get_key_list_with_value_for_structure() 235 | 236 | pr_list = [item for item in pr_list if 'condition' not in item] 237 | gt_list = [item for item in gt_list if 'condition' not in item] 238 | 239 | structure_pr = [] 240 | structure_gt = [] 241 | for i in pr_list : 242 | if '****' in i : 243 | structure_pr.append(i.split("****")[0]) 244 | else : 245 | if i != '' : 246 | structure_pr.append(i)
247 | 248 | for i in gt_list : 249 | if '****' in i : 250 | structure_gt.append(i.split("****")[0]) 251 | else : 252 | if i != '' : 253 | structure_gt.append(i) 254 | 255 | 256 | f1_pr = add_indices_to_duplicates(structure_pr) 257 | f1_gt = add_indices_to_duplicates(structure_gt) 258 | 259 | for vv in f1_pr : 260 | if vv in f1_gt : 261 | TP.append(vv) 262 | 263 | if vv not in f1_gt : 264 | FP.append(vv) 265 | 266 | for pp in f1_gt : 267 | if pp not in f1_pr : 268 | FN.append(pp) 269 | f1_score_l = [] 270 | 271 | f1_score = (len(TP) / (len(TP) + (1/2)*(len(FP) + len(FN)))) 272 | 273 | print(self.prediction_path.split('/')[-1]) 274 | print(f1_score) 275 | return f1_score, len(TP) 276 | 277 | def run2(self) : 278 | ''' 279 | calculate value accuracy 280 | ''' 281 | print(self.groundTruth_path) 282 | 283 | corrected = [] 284 | incorrected = [] 285 | prediction, ground_truth = self.merging() 286 | 287 | pr_list = get_keys_for_value_accuracy(prediction) 288 | gt_list = get_keys_for_value_accuracy(ground_truth) 289 | 290 | pr_list = [item for item in pr_list if 'condition' not in item] 291 | gt_list = [item for item in gt_list if 'condition' not in item] 292 | 293 | pr_value = [] 294 | gt_value = [] 295 | 296 | for i in pr_list : 297 | if '****' in i : 298 | pr_value.append(i) 299 | for j in gt_list : 300 | if '****' in j : 301 | gt_value.append(j) 302 | 303 | sep_pr_list = seperate_key_value(pr_value) 304 | sep_gt_list = seperate_key_value(gt_value) 305 | 306 | str_val_valset_pr = str_val_valset_split(sep_pr_list) 307 | str_val_valset_gt = str_val_valset_split(sep_gt_list) 308 | 309 | compare_gt = group_by_first_element(str_val_valset_gt) 310 | compare_pr = group_by_first_element(str_val_valset_pr) 311 | 312 | total = 0 313 | if not compare_gt and not compare_pr : 314 | 315 | corrected = [1] 316 | total = 1 317 | 318 | 319 | elif not compare_gt and compare_pr : 320 | right = len(gt_value) - len(pr_value) 321 | wrong = len(pr_value) 322 | for r in range(0, right) : 323 | corrected.append(r) 324 | for w in range(0, wrong) : 325 | incorrected.append(w) 326 | 327 | elif compare_gt and compare_pr : 328 | for c_pr in compare_pr : 329 | for c_gt in compare_gt : 330 | 331 | if c_gt[0][0] in flatten_list(c_pr): 332 | 333 | if len(c_gt) == len(c_pr) : 334 | total = total + len(c_gt) 335 | elif len(c_gt) < len(c_pr) : 336 | total = total + len(c_gt) 337 | elif len(c_gt) > len(c_pr) : 338 | total = total + len(c_pr) 339 | 340 | if len(c_gt) == len(c_pr) and len(c_gt) == 1 : 341 | if c_gt[0][1] == c_pr[0][1] : 342 | corrected.append(c_gt) 343 | else : 344 | incorrected.append(c_gt) 345 | else: 346 | gt_valset = [] 347 | pr_valset = [] 348 | 349 | for gt_unit in c_gt : 350 | if gt_unit[-1] == '': 351 | gt_unit[-1] = gt_unit[1] 352 | gt_valset.append(gt_unit) 353 | else : 354 | gt_valset.append(gt_unit[-1].split('++')) 355 | for pr_unit in c_pr : 356 | if pr_unit[-1] == '': 357 | pr_unit[-1] = pr_unit[1] 358 | pr_valset.append(pr_unit) 359 | else : 360 | pr_valset.append(pr_unit[-1].split('++')) 361 | 362 | while not all_element_int(gt_valset) : 363 | pair = (-1,-1) 364 | max_dupl = float('-inf') 365 | for index_gt, value_gt in enumerate(gt_valset) : 366 | for index_pr, value_pr in enumerate(pr_valset) : 367 | if finding_pair(value_gt, value_pr) > 0 : 368 | if finding_pair(value_gt, value_pr) > max_dupl : 369 | max_dupl = finding_pair(value_gt, value_pr) 370 | pair = (index_gt, index_pr) 371 | 372 | if pair != (-1, -1): 373 | lenlen = len(gt_valset[pair[0]]) 374 | 375 | if c_gt[pair[0]][1] == c_pr[pair[1]][1] : 376
| corrected.append(c_gt[pair[0]][1]) 377 | 378 | else : 379 | incorrected.append(c_gt[pair[0]][1]) 380 | 381 | gt_valset[pair[0]] = [random.randint(0, 1000000) for _ in range(lenlen)] 382 | pr_valset[pair[1]] = [random.randint(0, 1000000) for _ in range(lenlen)] 383 | 384 | 385 | c_gt[pair[0]] = [random.randint(0, 1000000) for _ in range(lenlen)] 386 | c_pr[pair[1]] = [random.randint(0, 1000000) for _ in range(lenlen)] 387 | 388 | else : 389 | break 390 | 391 | total_value_accuracy = len(corrected) / total 392 | return total_value_accuracy 393 | 394 | 395 | 396 | 397 | 398 | if __name__ == '__main__': 399 | 400 | prediction_folder = 'PREDICTION FOLDER PATH' 401 | groundTruth_folder = 'GROUND TRUTH FOLDER PATH' 402 | 403 | 404 | test_list = os.listdir(prediction_folder) 405 | value_ = [] 406 | error = [] 407 | 408 | # CALCULATE STRUCTURE F1 SCORE 409 | for file in test_list : 410 | try : 411 | prediction_path = prediction_folder + '/' + file 412 | groundTruth_path = groundTruth_folder + '/' + file 413 | 414 | score = evaluation(prediction_path, groundTruth_path) 415 | # ##################### value evaluation ###################### 416 | # value = score.run2() 417 | # if value != 0 : 418 | # value_.append(value) 419 | 420 | value, _ = score.run() 421 | value_.append(value) 422 | 423 | except : 424 | value_.append(0) 425 | error.append(file) 426 | 427 | final_value = (sum(value_)) / len(value_) 428 | print('===========final value===========') 429 | print(final_value) 430 | 431 | value_ = [] 432 | # CALCULATE VALUE ACCURACY 433 | for file in test_list : 434 | prediction_path = prediction_folder + '/' + file 435 | groundTruth_path = groundTruth_folder + '/' + file 436 | 437 | score = evaluation(prediction_path, groundTruth_path) 438 | value = score.run2() 439 | value_.append(value) 440 | 441 | final_value = (sum(value_)) / len(value_) 442 | print('===========final value===========') 443 | print(final_value) 444 | -------------------------------------------------------------------------------- /model_evaluation/utils/__pycache__/functions.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/model_evaluation/utils/__pycache__/functions.cpython-37.pyc
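evaluation.py scores a prediction against its ground truth in two ways: run() flattens both JSONs into '//'-delimited key paths (leaf values attached after '****'), strips the value suffixes, and computes a structure F1 = TP / (TP + 0.5·(FP + FN)) over the remaining paths; run2() then matches the '****' value strings for value accuracy. A minimal worked example (not part of the repository; the key paths are illustrative):

# sketch: structure F1 over flattened key paths, as in evaluation.run()
pred = {"Co-BDC//overpotential", "Co-BDC//Tafel slope", "Co-BDC//cost"}
gold = {"Co-BDC//overpotential", "Co-BDC//Tafel slope", "Co-BDC//durability"}

tp = len(pred & gold)              # 2 paths present in both
fp = len(pred - gold)              # 1 spurious path ("cost")
fn = len(gold - pred)              # 1 missed path ("durability")
f1 = tp / (tp + 0.5 * (fp + fn))   # 2 / 3 ≈ 0.67
print(round(f1, 2))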
-------------------------------------------------------------------------------- /model_evaluation/utils/__pycache__/get_keys.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/model_evaluation/utils/__pycache__/get_keys.cpython-37.pyc -------------------------------------------------------------------------------- /model_evaluation/utils/__pycache__/get_keys_function.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/model_evaluation/utils/__pycache__/get_keys_function.cpython-37.pyc -------------------------------------------------------------------------------- /model_evaluation/utils/__pycache__/get_keys_function.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/model_evaluation/utils/__pycache__/get_keys_function.cpython-38.pyc -------------------------------------------------------------------------------- /model_evaluation/utils/functions.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import re 3 | import json 4 | 5 | def get_keys(d, parent_key='', sep='//'): 6 | keys = [] 7 | key_list2 = [] 8 | for k, v in d.items(): 9 | new_key = parent_key+sep+k if parent_key else k 10 | 11 | if isinstance(v, list): 12 | if k == 'ref' : 13 | new_key = new_key + '****' + str(v) 14 | keys.append(new_key) 15 | 16 | else : 17 | keys.append(new_key) 18 | for i in v : 19 | if type(i) != str and type(i) != float and type(i) != int: 20 | keys.extend(get_keys(i, new_key, sep=sep)) 21 | else : 22 | new_key = new_key 23 | if isinstance(v, dict): 24 | keys.append(new_key) 25 | keys.extend(get_keys(v, new_key, sep=sep)) 26 | 27 | if type(v) == str or type(v) == float or type(v) == int: 28 | new_key = new_key + '****' + str(v) 29 | keys.append(new_key) 30 | for j in keys : 31 | if j.split("//")[0] == "catalysts" or j.split("//")[0] == "catalyst" : 32 | j = j.split('//') 33 | j = "//".join(j[1:]) 34 | key_list2.append(j) 35 | else : 36 | key_list2.append(j) 37 | 38 | return key_list2 39 | 40 | def add_indices_to_duplicates(input_list): 41 | index_dict = defaultdict(list) 42 | 43 | for index, item in enumerate(input_list): 44 | index_dict[item].append(index) 45 | 46 | output_list = [] 47 | 48 | for item, indices in index_dict.items(): 49 | if len(indices) > 1: 50 | for i, index in enumerate(indices, 1): 51 | modified_item = item.replace('//', f'//(index{i})') 52 | else: 53 | modified_item = item 54 | output_list.append(modified_item) 55 | else: 56 | output_list.append(item) 57 | 58 | return output_list 59 | 60 | def contains_list(data): 61 | if isinstance(data, list): 62 | return True 63 | if isinstance(data, dict): 64 | for key, value in data.items(): 65 | if contains_list(value): 66 | return True 67 | return False 68 | 69 | def get_keys_for_value_accuracy(d, parent_key='', sep='//'): 70 | keys = [] 71 | key_list2 = [] 72 | for k, v in d.items(): 73 | new_key = parent_key + sep + k if parent_key else k 74 | 75 | if isinstance(v, list): 76 | keys.append(new_key) 77 | for i in v: 78 | if contains_list(i) == False : 79 | new_list = [] 80 | value_list = [] 81 | for performance_property, performance_value in 
i.items(): 82 | if performance_property != 'condition' : 83 | if isinstance(performance_value, str): 84 | 85 | value_list.append(performance_value) 86 | 87 | elif isinstance(performance_value, dict): 88 | for p_p, p_v in performance_value.items(): 89 | if isinstance(p_v, str): 90 | 91 | value_list.append(p_v) 92 | 93 | 94 | total_value = '(('+'++'.join(value_list) + '))' 95 | 96 | new_dict = {} 97 | for kkk, vvv in i.items(): 98 | if isinstance(vvv, str): 99 | new_dict[kkk+total_value] = vvv 100 | elif isinstance(vvv, dict): 101 | kkk +=total_value 102 | new_dict[kkk] = {} 103 | 104 | for kk, vv in vvv.items(): 105 | new_dict[kkk][kk+total_value] = vv 106 | new_list.append(new_dict) 107 | changed_dict = new_dict 108 | 109 | keys.extend(get_keys_for_value_accuracy(changed_dict, new_key, sep=sep)) 110 | 111 | if contains_list(i) ==True : # 112 | keys.extend(get_keys_for_value_accuracy(i, new_key, sep=sep)) 113 | 114 | 115 | if isinstance(v, dict): 116 | keys.append(new_key) 117 | keys.extend(get_keys_for_value_accuracy(v, new_key, sep=sep)) 118 | 119 | if type(v) == str: 120 | new_key = new_key + '****' + v 121 | keys.append(new_key) 122 | 123 | for j in keys: 124 | if j.split("//")[0] == "catalysts" or j.split("//")[0] == "catalyst": 125 | j = j.split('//') 126 | j = "//".join(j[1:]) 127 | key_list2.append(j) 128 | else: 129 | key_list2.append(j) 130 | 131 | return key_list2 132 | 133 | def seperate_key_value(input_list): 134 | pattern = re.compile(r'\(\((.*?)\)\)') 135 | new_list = [] 136 | for item in input_list: 137 | match = pattern.search(item) 138 | if match: 139 | content = match.group(1) 140 | result = re.sub(pattern, '', item) 141 | new_list.append([result, content]) 142 | else: 143 | new_list.append([item]) 144 | 145 | return new_list 146 | 147 | def remove_whitespace_from_keys(data): 148 | if isinstance(data, dict): 149 | return {key.replace(" ", ""): remove_whitespace_from_keys(value) for key, value in data.items()} 150 | elif isinstance(data, list): 151 | return [remove_whitespace_from_keys(item) for item in data] 152 | else: 153 | return data 154 | 155 | def str_val_valset_split(list) : 156 | new_list = [] 157 | for item in list: 158 | a = [] 159 | for i in item : 160 | substrings = i.split('****') 161 | a.append(substrings[0]) 162 | 163 | if len(substrings) > 1: 164 | a.extend(substrings[1].split('++')) 165 | 166 | new_list.append(a) 167 | 168 | return new_list 169 | 170 | def group_by_first_element(list1): 171 | result = {} 172 | for sublist in list1: 173 | key = sublist[0] 174 | if key in result: 175 | result[key].append(sublist) 176 | else: 177 | result[key] = [sublist] 178 | return list(result.values()) 179 | 180 | 181 | def finding_pair(list1, list2) : 182 | intersection = len(set(list1) & set(list2)) 183 | return intersection 184 | 185 | 186 | def all_element_int( lst): 187 | 188 | for element in lst: 189 | if isinstance(element, list): 190 | if not all_element_int(element): 191 | return False 192 | elif not isinstance(element, int): 193 | return False 194 | return True 195 | 196 | def catalyst_performance(gt_list): 197 | gt_v = [j for j in gt_list if '****' in j] 198 | result_list_ = [k for k in gt_v if 'ref' not in k] 199 | result_list__ = [k for k in result_list_ if 'loading' not in k] 200 | result_list_ref = [k for k in gt_v if 'ref' in k] 201 | result_list_loading = [k for k in gt_v if 'loading' in k] 202 | result_list = [] 203 | for item in result_list__ : 204 | a = item.split('//')[:-1] 205 | b = '//'.join(a) 206 | result_list.append(b) 207 | no_dupl = 
list(set(result_list)) 208 | return no_dupl, result_list_ref, result_list_loading 209 | 210 | def count_number(catalyst_list, first, second ) : 211 | count_ = [] 212 | for index, cata in enumerate(catalyst_list): 213 | if first in cata and second in cata[first] and cata[first][second]: 214 | count_.append(index) 215 | return count_ 216 | 217 | def making_new_dict(catalyst_list, first, second, count_lst): 218 | new_dict = {} 219 | new_value = [] 220 | 221 | if len(count_lst) > 1 : 222 | for value_index in count_lst : 223 | new_value.append(catalyst_list[value_index][first][second]) 224 | 225 | if any(isinstance(sublist, list) for sublist in new_value) : 226 | new_value = [item for sublist in new_value for item in (sublist if isinstance(sublist, list) else [sublist])] 227 | new_dict.setdefault(first, {})[second] = new_value 228 | 229 | else : 230 | new_dict = catalyst_list[count_lst[0]] 231 | return new_dict 232 | 233 | def dupl_catalyst(lst) : 234 | catalyst = [] 235 | for i in lst : 236 | if '//' not in i : 237 | catalyst.append(i) 238 | 239 | if len(catalyst) != len(list(set(catalyst))) : 240 | return True 241 | 242 | else : 243 | return False 244 | 245 | def merging_result(catalyst_list, lst): 246 | no_dupl, result_list_ref, result_list_loading = catalyst_performance(lst) 247 | new_result = [] 248 | for cata_perfo in no_dupl : 249 | f_s_lst = cata_perfo.split('//') 250 | 251 | if len(f_s_lst) >1 : 252 | first = f_s_lst[0] 253 | second = f_s_lst[1] 254 | count = count_number(catalyst_list, first, second) 255 | new_dict = making_new_dict(catalyst_list, first,second, count) 256 | new_result.append(new_dict) 257 | 258 | ref_dict = {} 259 | loading_dict = {} 260 | 261 | for ref in result_list_ref : 262 | 263 | if '//' in ref : 264 | ref_catalyst = ref.split('//')[0] 265 | reference = str(ref.split('****')[-1]) 266 | ref_dict.setdefault(ref_catalyst, {})['ref'] = reference 267 | if ref_dict not in new_result : 268 | new_result.append(ref_dict) 269 | 270 | else : 271 | reference = str(ref.split('****')[-1]) 272 | ref_dict["ref"] = reference 273 | new_result.append(ref_dict) 274 | 275 | for lo in result_list_loading : 276 | if '//' in lo : 277 | lo_catalyst = lo.split('//')[0] 278 | loading = str(lo.split('****')[-1]) 279 | loading_dict.setdefault(lo_catalyst, {})["loading"] = loading 280 | new_result.append(loading_dict) 281 | else : 282 | loading = str(lo.split('****')[-1]) 283 | loading_dict["loading"] = loading 284 | new_result.append(loading_dict) 285 | result_dict = {} 286 | for item in new_result: 287 | key = next(iter(item)) 288 | if key in result_dict: 289 | result_dict[key].update(item[key]) 290 | else: 291 | result_dict[key] = item[key] 292 | result_list = [{key: value} for key, value in result_dict.items()] 293 | return result_list 294 | 295 | def flatten_list(nested_list): 296 | flat_list = [] 297 | for element in nested_list: 298 | if isinstance(element, list): 299 | flat_list.extend(flatten_list(element)) 300 | else: 301 | flat_list.append(element) 302 | return flat_list 303 | -------------------------------------------------------------------------------- /model_script.json: -------------------------------------------------------------------------------- 1 | { 2 | "model" :{ 3 | "fine_tuning" : { 4 | "input_path" : "./data/split/tsv_representation/", 5 | "output_path" : "./data/result/fine_tuning", 6 | "fq" : { 7 | "path" : "./data/result/fine_tuning" 8 | } 9 | }, 10 | "few_shot" : { 11 | "input_path" : "./data/split/tsv_representation/", 12 | "output_path" :
"./data/result/few_shot", 13 | "fq" : { 14 | "path" : "./data/result/few_shot" 15 | } 16 | }, 17 | "zero_shot" : { 18 | "input_path" : "./data/split/tsv_representation/", 19 | "output_path" : "./data/result/zero_shot", 20 | "fq" : { 21 | "path" : "./data/result/zero_shot" 22 | } 23 | } 24 | } 25 | } -------------------------------------------------------------------------------- /requirements_conda.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: win-64 4 | absl-py=0.14.1=pypi_0 5 | aiohttp=3.8.1=pypi_0 6 | aiosignal=1.2.0=pypi_0 7 | alabaster=0.7.12=pypi_0 8 | allennlp=0.9.0=pypi_0 9 | altair=4.2.2=pypi_0 10 | annotated-types=0.5.0=pypi_0 11 | anyio=3.7.1=pypi_0 12 | appdirs=1.4.4=pypi_0 13 | argcomplete=1.12.3=pypi_0 14 | argon2-cffi=21.1.0=pypi_0 15 | astunparse=1.6.3=pypi_0 16 | async-generator=1.10=pypi_0 17 | async-timeout=4.0.1=pypi_0 18 | asynctest=0.13.0=pypi_0 19 | atomicwrites=1.4.0=pypi_0 20 | attrs=22.2.0=pypi_0 21 | babel=2.9.1=pypi_0 22 | backcall=0.2.0=pypi_0 23 | backports-csv=1.0.7=pypi_0 24 | backports-zoneinfo=0.2.1=pypi_0 25 | base58=2.1.1=pypi_0 26 | beautifulsoup4=4.11.2=pypi_0 27 | behave=1.2.6=pypi_0 28 | bertopic=0.14.1=pypi_0 29 | blas=1.0=mkl 30 | bleach=4.1.0=pypi_0 31 | blinker=1.6=pypi_0 32 | blis=0.2.4=pypi_0 33 | boto3=1.19.2=pypi_0 34 | botocore=1.22.2=pypi_0 35 | bottleneck=1.3.2=py37h2a96729_1 36 | bpemb=0.3.3=pypi_0 37 | ca-certificates=2021.9.30=haa95532_1 38 | cached-path=0.3.2=pypi_0 39 | cached-property=1.5.2=pypi_0 40 | cachetools=4.2.4=pypi_0 41 | catalogue=2.0.6=pypi_0 42 | certifi=2022.12.7=pypi_0 43 | cffi=1.15.1=pypi_0 44 | chardet=5.1.0=pypi_0 45 | charset-normalizer=2.1.1=pypi_0 46 | checklist=0.0.11=pypi_0 47 | chemdataextractor=1.3.0=pypi_0 48 | cheroot=8.5.2=pypi_0 49 | cherrypy=18.6.1=pypi_0 50 | chromedriver-autoinstaller=0.4.0=pypi_0 51 | click=8.0.3=pypi_0 52 | cloudpickle=2.0.0=pypi_0 53 | colorama=0.4.6=pypi_0 54 | configparser=5.1.0=pypi_0 55 | conllu=4.0=pypi_0 56 | cryptography=39.0.1=pypi_0 57 | cssselect=1.2.0=pypi_0 58 | cycler=0.10.0=pypi_0 59 | cymem=2.0.6=pypi_0 60 | cython=0.29.14=pypi_0 61 | datasets=1.16.1=pypi_0 62 | dawg=0.8.0=pypi_0 63 | debugpy=1.5.1=pypi_0 64 | decorator=5.1.0=pypi_0 65 | deepdiff=6.5.0=pypi_0 66 | defusedxml=0.7.1=pypi_0 67 | deprecated=1.2.13=pypi_0 68 | dill=0.3.4=pypi_0 69 | distro=1.9.0=pypi_0 70 | docker-pycreds=0.4.0=pypi_0 71 | docutils=0.17.1=pypi_0 72 | editdistance=0.6.0=pypi_0 73 | entrypoints=0.3=pypi_0 74 | et-xmlfile=1.1.0=pypi_0 75 | exceptiongroup=1.1.0=pypi_0 76 | execnet=1.9.0=pypi_0 77 | fairscale=0.4.0=pypi_0 78 | fasteners=0.18=pypi_0 79 | feedparser=6.0.8=pypi_0 80 | filelock=3.9.0=pypi_0 81 | flair=0.9=pypi_0 82 | flaky=3.7.0=pypi_0 83 | flask=2.0.2=pypi_0 84 | flask-cors=3.0.10=pypi_0 85 | flatbuffers=1.12=pypi_0 86 | flatten-dict=0.4.2=pypi_0 87 | frozenlist=1.2.0=pypi_0 88 | fsspec=2021.11.1=pypi_0 89 | ftfy=6.0.3=pypi_0 90 | funcy=2.0=pypi_0 91 | future=0.18.2=pypi_0 92 | gast=0.3.3=pypi_0 93 | gdown=3.12.2=pypi_0 94 | gensim=3.8.3=pypi_0 95 | gevent=21.8.0=pypi_0 96 | gitdb=4.0.9=pypi_0 97 | gitpython=3.1.24=pypi_0 98 | glove-python-binary=0.2.0=pypi_0 99 | google-api-core=2.2.2=pypi_0 100 | google-auth=2.3.0=pypi_0 101 | google-auth-oauthlib=0.4.6=pypi_0 102 | google-cloud-core=2.2.1=pypi_0 103 | google-cloud-storage=1.43.0=pypi_0 104 | google-crc32c=1.3.0=pypi_0 105 | google-pasta=0.2.0=pypi_0 106 | 
google-resumable-media=2.1.0=pypi_0 107 | googleapis-common-protos=1.53.0=pypi_0 108 | greenlet=1.1.2=pypi_0 109 | grpcio=1.32.0=pypi_0 110 | h11=0.14.0=pypi_0 111 | h2o=3.40.0.4=pypi_0 112 | h5py=2.10.0=pypi_0 113 | hdbscan=0.8.29=pypi_0 114 | httpcore=0.17.3=pypi_0 115 | httpx=0.24.1=pypi_0 116 | huggingface-hub=0.13.3=pypi_0 117 | hyperopt=0.2.5=pypi_0 118 | idna=3.4=pypi_0 119 | imagesize=1.2.0=pypi_0 120 | importlib-metadata=6.6.0=pypi_0 121 | importlib-resources=5.4.0=pypi_0 122 | iniconfig=2.0.0=pypi_0 123 | install-jdk=1.0.4=pypi_0 124 | intel-openmp=2021.3.0=haa95532_3372 125 | ipykernel=6.5.1=pypi_0 126 | ipython=7.30.0=pypi_0 127 | ipython-genutils=0.2.0=pypi_0 128 | ipywidgets=7.6.5=pypi_0 129 | iso-639=0.4.5=pypi_0 130 | itsdangerous=2.0.1=pypi_0 131 | janome=0.4.1=pypi_0 132 | jaraco-classes=3.2.1=pypi_0 133 | jaraco-collections=3.4.0=pypi_0 134 | jaraco-functools=3.4.0=pypi_0 135 | jaraco-text=3.6.0=pypi_0 136 | jedi=0.18.1=pypi_0 137 | jinja2=3.0.2=pypi_0 138 | jmespath=0.10.0=pypi_0 139 | joblib=1.1.0=pypi_0 140 | jsonlines=3.1.0=pypi_0 141 | jsonpickle=2.0.0=pypi_0 142 | jsonschema=4.2.1=pypi_0 143 | jupyter=1.0.0=pypi_0 144 | jupyter-client=7.1.0=pypi_0 145 | jupyter-console=6.4.0=pypi_0 146 | jupyter-core=4.9.1=pypi_0 147 | jupyterlab-pygments=0.1.2=pypi_0 148 | jupyterlab-widgets=1.0.2=pypi_0 149 | keras-preprocessing=1.1.2=pypi_0 150 | keyring=23.9.3=pypi_0 151 | kiwisolver=1.3.2=pypi_0 152 | konoha=4.6.5=pypi_0 153 | langdetect=1.0.9=pypi_0 154 | llvmlite=0.39.1=pypi_0 155 | lmdb=1.2.1=pypi_0 156 | lxml=4.6.3=pypi_0 157 | markdown=3.3.4=pypi_0 158 | markdown-it-py=2.1.0=pypi_0 159 | markupsafe=2.0.1=pypi_0 160 | matplotlib=3.4.3=pypi_0 161 | matplotlib-inline=0.1.3=pypi_0 162 | mdurl=0.1.2=pypi_0 163 | mistune=0.8.4=pypi_0 164 | mkl=2021.3.0=haa95532_524 165 | mkl-service=2.4.0=py37h2bbff1b_0 166 | mkl_fft=1.3.0=py37h277e83a_2 167 | mkl_random=1.2.2=py37hf11a4ad_0 168 | mlxtend=0.23.1=pypi_0 169 | more-itertools=9.0.0=pypi_0 170 | mpld3=0.3=pypi_0 171 | multidict=5.2.0=pypi_0 172 | multiprocess=0.70.12.2=pypi_0 173 | munch=2.5.0=pypi_0 174 | murmurhash=1.0.6=pypi_0 175 | nbclient=0.5.9=pypi_0 176 | nbconvert=6.3.0=pypi_0 177 | nbformat=5.1.3=pypi_0 178 | nest-asyncio=1.5.1=pypi_0 179 | networkx=2.6.3=pypi_0 180 | nltk=3.6.5=pypi_0 181 | nose=1.3.7=pypi_0 182 | notebook=6.4.6=pypi_0 183 | numba=0.56.4=pypi_0 184 | numexpr=2.7.3=py37hb80d3ca_1 185 | numpy=1.19.5=pypi_0 186 | numpy-base=1.21.2=py37h0829f74_0 187 | numpydoc=1.1.0=pypi_0 188 | oauthlib=3.1.1=pypi_0 189 | openai=1.16.2=pypi_0 190 | openpyxl=3.0.9=pypi_0 191 | openssl=1.1.1l=h2bbff1b_0 192 | opt-einsum=3.3.0=pypi_0 193 | ordered-set=4.1.0=pypi_0 194 | outcome=1.2.0=pypi_0 195 | overrides=3.1.0=pypi_0 196 | packaging=23.0=pypi_0 197 | pandas=1.3.3=py37h6214cd6_0 198 | pandocfilters=1.5.0=pypi_0 199 | parameterized=0.8.1=pypi_0 200 | parse=1.19.0=pypi_0 201 | parse-type=0.6.0=pypi_0 202 | parsimonious=0.8.1=pypi_0 203 | parso=0.8.2=pypi_0 204 | pathtools=0.1.2=pypi_0 205 | pathy=0.6.1=pypi_0 206 | patternfork-nosql=3.6=pypi_0 207 | pdbp=1.2.8=pypi_0 208 | pdfminer=20191125=pypi_0 209 | pdfminer-six=20211012=pypi_0 210 | pickle5=0.0.12=pypi_0 211 | pickleshare=0.7.5=pypi_0 212 | pillow=8.4.0=pypi_0 213 | pip=24.0=pypi_0 214 | plac=0.9.6=pypi_0 215 | platformdirs=3.0.0=pypi_0 216 | plotly=5.14.1=pypi_0 217 | pluggy=1.0.0=pypi_0 218 | portend=3.1.0=pypi_0 219 | preshed=2.0.1=pypi_0 220 | prometheus-client=0.12.0=pypi_0 221 | promise=2.3=pypi_0 222 | prompt-toolkit=3.0.23=pypi_0 223 | protobuf=3.18.1=pypi_0 224 | 
psutil=5.8.0=pypi_0 225 | py=1.11.0=pypi_0 226 | pyarrow=6.0.1=pypi_0 227 | pyasn1=0.4.8=pypi_0 228 | pyasn1-modules=0.2.8=pypi_0 229 | pycparser=2.21=pypi_0 230 | pycryptodome=3.11.0=pypi_0 231 | pydantic=2.5.3=pypi_0 232 | pydantic-core=2.14.6=pypi_0 233 | pydeck=0.8.0=pypi_0 234 | pygments=2.14.0=pypi_0 235 | pyldavis=3.3.1=pypi_0 236 | pympler=1.0.1=pypi_0 237 | pynndescent=0.5.10=pypi_0 238 | pyopenssl=23.0.0=pypi_0 239 | pyotp=2.8.0=pypi_0 240 | pyparsing=2.4.7=pypi_0 241 | pyreadline3=3.4.1=pypi_0 242 | pyrsistent=0.18.0=pypi_0 243 | pysocks=1.7.1=pypi_0 244 | pytest=7.2.1=pypi_0 245 | pytest-forked=1.6.0=pypi_0 246 | pytest-html=2.0.1=pypi_0 247 | pytest-metadata=2.0.4=pypi_0 248 | pytest-ordering=0.6=pypi_0 249 | pytest-rerunfailures=11.1.1=pypi_0 250 | pytest-xdist=3.2.0=pypi_0 251 | python=3.7.11=h6244533_0 252 | python-crfsuite=0.9.7=pypi_0 253 | python-dateutil=2.8.2=pyhd3eb1b0_0 254 | python-docx=0.8.11=pypi_0 255 | python-dotenv=0.21.1=pypi_0 256 | pytorch-pretrained-bert=0.6.2=pypi_0 257 | pytorch-transformers=1.1.0=pypi_0 258 | pytz=2021.3=pyhd3eb1b0_0 259 | pytz-deprecation-shim=0.1.0.post0=pypi_0 260 | pywin32=302=pypi_0 261 | pywin32-ctypes=0.2.0=pypi_0 262 | pywinpty=1.1.6=pypi_0 263 | pyyaml=6.0=pypi_0 264 | pyzmq=22.3.0=pypi_0 265 | qtconsole=5.2.1=pypi_0 266 | qtpy=1.11.2=pypi_0 267 | regex=2021.10.8=pypi_0 268 | requests=2.28.2=pypi_0 269 | requests-oauthlib=1.3.0=pypi_0 270 | requests-toolbelt=0.10.1=pypi_0 271 | responses=0.14.0=pypi_0 272 | rich=13.3.1=pypi_0 273 | rsa=4.7.2=pypi_0 274 | s3transfer=0.5.0=pypi_0 275 | sacremoses=0.0.46=pypi_0 276 | sbvirtualdisplay=1.2.0=pypi_0 277 | schematics=2.1.1=pypi_0 278 | scikit-learn=1.0.2=pypi_0 279 | scipy=1.7.1=pypi_0 280 | segtok=1.5.10=pypi_0 281 | selenium=4.8.2=pypi_0 282 | seleniumbase=4.13.0=pypi_0 283 | semver=3.0.0=pypi_0 284 | send2trash=1.8.0=pypi_0 285 | sentence-transformers=2.2.2=pypi_0 286 | sentencepiece=0.1.95=pypi_0 287 | sentry-sdk=1.5.0=pypi_0 288 | seqeval=1.2.2=pypi_0 289 | setuptools=67.3.3=pypi_0 290 | sgmllib3k=1.0.0=pypi_0 291 | shortuuid=1.0.8=pypi_0 292 | simpletransformers=0.63.9=pypi_0 293 | six=1.15.0=pypi_0 294 | sklearn=0.0.post5=pypi_0 295 | smart-open=5.2.1=pypi_0 296 | smmap=5.0.0=pypi_0 297 | sniffio=1.3.0=pypi_0 298 | snowballstemmer=2.1.0=pypi_0 299 | sortedcontainers=2.4.0=pypi_0 300 | soupsieve=2.4=pypi_0 301 | spacy=2.1.9=pypi_0 302 | spacy-legacy=3.0.8=pypi_0 303 | sphinx=4.2.0=pypi_0 304 | sphinxcontrib-applehelp=1.0.2=pypi_0 305 | sphinxcontrib-devhelp=1.0.2=pypi_0 306 | sphinxcontrib-htmlhelp=2.0.0=pypi_0 307 | sphinxcontrib-jsmath=1.0.1=pypi_0 308 | sphinxcontrib-qthelp=1.0.3=pypi_0 309 | sphinxcontrib-serializinghtml=1.1.5=pypi_0 310 | sqlite=3.36.0=h2bbff1b_0 311 | sqlitedict=1.7.0=pypi_0 312 | sqlparse=0.4.2=pypi_0 313 | srsly=1.0.5=pypi_0 314 | streamlit=1.20.0=pypi_0 315 | subprocess32=3.5.4=pypi_0 316 | tabcompleter=1.1.0=pypi_0 317 | tabulate=0.8.9=pypi_0 318 | tempora=4.1.2=pypi_0 319 | tenacity=8.2.2=pypi_0 320 | tensorboard=2.7.0=pypi_0 321 | tensorboard-data-server=0.6.1=pypi_0 322 | tensorboard-plugin-wit=1.8.0=pypi_0 323 | tensorboardx=2.4=pypi_0 324 | tensorflow-estimator=2.4.0=pypi_0 325 | tensorflow-gpu=2.4.1=pypi_0 326 | termcolor=1.1.0=pypi_0 327 | terminado=0.12.1=pypi_0 328 | testpath=0.5.0=pypi_0 329 | thinc=7.0.8=pypi_0 330 | threadpoolctl=3.0.0=pypi_0 331 | tokenizers=0.13.2=pypi_0 332 | toml=0.10.2=pypi_0 333 | tomli=2.0.1=pypi_0 334 | toolz=0.12.0=pypi_0 335 | torch=1.7.1+cu110=pypi_0 336 | torchaudio=0.7.2=pypi_0 337 | torchcrf=1.1.0=pypi_0 338 | 
torchvision=0.8.2+cu110=pypi_0 339 | tornado=6.1=pypi_0 340 | tqdm=4.64.1=pypi_0 341 | traitlets=5.1.1=pypi_0 342 | transformers=4.27.4=pypi_0 343 | trio=0.22.0=pypi_0 344 | trio-websocket=0.9.2=pypi_0 345 | typer=0.4.0=pypi_0 346 | typing-extensions=4.7.1=pypi_0 347 | tzdata=2023.3=pypi_0 348 | tzlocal=4.3=pypi_0 349 | umap-learn=0.5.3=pypi_0 350 | unidecode=1.3.2=pypi_0 351 | urllib3=1.26.14=pypi_0 352 | validators=0.20.0=pypi_0 353 | vc=14.2=h21ff451_1 354 | vs2015_runtime=14.27.29016=h5e58377_2 355 | wandb=0.12.7=pypi_0 356 | wasabi=0.8.2=pypi_0 357 | watchdog=3.0.0=pypi_0 358 | wcwidth=0.2.5=pypi_0 359 | webdriver-manager=3.8.5=pypi_0 360 | webencodings=0.5.1=pypi_0 361 | websockets=10.4=pypi_0 362 | werkzeug=2.0.2=pypi_0 363 | wheel=0.38.4=pypi_0 364 | widgetsnbextension=3.5.2=pypi_0 365 | wikipedia-api=0.5.4=pypi_0 366 | wincertstore=0.2=py37haa95532_2 367 | word2number=1.1=pypi_0 368 | wrapt=1.12.1=pypi_0 369 | wsproto=1.2.0=pypi_0 370 | xxhash=2.0.2=pypi_0 371 | yarl=1.7.2=pypi_0 372 | yaspin=2.1.0=pypi_0 373 | zc-lockfile=2.0=pypi_0 374 | zipp=3.14.0=pypi_0 375 | zope-event=4.5.0=pypi_0 376 | zope-interface=5.4.0=pypi_0 377 | -------------------------------------------------------------------------------- /requirements_pip.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.14.1 2 | aiohttp==3.8.1 3 | aiosignal==1.2.0 4 | alabaster==0.7.12 5 | allennlp==0.9.0 6 | altair==4.2.2 7 | annotated-types==0.5.0 8 | anyio==3.7.1 9 | appdirs==1.4.4 10 | argcomplete==1.12.3 11 | argon2-cffi==21.1.0 12 | astunparse==1.6.3 13 | async-generator==1.10 14 | async-timeout==4.0.1 15 | asynctest==0.13.0 16 | atomicwrites==1.4.0 17 | attrs==22.2.0 18 | Babel==2.9.1 19 | backcall==0.2.0 20 | backports.csv==1.0.7 21 | backports.zoneinfo==0.2.1 22 | base58==2.1.1 23 | beautifulsoup4==4.11.2 24 | behave==1.2.6 25 | bertopic==0.14.1 26 | bleach==4.1.0 27 | blinker==1.6 28 | blis==0.2.4 29 | boto3==1.19.2 30 | botocore==1.22.2 31 | Bottleneck==1.3.2 32 | bpemb==0.3.3 33 | cached-path==0.3.2 34 | cached-property==1.5.2 35 | cachetools==4.2.4 36 | catalogue==2.0.6 37 | certifi==2022.12.7 38 | cffi==1.15.1 39 | chardet==5.1.0 40 | charset-normalizer==2.1.1 41 | checklist==0.0.11 42 | ChemDataExtractor==1.3.0 43 | cheroot==8.5.2 44 | CherryPy==18.6.1 45 | chromedriver-autoinstaller==0.4.0 46 | click==8.0.3 47 | cloudpickle==2.0.0 48 | colorama==0.4.6 49 | configparser==5.1.0 50 | conllu==4.0 51 | cryptography==39.0.1 52 | cssselect==1.2.0 53 | cycler==0.10.0 54 | cymem==2.0.6 55 | Cython==0.29.14 56 | datasets==1.16.1 57 | DAWG==0.8.0 58 | debugpy==1.5.1 59 | decorator==5.1.0 60 | deepdiff==6.5.0 61 | defusedxml==0.7.1 62 | Deprecated==1.2.13 63 | dill==0.3.4 64 | distro==1.9.0 65 | docker-pycreds==0.4.0 66 | docutils==0.17.1 67 | editdistance==0.6.0 68 | entrypoints==0.3 69 | et-xmlfile==1.1.0 70 | exceptiongroup==1.1.0 71 | execnet==1.9.0 72 | fairscale==0.4.0 73 | fasteners==0.18 74 | feedparser==6.0.8 75 | filelock==3.9.0 76 | flair==0.9 77 | flaky==3.7.0 78 | Flask==2.0.2 79 | Flask-Cors==3.0.10 80 | flatbuffers==1.12 81 | flatten-dict==0.4.2 82 | frozenlist==1.2.0 83 | fsspec==2021.11.1 84 | ftfy==6.0.3 85 | funcy==2.0 86 | future==0.18.2 87 | gast==0.3.3 88 | gdown==3.12.2 89 | gensim==3.8.3 90 | gevent==21.8.0 91 | gitdb==4.0.9 92 | GitPython==3.1.24 93 | glove-python-binary==0.2.0 94 | google-api-core==2.2.2 95 | google-auth==2.3.0 96 | google-auth-oauthlib==0.4.6 97 | google-cloud-core==2.2.1 98 | google-cloud-storage==1.43.0 99 | google-crc32c==1.3.0 
100 | google-pasta==0.2.0 101 | google-resumable-media==2.1.0 102 | googleapis-common-protos==1.53.0 103 | greenlet==1.1.2 104 | grpcio==1.32.0 105 | h11==0.14.0 106 | h2o==3.40.0.4 107 | h5py==2.10.0 108 | hdbscan==0.8.29 109 | httpcore==0.17.3 110 | httpx==0.24.1 111 | huggingface-hub==0.13.3 112 | hyperopt==0.2.5 113 | idna==3.4 114 | imagesize==1.2.0 115 | importlib-metadata==6.6.0 116 | importlib-resources==5.4.0 117 | iniconfig==2.0.0 118 | install-jdk==1.0.4 119 | ipykernel==6.5.1 120 | ipython==7.30.0 121 | ipython-genutils==0.2.0 122 | ipywidgets==7.6.5 123 | iso-639==0.4.5 124 | itsdangerous==2.0.1 125 | Janome==0.4.1 126 | jaraco.classes==3.2.1 127 | jaraco.collections==3.4.0 128 | jaraco.functools==3.4.0 129 | jaraco.text==3.6.0 130 | jedi==0.18.1 131 | Jinja2==3.0.2 132 | jmespath==0.10.0 133 | joblib==1.1.0 134 | jsonlines==3.1.0 135 | jsonpickle==2.0.0 136 | jsonschema==4.2.1 137 | jupyter==1.0.0 138 | jupyter-client==7.1.0 139 | jupyter-console==6.4.0 140 | jupyter-core==4.9.1 141 | jupyterlab-pygments==0.1.2 142 | jupyterlab-widgets==1.0.2 143 | Keras-Preprocessing==1.1.2 144 | keyring==23.9.3 145 | kiwisolver==1.3.2 146 | konoha==4.6.5 147 | langdetect==1.0.9 148 | llvmlite==0.39.1 149 | lmdb==1.2.1 150 | lxml==4.6.3 151 | Markdown==3.3.4 152 | markdown-it-py==2.1.0 153 | MarkupSafe==2.0.1 154 | matplotlib==3.4.3 155 | matplotlib-inline==0.1.3 156 | mdurl==0.1.2 157 | mistune==0.8.4 158 | mkl-fft==1.3.0 159 | mkl-random @ file:///C:/ci/mkl_random_1626186163140/work 160 | mkl-service==2.4.0 161 | mlxtend==0.23.1 162 | more-itertools==9.0.0 163 | mpld3==0.3 164 | multidict==5.2.0 165 | multiprocess==0.70.12.2 166 | munch==2.5.0 167 | murmurhash==1.0.6 168 | nbclient==0.5.9 169 | nbconvert==6.3.0 170 | nbformat==5.1.3 171 | nest-asyncio==1.5.1 172 | networkx==2.6.3 173 | nltk==3.6.5 174 | nose==1.3.7 175 | notebook==6.4.6 176 | numba==0.56.4 177 | numexpr @ file:///C:/ci/numexpr_1618856761305/work 178 | numpy==1.19.5 179 | numpydoc==1.1.0 180 | oauthlib==3.1.1 181 | openai==1.16.2 182 | openpyxl==3.0.9 183 | opt-einsum==3.3.0 184 | ordered-set==4.1.0 185 | outcome==1.2.0 186 | overrides==3.1.0 187 | packaging==23.0 188 | pandas @ file:///C:/ci/pandas_1632920019983/work 189 | pandocfilters==1.5.0 190 | parameterized==0.8.1 191 | parse==1.19.0 192 | parse-type==0.6.0 193 | parsimonious==0.8.1 194 | parso==0.8.2 195 | pathtools==0.1.2 196 | pathy==0.6.1 197 | patternfork-nosql==3.6 198 | pdbp==1.2.8 199 | pdfminer==20191125 200 | pdfminer.six==20211012 201 | pickle5==0.0.12 202 | pickleshare==0.7.5 203 | Pillow==8.4.0 204 | plac==0.9.6 205 | platformdirs==3.0.0 206 | plotly==5.14.1 207 | pluggy==1.0.0 208 | portend==3.1.0 209 | preshed==2.0.1 210 | prometheus-client==0.12.0 211 | promise==2.3 212 | prompt-toolkit==3.0.23 213 | protobuf==3.18.1 214 | psutil==5.8.0 215 | py==1.11.0 216 | pyarrow==6.0.1 217 | pyasn1==0.4.8 218 | pyasn1-modules==0.2.8 219 | pycparser==2.21 220 | pycryptodome==3.11.0 221 | pydantic==2.5.3 222 | pydantic_core==2.14.6 223 | pydeck==0.8.0 224 | Pygments==2.14.0 225 | pyLDAvis==3.3.1 226 | Pympler==1.0.1 227 | pynndescent==0.5.10 228 | pyOpenSSL==23.0.0 229 | pyotp==2.8.0 230 | pyparsing==2.4.7 231 | pyreadline3==3.4.1 232 | pyrsistent==0.18.0 233 | PySocks==1.7.1 234 | pytest==7.2.1 235 | pytest-forked==1.6.0 236 | pytest-html==2.0.1 237 | pytest-metadata==2.0.4 238 | pytest-ordering==0.6 239 | pytest-rerunfailures==11.1.1 240 | pytest-xdist==3.2.0 241 | python-crfsuite==0.9.7 242 | python-dateutil @ 
file:///tmp/build/80754af9/python-dateutil_1626374649649/work 243 | python-docx==0.8.11 244 | python-dotenv==0.21.1 245 | pytorch-pretrained-bert==0.6.2 246 | pytorch-transformers==1.1.0 247 | pytz==2021.3 248 | pytz-deprecation-shim==0.1.0.post0 249 | pywin32==302 250 | pywin32-ctypes==0.2.0 251 | pywinpty==1.1.6 252 | PyYAML==6.0 253 | pyzmq==22.3.0 254 | qtconsole==5.2.1 255 | QtPy==1.11.2 256 | regex==2021.10.8 257 | requests==2.28.2 258 | requests-oauthlib==1.3.0 259 | requests-toolbelt==0.10.1 260 | responses==0.14.0 261 | rich==13.3.1 262 | rsa==4.7.2 263 | s3transfer==0.5.0 264 | sacremoses==0.0.46 265 | sbvirtualdisplay==1.2.0 266 | schematics==2.1.1 267 | scikit-learn==1.0.2 268 | scipy==1.7.1 269 | segtok==1.5.10 270 | selenium==4.8.2 271 | seleniumbase==4.13.0 272 | semver==3.0.0 273 | Send2Trash==1.8.0 274 | sentence-transformers==2.2.2 275 | sentencepiece==0.1.95 276 | sentry-sdk==1.5.0 277 | seqeval==1.2.2 278 | sgmllib3k==1.0.0 279 | shortuuid==1.0.8 280 | simpletransformers==0.63.9 281 | six==1.15.0 282 | sklearn==0.0.post5 283 | smart-open==5.2.1 284 | smmap==5.0.0 285 | sniffio==1.3.0 286 | snowballstemmer==2.1.0 287 | sortedcontainers==2.4.0 288 | soupsieve==2.4 289 | spacy==2.1.9 290 | spacy-legacy==3.0.8 291 | Sphinx==4.2.0 292 | sphinxcontrib-applehelp==1.0.2 293 | sphinxcontrib-devhelp==1.0.2 294 | sphinxcontrib-htmlhelp==2.0.0 295 | sphinxcontrib-jsmath==1.0.1 296 | sphinxcontrib-qthelp==1.0.3 297 | sphinxcontrib-serializinghtml==1.1.5 298 | sqlitedict==1.7.0 299 | sqlparse==0.4.2 300 | srsly==1.0.5 301 | streamlit==1.20.0 302 | subprocess32==3.5.4 303 | tabcompleter==1.1.0 304 | tabulate==0.8.9 305 | tempora==4.1.2 306 | tenacity==8.2.2 307 | tensorboard==2.7.0 308 | tensorboard-data-server==0.6.1 309 | tensorboard-plugin-wit==1.8.0 310 | tensorboardX==2.4 311 | tensorflow-estimator==2.4.0 312 | tensorflow-gpu==2.4.1 313 | termcolor==1.1.0 314 | terminado==0.12.1 315 | testpath==0.5.0 316 | thinc==7.0.8 317 | threadpoolctl==3.0.0 318 | tokenizers==0.13.2 319 | toml==0.10.2 320 | tomli==2.0.1 321 | toolz==0.12.0 322 | torch==1.7.1+cu110 323 | torchaudio==0.7.2 324 | TorchCRF==1.1.0 325 | torchvision==0.8.2+cu110 326 | tornado==6.1 327 | tqdm==4.64.1 328 | traitlets==5.1.1 329 | transformers==4.27.4 330 | trio==0.22.0 331 | trio-websocket==0.9.2 332 | typer==0.4.0 333 | typing_extensions==4.7.1 334 | tzdata==2023.3 335 | tzlocal==4.3 336 | umap-learn==0.5.3 337 | Unidecode==1.3.2 338 | urllib3==1.26.14 339 | validators==0.20.0 340 | wandb==0.12.7 341 | wasabi==0.8.2 342 | watchdog==3.0.0 343 | wcwidth==0.2.5 344 | webdriver-manager==3.8.5 345 | webencodings==0.5.1 346 | websockets==10.4 347 | Werkzeug==2.0.2 348 | widgetsnbextension==3.5.2 349 | Wikipedia-API==0.5.4 350 | wincertstore==0.2 351 | word2number==1.1 352 | wrapt==1.12.1 353 | wsproto==1.2.0 354 | xxhash==2.0.2 355 | yarl==1.7.2 356 | yaspin==2.1.0 357 | zc.lockfile==2.0 358 | zipp==3.14.0 359 | zope.event==4.5.0 360 | zope.interface==5.4.0 361 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import json 2 | from table_splitting.split_table import * 3 | from table_representation.table_representer import TableRepresenter 4 | from table_representation.table2json import TableProcessor 5 | from GPT_models import models 6 | from GPT_models.follow_up_q import * 7 | 8 | 9 | def input_generation(splitting, table_representation) : 10 | ''' 11 | Generates input file 12 | 13 | Parameters 14 
| splitting : "split" or "non_split"
15 |     table_representation : "TSV" or "JSON"
16 | 
17 |     returns : input for model test
18 |     '''
19 | 
20 |     with open('Z:/NLP Project/table/code_upload/input_generation_script.json', 'r', encoding='utf-8') as file:
21 |         data = json.load(file)  # convert the JSON data into a Python object
22 | 
23 |     if splitting == "non_split" :
24 |         if table_representation == "JSON" :
25 |             input_path = data["input_generator"]["splitting_HTML"]["table_representation"]["JSON"]["input_path"]
26 |             output_path = data["input_generator"]["splitting_HTML"]["table_representation"]["JSON"]["output_path"]
27 | 
28 |             json_file_list = os.listdir(input_path)
29 | 
30 |             for i in json_file_list:
31 |                 a = i.split('.')[0]
32 |                 table_processor = TableProcessor(input_path + a + '.json')
33 |                 table_processor.convert_to_json(i, output_path)
34 | 
35 |         if table_representation == "TSV" :
36 |             input_path = data["input_generator"]["splitting_HTML"]["table_representation"]["TSV"]["input_path"]
37 |             output_path = data["input_generator"]["splitting_HTML"]["table_representation"]["TSV"]["output_path"]
38 | 
39 |             table_list = os.listdir(input_path)
40 |             table = TableRepresenter(input_path)
41 | 
42 |             for table_element in table_list:
43 |                 table.run(table_element, output_path)
44 | 
45 |     elif splitting == "split" :
46 |         input_json_path = data["input_generator"]["splitting_HTML"]["split"][0]["input_JSON_path"]
47 |         input_pickle_path = data["input_generator"]["splitting_HTML"]["split"][0]["input_pickle_path"]
48 |         input_HTML_path = data["input_generator"]["splitting_HTML"]["split"][0]["input_HTML_path"]
49 |         output_HTML_path = data["input_generator"]["splitting_HTML"]["split"][0]["output_HTML_path"]
50 | 
51 |         body_list = TablePaser(input_json_path, input_HTML_path, input_pickle_path)
52 |         body_list.run()
53 |         table_spliter = DivideHtml(input_HTML_path + '/example_tbl01.html', input_pickle_path + '/example_tbl01.pickle', output_HTML_path)
54 |         table_spliter.run()
55 | 
56 |         if table_representation == "JSON" :
57 |             input_path = data["input_generator"]["splitting_HTML"][1]["table_representation"]["JSON"]["input_path"]
58 |             output_path = data["input_generator"]["splitting_HTML"][1]["table_representation"]["JSON"]["output_path"]
59 | 
60 |             json_file_list = os.listdir(input_path)
61 | 
62 |             for i in json_file_list:
63 |                 a = i.split('.')[0]
64 |                 table_processor = TableProcessor(input_path + a + '.json')
65 |                 table_processor.convert_to_json(i, output_path)
66 | 
67 |         if table_representation == "TSV" :
68 |             input_path = data["input_generator"]["splitting_HTML"][1]["table_representation"]["TSV"]["input_path"]
69 |             output_path = data["input_generator"]["splitting_HTML"][1]["table_representation"]["TSV"]["output_path"]
70 | 
71 |             table_list = os.listdir(input_path)
72 |             table = TableRepresenter(input_path)
73 | 
74 |             for table_element in table_list:
75 |                 table.run(table_element, output_path)
76 | 
77 | 
78 | def model_test(model_, fq):
79 |     '''
80 |     Generates the prediction files in json format
81 | 
82 |     Parameters
83 |     model_ : "few_shot" or "zero_shot" or "fine_tuning"
84 |     fq : True or False
85 | 
86 |     returns : data extraction result
87 |     '''
88 |     with open('Z:/NLP Project/table/code_upload/model_script.json', 'r', encoding='utf-8') as file:
89 |         data = json.load(file)
90 | 
91 |     input_path = data["model"][model_]["input_path"]
92 |     output_path = data["model"][model_]["output_path"]
93 | 
94 |     if fq == False :
95 |         if model_ == "few_shot":
96 |             result = few_shot(input_path, output_path)
97 | 
98 |         elif model_ == "zero_shot":
99 |             result = 
zero_shot(input_path, output_path) 100 | 101 | elif model_ == "fine_tuning": 102 | result = fine_tuning(input_path, output_path) 103 | 104 | else: 105 | print("Unknown model type") 106 | 107 | elif fq == True : 108 | 109 | if model_ == "few_shot": 110 | few_shot(input_path, output_path ) 111 | assistant = FollowQ(output_path, input_path, output_path) 112 | assistant.run() 113 | elif model_ == "zero_shot": 114 | zero_shot(input_path, output_path) 115 | assistant = FollowQ(output_path, input_path, output_path) 116 | assistant.run() 117 | 118 | elif model_ == "fine_tuning": 119 | fine_tuning(input_path, output_path) 120 | assistant = FollowQ(output_path, input_path, output_path) 121 | assistant.run() 122 | 123 | -------------------------------------------------------------------------------- /table_representation/__pycache__/table2json_upload.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/table_representation/__pycache__/table2json_upload.cpython-37.pyc -------------------------------------------------------------------------------- /table_representation/__pycache__/table_representer_upload.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/table_representation/__pycache__/table_representer_upload.cpython-37.pyc -------------------------------------------------------------------------------- /table_representation/table2json.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import pandas as pd 3 | import json 4 | import os 5 | from bs4 import NavigableString 6 | from collections import OrderedDict 7 | 8 | 9 | class TableProcessor : 10 | def __init__(self, json_path) : 11 | self.json_path = json_path 12 | 13 | def load_table(self): 14 | ''' 15 | load the table in format 16 | ''' 17 | 18 | with open(self.json_path, 'r', encoding = 'utf-8') as file: 19 | data = json.load(file) 20 | 21 | title = data['title'] 22 | caption = data['caption'] 23 | table_tag = data["tag"] 24 | soup = BeautifulSoup(table_tag, 'html.parser') 25 | return soup, title, caption 26 | 27 | def caption_process(self): 28 | ''' 29 | finding caption and ref data in html table, 30 | giving caption data tag, 31 | giving reference data tag 32 | ''' 33 | 34 | table, _, _ = self.load_table() 35 | 36 | for tfoot in table.find_all('tfoot'): 37 | tfoot.decompose() 38 | 39 | td_elements = table.find_all('td') 40 | th_elements = table.find_all('th') 41 | 42 | for th in th_elements: 43 | link = th.find('a') 44 | if link: 45 | link_text = link.get_text() 46 | if len(link_text) == 1 and link_text.isalpha() or link_text == '*': 47 | link.string = "" + link_text + "" 48 | elif len(link_text) == 1 and link_text == '*': 49 | link.string = "" + link_text + "" 50 | else : 51 | link.string = "" + link_text + "" 52 | 53 | for td in td_elements: 54 | link = td.find_all('a') 55 | 56 | if len(link) == 1: 57 | linktext = link[0] 58 | link_text = linktext.get_text() 59 | if len(link_text) == 1 and link_text.isalpha(): 60 | link[0].string = "" + link_text + "" 61 | elif len(link_text) == 1 and link_text == '*': 62 | link[0].string = "" + link_text + "" 63 | else : 64 | link[0].string = "" + link_text + "" 65 | 66 | elif len(link) > 1 : 67 | link_string = [] 68 | for i in link : 69 | link_str = 
i.get_text()
70 |                     link_string.append(link_str)
71 |                 combined_string = ','.join(link_string)
72 |                 link[0].string = "" + combined_string + ""
73 |                 for j in range(1, len(link)) :
74 |                     link[j].string = ''
75 | 
76 |         return table
77 | 
78 |     def supb_process(self):
79 |         '''
80 |         finding sub and sup in html table,
81 |         giving an explicit <sub> tag,
82 |         giving an explicit <sup> tag
83 |         '''
84 | 
85 |         table = self.caption_process()
86 |         for i_tag in table.find_all('i'):
87 |             i_tag.unwrap()
88 |         td_elements = table.find_all('td')
89 |         th_elements = table.find_all('th')
90 |         for th in th_elements:
91 |             sup_ = th.find_all('sup')
92 |             sub_ = th.find_all('sub')
93 |             if sup_:
94 |                 for q in sup_:
95 |                     sup_text = q.get_text() if q.get_text() else ""
96 |                     q.string = "<sup>" + sup_text + "</sup>"
97 |             if sub_:
98 |                 for e in sub_ :
99 |                     sub_text = e.get_text() if e.get_text() else ""
100 |                     e.string = "<sub>" + sub_text + "</sub>"
101 |         for td in td_elements:
102 |             sup = td.find_all('sup')
103 |             sub = td.find_all('sub')
104 |             if sup:
105 |                 for b in sup :
106 |                     sup_text = b.get_text() if b.get_text() else ""
107 |                     b.string = "<sup>" + sup_text + "</sup>"
108 |             if sub:
109 |                 for a in sub:
110 |                     sub_text = a.get_text() if a.get_text() else ""
111 |                     a.string = "<sub>" + sub_text + "</sub>"
112 | 
113 |         return table
114 | 
115 |     def header_process(self):
116 |         '''
117 |         Filling empty cells in the header with '-'.
118 |         '''
119 | 
120 |         table = self.supb_process()
121 |         th_elements = table.find_all('th')
122 |         for th in th_elements :
123 |             if not th.text.strip() :
124 |                 th.insert(0, '-')
125 |                 th['align'] = 'left'
126 | 
127 |         return table
128 | 
129 |     def body_process(self):
130 |         '''
131 |         Copy the first cell of the previous row if the first cell is empty.
132 |         '''
133 | 
134 |         table = self.header_process()
135 |         has_empty_cells = False
136 |         prev_value = None
137 |         for row in table.find_all('tr'):
138 |             first_cell = row.find('td')
139 |             if first_cell:
140 |                 cell_text = first_cell.text.strip()
141 |                 if cell_text == '' and prev_value:
142 |                     first_cell.string = prev_value
143 |                 if cell_text == '':
144 |                     has_empty_cells = True
145 |                 prev_value = cell_text if cell_text else prev_value  # keep the last non-empty value so runs of empty cells are all filled
146 | 
147 |         return table
148 | 
149 |     def convert_to_dataframe(self):
150 |         '''
151 |         Convert the html table to a dataframe
152 |         '''
153 | 
154 |         table = self.body_process()
155 |         dfs = pd.read_html(str(table))
156 |         df_table = dfs[0]
157 |         df_table.fillna("NaN", inplace=True)
158 | 
159 |         return df_table
160 | 
161 |     def convert_to_json(self, table_name, save_directory):
162 |         '''
163 |         Convert dataframe to json. 
164 | ''' 165 | _, title, caption = self.load_table() 166 | table_name = table_name.split('.')[0] 167 | name_element = table_name.split('_') 168 | df_for_json = self.convert_to_dataframe() 169 | header_row = df_for_json.columns.nlevels 170 | df_for_json_key = list(df_for_json.columns) 171 | num_columns = df_for_json.shape[1] 172 | 173 | key_list = [] 174 | value_list = [] 175 | for i in range(0, num_columns): 176 | key_list.append(df_for_json_key[i]) 177 | value_list.append(df_for_json.iloc[:, i].tolist()) 178 | 179 | result = {} 180 | if header_row > 1: 181 | for i, keys in enumerate(key_list): 182 | current_dict = result 183 | for j, key in enumerate(keys): 184 | if key not in current_dict: 185 | current_dict[key] = {} 186 | if j == len(keys) - 1: 187 | current_dict[key] = value_list[i] 188 | current_dict = current_dict[key] 189 | elif header_row == 1 : 190 | for i, keys in enumerate(key_list): 191 | current_dict = result 192 | 193 | current_dict[keys] = value_list[i] 194 | 195 | # try : 196 | key_to_extract = caption 197 | title_to_extract = title 198 | key_to_extract = { 199 | 200 | "caption": key_to_extract 201 | } 202 | title_to_extract = { 203 | 204 | "Title": title_to_extract 205 | } 206 | 207 | result.update(key_to_extract) 208 | title_to_extract.update(result) 209 | save_directory_ = save_directory + '/' + table_name + '.json' 210 | 211 | with open(save_directory_, 'w', encoding='utf-8') as f: 212 | json.dump(title_to_extract, f, indent=4) 213 | 214 | 215 | # if __name__ == '__main__': 216 | # json_path = 'Z:/NLP Project/table/code_upload/data/split/table_split_json' 217 | # json_file_list = os.listdir(json_path) 218 | # save_directory = 'Z:/NLP Project/table/code_upload/data/split/json_representation' 219 | 220 | 221 | # for i in json_file_list: 222 | # a = i.split('.')[0] 223 | # table_processor = TableProcessor(json_path + a + '.json') 224 | # table_processor.convert_to_json(i, save_directory) 225 | 226 | 227 | -------------------------------------------------------------------------------- /table_representation/table_representer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from bs4 import BeautifulSoup 4 | import re 5 | 6 | class TableRepresenter: 7 | def __init__(self, table_path): 8 | self.table_path = table_path 9 | self.table_list = os.listdir(self.table_path) 10 | 11 | # Initialize cell representation strings 12 | self.merged_cell = '{}' 13 | self.both_merged_cell = '{}' 14 | self.cell = '{}\\t' 15 | self.line_breaking = '\\n' 16 | self.table_tag = '{}
' 17 | self.caption_tag = '{}' 18 | self.title_tag = '{}' 19 | 20 | def text_filter(self, out): 21 | """ 22 | Remove unnecessary text and HTML tags from the given string. 23 | """ 24 | out = re.sub('\\xa0', ' ', out) 25 | out = re.sub('\\u2005', ' ', out) 26 | out = re.sub('\\u2009', ' ', out) 27 | out = re.sub('\\u202f', ' ', out) 28 | out = re.sub('\\u200b', '', out) 29 | out = re.sub('', '', out) 30 | out = re.sub('', '', out) 31 | 32 | # Remove or replace specific patterns 33 | patterns = [ 34 | (r'(\(\d+\)|\d+|\[\d+\]|\d+\,\d+|\d+\,\d+\,\d+|\d+\,\d+\–\d+|\d+\D+|\(\d+\,\s*\d+\)|\(\d+\D+\))', r'\1'), 35 | (r'(\s*ref\.\s\d+.*?)', r'\1'), 36 | (r'\((\s*(ref\.\s\d+.*?)\s*)\)', r'\1'), 37 | (r'(\s*Ref\.\s\d+.*?)', r'\1'), 38 | (r'\((\s*(Ref\.\s\d+.*?)\s*)\)', r'\1'), 39 | (r'(\[\d+|\d+\])', r'\1'), 40 | (r'((.*?)et al\..*?)', r'\1'), 41 | (r'((.*?)Fig\..*?)', r'\1'), 42 | (r'(Song and Hu \(2014\))', r'\1'), 43 | (r'
', '',), 44 | (r'(mA\.cm)', r'\1'), 45 | (r'(https.*?)', r'\1'), 46 | (r'(\d+\.\d+\@\d+)', r'\1') 47 | ] 48 | 49 | for pattern, repl in patterns: 50 | out = re.sub(pattern, repl, out) 51 | 52 | return out 53 | 54 | def caption_process(self, caption): 55 | """ 56 | Process the caption text and extract key-value pairs. 57 | """ 58 | pattern = r'(\w+): (.*?)(?:;|$)' 59 | matches = re.findall(pattern, caption) 60 | result_dict = {key.strip(): value.strip() for key, value in matches} 61 | print(result_dict) 62 | 63 | def load_data(self, file_name): 64 | """ 65 | Load JSON data from the specified file. 66 | """ 67 | file_path = os.path.join(self.table_path, file_name) 68 | with open(file_path, 'r', encoding='utf-8-sig') as file: 69 | data = json.load(file) 70 | return data 71 | 72 | def process_table(self, t): 73 | """ 74 | Remove unnecessary HTML tags from the table element. 75 | """ 76 | tags_to_remove = ['img', 'em', 'i', 'p', 'span', 'strong', 'math', 'mi', 'br', 'script', 'svg', 'mrow', 'mo', 'mn', 'msub', 'msubsup', 'mtext', 'mjx-container', 'mjx-math', 'mjx-mrow', 'mjx-msub', 'mjx-mi', 'mjx-c', 'mjx-script', 'mjx-mspace', 'mjx-assistive-mml', 'mspace'] 77 | 78 | for tag in tags_to_remove: 79 | elements = t.find_all(tag) 80 | for element in elements: 81 | if tag in ['img', 'script', 'svg']: 82 | element.decompose() 83 | else: 84 | element.unwrap() 85 | 86 | return t 87 | 88 | def make_table_representer(self, table_representer, table_element, head=None): 89 | """ 90 | Create a table representation with the appropriate formatting. 91 | """ 92 | out = [['' for _ in range(self.width)] for _ in range(self.height if head is None else len(table_element.find_all('tr')))] 93 | 94 | i = 0 95 | for tr in table_element.find_all('tr'): 96 | j = 0 97 | for t in tr.find_all(re.compile('(? and tags from the table data. 
163 |         """
164 |         result = [[item.replace('<sup>', '').replace('</sup>', '') for item in inner_list] for inner_list in data]
165 |         result = [[item.replace('<sub>', '').replace('</sub>', '') for item in inner_list] for inner_list in result]
166 |         return result
167 | 
168 |     def run(self, table, save_directory):
169 |         cap_table_list = []
170 | 
171 |         final_table_representer = {}
172 |         print(table)
173 | 
174 |         data = self.load_data(table)
175 |         table_tag = data["tag"]
176 |         soup = BeautifulSoup(table_tag, 'html.parser')
177 |         thead = soup.find('thead')
178 |         tbody = soup.find('tbody')
179 | 
180 |         self.width = sum(int(t.get('colspan', 1)) for t in soup.find('tbody').find('tr').find_all(re.compile('(?\1', rows[0])
213 |                 table_list[i] = rows
214 | 
215 |         result = ''
216 | 
217 |         for table_row in table_list:
218 |             for element in table_row:
219 |                 if element == '::':
220 |                     pass
221 |                 else:
222 |                     result += self.cell.format(element)
223 |             result += self.line_breaking
224 | 
225 |         final_result = self.table_tag.format(result)
226 | 
227 |         caption = data['caption']
228 |         title = data['title']
229 | 
230 |         final_result = self.title_tag.format(title) + final_result
231 | 
232 |         for table_row in table_list:
233 |             for element in table_row:
234 |                 if "" in element:
235 |                     cap_table_list.append(table)
236 | 
237 |         cap_table_list = list(set(cap_table_list))
238 | 
239 |         if caption:
240 | 
241 |             final_result += '\n'
242 |             if isinstance(caption, dict):
243 |                 caption_str = ', '.join([f"{key}: {value}" for key, value in caption.items()])
244 |                 final_result += self.caption_tag.format(caption_str)
245 |             else:
246 |                 final_result += self.caption_tag.format(caption)
247 | 
248 |         save_path = os.path.join(save_directory, table[:-5]+'.txt')
249 |         with open(save_path, 'a', encoding='utf-8-sig') as f:
250 |             f.write(final_result)
251 | 
252 | # if __name__ == "__main__":
253 | #     table_path = 'example_json folder path'
254 | #     save_directory = 'Z:/NLP Project/table/code_upload/data/split/tsv_representation'
255 | #     table_path = 'Z:/NLP Project/table/code_upload/data/split/table_split_json/'
256 | #     table_list = os.listdir(table_path)
257 | #     table = TableRepresenter(table_path)
258 | 
259 | #     for table_element in table_list:
260 | #         print(table_element)
261 | #         table.run(table_element, save_directory)
262 | 
--------------------------------------------------------------------------------
/table_splitting/__pycache__/split_table_.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KIST-CSRC/MaTableGPT/65968fe63babcf2215b4a97307eb753162161953/table_splitting/__pycache__/split_table_.cpython-37.pyc
--------------------------------------------------------------------------------
/table_splitting/split_table.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import json
3 | import os
4 | from bs4 import NavigableString
5 | import copy
6 | import pickle
7 | import nltk
8 | import re
9 | 
10 | class TablePaser:
11 |     def __init__(self, json_path, table_path, pickle_path):
12 |         self.json_path = json_path
13 |         self.table_path = table_path
14 |         self.pickle_path = pickle_path
15 |         self.table_list = os.listdir(self.table_path)
16 |         self.merged_cell = '{}'
17 |         self.both_merged_cell = '{}'
18 |         self.cell = '{}\\t'
19 |         self.line_breaking = '\\n'
20 |         self.table_tag = '{}
' 21 | self.caption_tag = '{}' 22 | self.title_tag = '{}' 23 | 24 | def text_filter(self, out): 25 | """ 26 | Remove unnecessary text and HTML tags from the given string. 27 | """ 28 | out = re.sub('\\xa0', ' ', out) 29 | out = re.sub('\\u2005', ' ', out) 30 | out = re.sub('\\u2009', ' ', out) 31 | out = re.sub('\\u202f', ' ', out) 32 | out = re.sub('\\u200b', '', out) 33 | out = re.sub('', '', out) 34 | out = re.sub('', '', out) 35 | out = re.sub(r'(\(\d+\)|\d+|\[\d+\]|\d+\,\d+|\d+\,\d+\,\d+|\d+\,\d+\–\d+|\d+\D+|\(\d+\,\s*\d+\)|\(\d+\D+\))', r'\1', out) 36 | out = re.sub(r'(\s*ref\.\s\d+.*?)', r'\1', out) 37 | out = re.sub(r'\((\s*(ref\.\s\d+.*?)\s*)\)', r'\1', out) 38 | out = re.sub(r'(\s*Ref\.\s\d+.*?)', r'\1', out) 39 | out = re.sub(r'\((\s*(Ref\.\s\d+.*?)\s*)\)', r'\1', out) 40 | out = re.sub(r'(\[\d+|\d+\])', r'\1', out) 41 | out = re.sub(r'((.*?)et al\..*?)', r'\1', out) 42 | out = re.sub(r'((.*?)Fig\..*?)', r'\1', out) 43 | out = re.sub(r'(Song and Hu \(2014\))', r'\1', out) 44 | out = re.sub(r'
', '', out)
45 |         out = re.sub(r'(mA\.cm)', r'\1', out)
46 |         out = re.sub(r'(https.*?)', r'\1', out)
47 |         out = re.sub(r'(\d+\.\d+\@\d+)', r'\1', out)
48 |         out = re.sub(r'\[(\d+)\]','['+r'\1'+']', out)
49 |         return out
50 | 
51 |     def metadata(self, file_name):
52 |         file_name_parts = file_name.split('.')[0]
53 |         json_file_name = file_name_parts + '.json'
54 |         file_path = os.path.join(self.json_path, json_file_name)
55 | 
56 |         with open(file_path, 'r', encoding='utf-8') as file:
57 |             metadata = json.load(file)
58 | 
59 |         return file_name, metadata
60 | 
61 |     def process_table(self, t):
62 |         """
63 |         Remove unnecessary HTML tags from the table element.
64 |         """
65 |         tags_to_remove = ['img', 'em', 'i', 'p', 'span', 'strong', 'math', 'mi', 'script', 'svg', 'mrow', 'mo', 'mn', 'br', 'msub', 'msubsup', 'mtext', 'mjx-container', 'mjx-math', 'mjx-mrow', 'mjx-msub', 'mjx-mi', 'mjx-c', 'mjx-script', 'mjx-mspace', 'mjx-assistive-mml', 'mspace']
66 |         for tag in tags_to_remove:
67 |             for element in t.find_all(tag):
68 |                 # media and script elements are dropped entirely; formatting tags are unwrapped
69 |                 if tag in ['img', 'script', 'svg']:
70 |                     element.decompose()
71 |                 else:
72 |                     element.unwrap()
73 |         return t
74 | 
75 |     def make_table_representer(self, table_representer, table_element, head=None):
76 |         """
77 |         Create a table representation with the appropriate formatting.
78 |         """
79 |         out = []
80 |         if head == True:
81 |             head_height = len(table_element.find_all('tr'))
82 |             for i in range(head_height):
83 |                 out.append([])
84 |                 for j in range(self.width):
85 |                     out[i].append('')
86 |         elif head == False:
87 |             body_height = len(table_element.find_all('tr'))
88 |             for i in range(body_height):
89 |                 out.append([])
90 |                 for j in range(self.width):
91 |                     out[i].append('')
92 |         else:
93 |             for i in range(self.height):
94 |                 out.append([])
95 |                 for j in range(self.width):
96 |                     out[i].append('')
97 |         i = 0
98 | 
99 |         for tr in table_element.find_all('tr'):
100 |             j = 0
101 | 
102 |             for t in tr.find_all(re.compile('(? and tags from the table data. 
185 |         """
186 |         result = [[item.replace('<sup>', '').replace('</sup>', '') for item in inner_list] for inner_list in data]
187 |         result = [[item.replace('<sub>', '').replace('</sub>', '') for item in inner_list] for inner_list in result]
188 |         return result
189 | 
190 |     def run(self):
191 |         cap_table_list = []
192 |         cap_table_dict = {}
193 |         for table in self.table_list:
194 |             final_table_representer = {}
195 | 
196 |             file_name, metadata = self.metadata(table)
197 |             table_title = metadata.get('title', '')
198 |             table_caption = metadata.get('caption', '')
199 |             table_tag = metadata.get('tag', '')
200 |             soup = BeautifulSoup(table_tag, 'html.parser')
201 |             thead = soup.find('thead') if soup.find('thead') else None
202 |             tbody = soup.find('tbody') if soup.find('tbody') else None
203 | 
204 |             if len(soup.find_all()) == 0:
205 |                 continue
206 |             self.width = 0
207 |             for t in soup.find('tbody').find('tr').find_all(re.compile('(?1 :
307 |                     row_h_b_list.append('head')
308 | 
309 |                 else : pass
310 | 
311 |             else :
312 |                 td_list = []
313 |                 for td in row :
314 |                     if td.lower() == 'empty cell' :
315 |                         td = td.replace('empty cell', ' ')
316 | 
317 |                     if td == '-' :
318 |                         td = td.replace('-', '0')
319 |                     if td == '—' :
320 |                         td = td.replace('—', '0')
321 |                     if td == '–' :
322 |                         td = td.replace('–', '0')
323 |                     if td == '--' :
324 |                         td = td.replace('--', '0')
325 |                     if td == '---' :
326 |                         td = td.replace('---', '0')
327 |                     if td == '----' :
328 |                         td = td.replace('----', '0')
329 |                     if td == '' :
330 |                         td = td.replace('', '0')
331 |                     if td == ' ' :
332 |                         td = td.replace(' ', '0')
333 |                     if 'work' in td :
334 |                         td = '0 ' + td
335 |                     if 'et al' in td :
336 |                         td = '0 ' + td
337 | 
338 |                     tokens = nltk.word_tokenize(td)
339 |                     # str.strip() returns a new string, so collect the stripped tokens
340 |                     tokens = [t.strip() for t in tokens]
341 | 
342 |                     if 'ref' in tokens :
343 |                         tokens.remove('ref')
344 |                     if 'Ref' in tokens :
345 |                         tokens.remove('Ref')
346 |                     if '' in tokens :
347 |                         tokens.remove('')
348 |                     if '' in tokens :
349 |                         tokens.remove('')
350 |                     if 'mV' in tokens :
351 |                         tokens.remove('mV')
352 |                     if 'V' in tokens :
353 |                         tokens.remove('V')
354 |                     if '%' in tokens :
355 |                         tokens.remove('%')
356 |                     if 'sup' in tokens :
357 |                         tokens.remove('sup')
358 |                     if '/sup' in tokens :
359 |                         tokens.remove('/sup')
360 |                     if 'sub' in tokens :
361 |                         tokens.remove('sub')
362 |                     if '/sub' in tokens :
363 |                         tokens.remove('/sub')
364 | 
365 |                     cleaned_tokens = []
366 |                     final_tokens = []
367 | 
368 |                     for item in tokens:
369 |                         cleaned_item = ''.join(e for e in item if e.isalnum() or e.isspace())
370 |                         cleaned_tokens.append(cleaned_item)
371 | 
372 |                     for i in cleaned_tokens :
373 |                         if i != '' :
374 |                             final_tokens.append(i)
375 | 
376 |                     if final_tokens == [] :
377 |                         pass
378 | 
379 |                     else :
380 |                         try :
381 |                             token_string = final_tokens[0]
382 |                             token_lll = token_string.split()
383 | 
384 |                             float(token_lll[0])
385 |                             td_list.append('b')
386 | 
387 |                         except :
388 |                             td_list.append('h')
389 | 
390 |                 if 'b' in td_list :
391 |                     row_h_b_list.append('body')
392 | 
393 |                 else :
394 |                     row_h_b_list.append('head')
395 |         row_h_b_list[-1] = 'body'
396 |         return row_h_b_list
397 | 
398 |     def split_list_by_indexes(self, body, index_list):
399 |         '''
400 |         Split the list by the header in the double-header to create N lists. 
401 |         '''
402 |         result = []
403 |         start = 0
404 |         for index in index_list:
405 |             result.append(body[start:index])
406 |             start = index
407 |         result.append(body[start:])
408 |         return result
409 | 
410 |     def case_1(self, origin_head, origin_body, body_h_b_list) :
411 |         '''
412 |         When the header comes at the top of the body
413 |         '''
414 |         head_index = []
415 |         body_index = []
416 |         for i, decision in enumerate(body_h_b_list) :
417 |             if decision == 'head' :
418 |                 head_index.append(i)
419 |             else :
420 |                 body_index.append(i)
421 | 
422 |         head = []
423 |         body = []
424 | 
425 |         for h in head_index :
426 |             head.append(origin_body[h])
427 |         for b in body_index :
428 |             body.append(origin_body[b])
429 | 
430 |         for i, row in enumerate(head) :
431 |             formatted_list = [f'{item}' for item in row]
432 |             result_head = ' '.join(formatted_list)
433 |             result_head = ' ' + result_head + ' '
434 | 
435 |         if origin_head is None :
436 |             modified_head = '' + result_head + ''
437 | 
438 |         else :
439 |             origin_head = str(origin_head)
440 |             modified_head = origin_head.replace('', '')
441 |             modified_head = modified_head + result_head + ''
442 | 
443 |         for i, row in enumerate(body) :  # convert the body rows into HTML format
444 |             formatted_list = [f'{item}' for item in row]
445 |             result_body = ' '.join(formatted_list)
446 |             result_body = ' ' + result_body + ' '
447 |             final_body = '' + result_body + ''  # attach a tbody to each row
448 | 
449 |             padded_index = str(i + 1).zfill(2)
450 | 
451 |             html_string = f"
{str(modified_head)}{str(final_body)}
" 452 | with open(self.save_path + f"/{file_name}_{padded_index}.html", "w", encoding="utf-8") as file: 453 | file.write(html_string) 454 | 455 | 456 | def case_2(self, origin_head, origin_body, body_h_b_list) : 457 | ''' 458 | When the structure of the header repeats within the body 459 | ''' 460 | table_seperate_index = [] 461 | 462 | for i, hb in enumerate(body_h_b_list) : 463 | if hb == 'head' : 464 | table_seperate_index.append(i) 465 | 466 | table_seperate_index_result = [] 467 | current_sequence = [] 468 | 469 | for num in table_seperate_index: 470 | if not current_sequence or num == current_sequence[-1] + 1: 471 | current_sequence.append(num) 472 | else: 473 | table_seperate_index_result.append(current_sequence[0]) 474 | current_sequence = [num] 475 | 476 | if current_sequence: 477 | table_seperate_index_result.append(current_sequence[0]) 478 | 479 | table_split = self.split_list_by_indexes(origin_body, table_seperate_index_result) 480 | index_split = self.split_list_by_indexes(body_h_b_list, table_seperate_index_result) 481 | total_table = [] 482 | for table_num in range(0, len(index_split)) : 483 | 484 | if 'head' not in index_split[table_num] : 485 | body_string_list = [] 486 | for row_ in table_split[table_num] : 487 | 488 | for i, row in enumerate(row_) : # head html 형식으로 변환 489 | formatted_list = [f'{item}' for item in row_] 490 | result_ = ' '.join(formatted_list) 491 | result_ = ' ' + result_+ ' ' 492 | body_string_list.append(result_) 493 | 494 | origin_head = str(origin_head) 495 | for table_ in body_string_list : 496 | table_ = origin_head + table_ 497 | total_table.append(table_) 498 | else : 499 | table_head = [] 500 | table_body = [] 501 | 502 | for row_index in range(0, len(table_split[table_num])) : 503 | 504 | if index_split[table_num][row_index] == 'head' : 505 | 506 | formatted_list = [f'{item}' for item in table_split[table_num][row_index]] 507 | result_ = ' '.join(formatted_list) 508 | result_ = ' ' + result_ + ' ' 509 | table_head.append(result_) 510 | 511 | else : 512 | formatted_list = [f'{item}' for item in table_split[table_num][row_index]] 513 | result_ = ' '.join(formatted_list) 514 | result_ = ' ' + result_ + ' ' 515 | table_body.append(result_) 516 | 517 | table_head = ''.join(table_head) 518 | for body_row in table_body : 519 | body_row = '' + body_row + '' 520 | f_table = '' + table_head + '' + body_row 521 | total_table.append(f_table) 522 | 523 | for i, row in enumerate(total_table) : 524 | 525 | padded_index = str(i + 1).zfill(2) 526 | 527 | html_string = f"
{str(row)}
" 528 | with open(self.save_path + f"/{file_name}_{padded_index}.html", "w", encoding="utf-8") as file: 529 | file.write(html_string) 530 | 531 | def case_3(self, origin_head, origin_body, body_h_b_list) : 532 | ''' 533 | When the common header comes in the header and the sub-header comes in the body 534 | ''' 535 | table_seperate_index = [] 536 | 537 | for i, hb in enumerate(body_h_b_list) : 538 | if hb == 'head' : 539 | table_seperate_index.append(i) 540 | 541 | table_seperate_index_result = [] 542 | current_sequence = [] 543 | 544 | for num in table_seperate_index: 545 | if not current_sequence or num == current_sequence[-1] + 1: 546 | current_sequence.append(num) 547 | else: 548 | table_seperate_index_result.append(current_sequence[0]) 549 | current_sequence = [num] 550 | 551 | if current_sequence: 552 | table_seperate_index_result.append(current_sequence[0]) 553 | 554 | table_seperate_index_result.pop(0) 555 | table_split = self.split_list_by_indexes(origin_body, table_seperate_index_result) 556 | index_split = self.split_list_by_indexes(body_h_b_list, table_seperate_index_result) 557 | 558 | table_final = [] 559 | for split_index in range(0, len(table_split)) : 560 | table_ =[] 561 | for row_index in range(0, len(table_split[split_index])) : 562 | table_head = [] 563 | table_body = [] 564 | 565 | if index_split[split_index][row_index] == 'head' : 566 | 567 | formatted_list = [f'{item}' for item in table_split[split_index][row_index]] 568 | result_ = ' '.join(formatted_list) 569 | result_ = ' ' + result_ + ' ' 570 | table_head.append(result_) 571 | 572 | else : 573 | formatted_list = [f'{item}' for item in table_split[split_index][row_index]] 574 | result_ = ' '.join(formatted_list) 575 | result_ = ' ' + result_ + ' ' 576 | table_body.append(result_) 577 | 578 | if table_head != [] : 579 | table_.append(table_head) 580 | else : 581 | table_.append(table_body) 582 | 583 | head_index = list(filter(lambda x: index_split[split_index][x] == 'head', range(len([split_index])))) 584 | body_index = [] 585 | for i in range(0, len(index_split[split_index])) : 586 | if i not in head_index : 587 | body_index.append(i) 588 | 589 | head_string = '' 590 | body_string_list = [] 591 | for head in head_index : 592 | head_string += table_[head][0] 593 | 594 | if origin_head == None : 595 | head_string = '' + str(head_string) + '' 596 | else : 597 | origin_head = str(origin_head) 598 | origin_head = origin_head.replace("", " ") 599 | head_string = origin_head + head_string + '' 600 | 601 | for body in body_index : 602 | body_string_list.append('' + table_[body][0] + '') 603 | 604 | for fi in body_string_list : 605 | table_final.append(head_string + fi) 606 | 607 | 608 | for i, row in enumerate(table_final) : 609 | padded_index = str(i + 1).zfill(2) 610 | 611 | html_string = f"
{str(row)}
" 612 | with open(self.save_path + f"/{file_name}_{padded_index}.html", "w", encoding="utf-8") as file: 613 | file.write(html_string) 614 | 615 | 616 | def normal(self, origin_head, origin_body) : 617 | ''' 618 | Normal table 619 | ''' 620 | table_final = [] 621 | f_body = [] 622 | origin_head = str(origin_head) 623 | for row in origin_body : 624 | formatted_list = [f'{item}' for item in row] 625 | result_ = ' '.join(formatted_list) 626 | result_ = ' ' + result_ + ' ' 627 | f_body.append(result_) 628 | 629 | for f in f_body : 630 | f = origin_head + f 631 | table_final.append(f) 632 | 633 | 634 | for i, row in enumerate(table_final) : 635 | padded_index = str(i + 1).zfill(2) 636 | 637 | html_string = f"
{str(row)}
" 638 | with open(self.save_path + f"/{file_name}_{padded_index}.html", "w", encoding="utf-8") as file: 639 | file.write(html_string) 640 | 641 | def run(self) : 642 | ''' 643 | Determining the type of table, 644 | then splitting it according to the rule. 645 | ''' 646 | f_name = self.html_path.split('/')[-1] 647 | print(f_name) 648 | 649 | origin_body = self.load_pickle() 650 | origin_head = self.head_tag() 651 | row_h_b_list = self.head_body_decision_making() 652 | print(origin_body) 653 | if all(element == 'body' for element in row_h_b_list) : 654 | ######### normal ######### 655 | self.normal(origin_head, origin_body) 656 | 657 | elif row_h_b_list.count('head') > 6 : 658 | print('outlier') 659 | 660 | else : 661 | if row_h_b_list[0] == 'head' : 662 | last_head = len(row_h_b_list) - row_h_b_list[::-1].index('head') - 1 663 | 664 | if 'body' not in row_h_b_list : 665 | print('outlier') 666 | 667 | else : 668 | if last_head == row_h_b_list.count('head') - 1: 669 | ######### case 1 ######### 670 | self.case_1(origin_head, origin_body, row_h_b_list) 671 | 672 | else : 673 | ######### case 2 ######### 674 | if origin_head == None : 675 | self.case_2(origin_head, origin_body, row_h_b_list) 676 | ######### case 3 ######### 677 | else : 678 | self.case_3(origin_head, origin_body, row_h_b_list) 679 | else : 680 | ######### case 2 ######### 681 | self.case_2(origin_head, origin_body, row_h_b_list) 682 | --------------------------------------------------------------------------------