├── README.md ├── cli ├── __init__.py ├── downloader.py ├── parser.py ├── pipeline.py └── reader.py ├── config ├── LANGS_322.tsv ├── TAGS_HTML_HEADERS.tsv ├── __init__.py └── config.py ├── core ├── __init__.py ├── downloader.py ├── parse_wikitable_html.py ├── utils │ ├── __init__.py │ └── io_worker.py └── wikitable_to_image.py ├── data └── dump │ └── crwiki-NS0-20220301-ENTERPRISE-HTML.json.tar.gz ├── requirements.txt ├── run.py └── wtabhtml.py /README.md: -------------------------------------------------------------------------------- 1 | # WTabHTML: HTML Wikitables extractor 2 | 3 | ### Input: 4 | - Wikipedia HTML dump 5 | - Language 6 | 7 | ### Output: 8 | File format: JSON list. Each line is a json object of 9 | ``` 10 | { 11 | title: wikipedia title 12 | wikidata: wikidata ID 13 | url: the url that link to Wikipedia page 14 | index: the index of table in the Wikipedia page 15 | html: html content of table 16 | caption: table caption 17 | aspects: (Hierachy sections of Wikipedia) 18 | } 19 | ``` 20 | 21 | ### Usage: 22 | #### Download, Extract, and dump wikitables in CR language 23 | ```shell 24 | python wtabhtml.py dump -l cr 25 | ``` 26 | 27 | #### Download, Extract, dump wikitables, and generate table images in CR language 28 | 29 | ```shell 30 | python wtabhtml.py gen-images -l cr -n 3 31 | ``` 32 | Note: User can download our [preprocessed dumps](https://drive.google.com/drive/folders/1wU5zdHcb3egxpwyluZCqVBIZnSanUwqN?usp=sharing) then, copy all {LANGUAGE}.jsonl.bz2 (the wikitables dump in PubTabNet format) to `wtabhtml/data/models/wikitables_html_pubtabnet` to generate photo images faster. 33 | 34 | 35 | If user want to re-run all pipeline, the tool will download Wikipedia HTML dump, extract wikitables, and dump it to `wtabhtml/data/models/wikitables_html_pubtabnet\{LANGUAGE}.jsonl.bz2` file as the following pipeline. 36 | 37 | #### Pipeline of Wikitable processing in cr language 38 | ```shell 39 | # Download dump 40 | python wtabhtml.py download -l cr 41 | # Parse dump and save json file 42 | python wtabhtml.py parse -l cr 43 | # Read dump 44 | python wtabhtml.py read -l 1 -i ./data/models/cr.jsonl.bz2 45 | # Generate images 46 | python wtabhtml.py gen-images -l cr -n 3 47 | ``` 48 | 49 | ### Contact 50 | Phuc Nguyen (`phucnt@nii.ac.jp`) 51 | -------------------------------------------------------------------------------- /cli/__init__.py: -------------------------------------------------------------------------------- 1 | import click 2 | from cli import downloader, parser, reader, pipeline 3 | 4 | cli_wikitables = click.CommandCollection( 5 | sources=[ 6 | parser.cli_parser, 7 | reader.cli_reader, 8 | downloader.cli_downloader, 9 | pipeline.cli_pipeline, 10 | ] 11 | ) 12 | -------------------------------------------------------------------------------- /cli/downloader.py: -------------------------------------------------------------------------------- 1 | import click 2 | from config import config as cf 3 | from core.downloader import download_wikipedia_html_dump 4 | 5 | 6 | @click.group() 7 | def cli_downloader(): 8 | pass 9 | 10 | 11 | @cli_downloader.command() 12 | @click.option( 13 | "-p", 14 | "--wikipedia_version", 15 | default=cf.DUMPS_VERSION_WP_HTML, 16 | show_default=True, 17 | help="Version of Wikipedia HTML dump. Find at https://dumps.wikimedia.org/other/enterprise_html/runs/", 18 | ) 19 | @click.option( 20 | "-l", 21 | "--language", 22 | default="ja", 23 | show_default=True, 24 | help="Download the Wikipedia dump of language edition", 25 | ) 26 | def download(wikipedia_version, language): 27 | download_wikipedia_html_dump(wikipedia_version, language) 28 | -------------------------------------------------------------------------------- /cli/parser.py: -------------------------------------------------------------------------------- 1 | import click 2 | from config import config as cf 3 | from core import parse_wikitable_html 4 | 5 | 6 | @click.group() 7 | def cli_parser(): 8 | pass 9 | 10 | 11 | @cli_parser.command() 12 | @click.option( 13 | "-l", 14 | "--language", 15 | default="ja", 16 | show_default=True, 17 | help="Parse dump in the langauge", 18 | ) 19 | @click.option( 20 | "-f", 21 | "--downloaded_file", 22 | default=None, 23 | show_default=True, 24 | help="Directory of the downloaded file (Wikipedia HTML dump)", 25 | ) 26 | @click.option( 27 | "-t", "--limit_table", default=0, show_default=True, help="Save # number of tables", 28 | ) 29 | def parse(language, downloaded_file, limit_table): 30 | parse_wikitable_html.dump_wikitables( 31 | lang=language, input_file=downloaded_file, limit=limit_table 32 | ) 33 | -------------------------------------------------------------------------------- /cli/pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | from contextlib import closing 3 | from multiprocessing import Pool 4 | 5 | import click 6 | from config import config as cf 7 | from core import parse_wikitable_html, downloader 8 | from core.utils import io_worker as iw 9 | from core import wikitable_to_image 10 | 11 | 12 | @click.group() 13 | def cli_pipeline(): 14 | pass 15 | 16 | 17 | def pool_run_dump(args): 18 | wikipedia_version, language = args 19 | downloaded_file = downloader.download_wikipedia_html_dump( 20 | wikipedia_version, language 21 | ) 22 | if not downloaded_file: 23 | return None 24 | dump_file = parse_wikitable_html.dump_wikitables( 25 | lang=language, input_file=downloaded_file, progress=True 26 | ) 27 | return dump_file 28 | 29 | 30 | def run_dump(wikipedia_version, language, n_threads): 31 | if language != "all": 32 | languages = [language] 33 | else: 34 | languages = cf.LANGS 35 | 36 | args = [[wikipedia_version, l] for l in reversed(languages)] 37 | 38 | with closing(Pool(processes=n_threads)) as p: 39 | for i, dump_file in enumerate(p.imap_unordered(pool_run_dump, args)): 40 | if not dump_file: 41 | continue 42 | dump_size = iw.get_size_of_file(os.path.getsize(dump_file)) 43 | print(f"{i + 1}. Dump {language} Saved: {dump_size} - {dump_file}: ") 44 | 45 | 46 | @cli_pipeline.command() 47 | @click.option( 48 | "-p", 49 | "--wikipedia_version", 50 | default=cf.DUMPS_VERSION_WP_HTML, 51 | show_default=True, 52 | help="Version of Wikipedia HTML dump. Find at https://dumps.wikimedia.org/other/enterprise_html/runs/", 53 | ) 54 | @click.option( 55 | "-l", 56 | "--language", 57 | default="all", 58 | show_default=True, 59 | help="Parse the Wikipedia dump of language edition", 60 | ) 61 | @click.option( 62 | "-n", "--n_threads", default=1, show_default=True, help="Run n multiprocessors", 63 | ) 64 | def dump_json(wikipedia_version, language, n_threads): 65 | run_dump(wikipedia_version, language, n_threads) 66 | 67 | 68 | @cli_pipeline.command() 69 | @click.option( 70 | "-p", 71 | "--wikipedia_version", 72 | default=cf.DUMPS_VERSION_WP_HTML, 73 | show_default=True, 74 | help="Version of Wikipedia HTML dump. Find at https://dumps.wikimedia.org/other/enterprise_html/runs/", 75 | ) 76 | @click.option( 77 | "-l", 78 | "--language", 79 | default="all", 80 | show_default=True, 81 | help="Parse the Wikipedia dump of language edition", 82 | ) 83 | @click.option( 84 | "-n", "--n_threads", default=1, show_default=True, help="Run n multiprocessors", 85 | ) 86 | @click.option( 87 | "-c", 88 | "--compress", 89 | default=False, 90 | show_default=True, 91 | help="Compress the output dataset or not", 92 | ) 93 | @click.option( 94 | "-d", 95 | "--delete_org", 96 | default=False, 97 | show_default=True, 98 | help="Delete the original folder after compressing", 99 | ) 100 | def gen_images(wikipedia_version, language, n_threads, compress, delete_org): 101 | if language != "all": 102 | languages = [language] 103 | else: 104 | languages = cf.LANGS 105 | 106 | iw.print_status(f"No\tLang\tImages\tErrors\tRunTime") 107 | for i, language in enumerate(reversed(languages)): 108 | n_errors, n_images, run_time = wikitable_to_image.gen_images( 109 | wikipedia_version=wikipedia_version, 110 | lang=language, 111 | n_threads=n_threads, 112 | compress=compress, 113 | delete_org=delete_org, 114 | ) 115 | iw.print_status( 116 | f"{i + 1}\t{language}\t{n_images:,}\t{n_errors:,}\t{run_time:.2f}" 117 | ) 118 | 119 | 120 | if __name__ == "__main__": 121 | run_dump(cf.DUMPS_VERSION_WP_HTML, "all", 1) 122 | -------------------------------------------------------------------------------- /cli/reader.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from core import parse_wikitable_html 4 | 5 | 6 | @click.group() 7 | def cli_reader(): 8 | pass 9 | 10 | 11 | @cli_reader.command() 12 | @click.option( 13 | "--input_file", "-i", help="Read the JSON dump of Wikipedia tables", 14 | ) 15 | @click.option( 16 | "--limit", "-l", default=0, help="Return first limit tables", 17 | ) 18 | def read(input_file, limit): 19 | parse_wikitable_html.read_wikitable_dumps(input_file, limit) 20 | 21 | 22 | @cli_reader.command() 23 | @click.option( 24 | "--input_file", "-i", help="Read the JSON dump of Wikipedia tables", 25 | ) 26 | def size(input_file): 27 | print(parse_wikitable_html.get_jsonl_size(input_file)) 28 | 29 | 30 | @cli_reader.command() 31 | @click.option( 32 | "--input_folder", "-i", help="The folder of Wikitable JSON dumps", 33 | ) 34 | def stats(input_folder): 35 | parse_wikitable_html.read_wikitable_dumps(input_folder) 36 | -------------------------------------------------------------------------------- /config/LANGS_322.tsv: -------------------------------------------------------------------------------- 1 | en 2 | ceb 3 | de 4 | sv 5 | fr 6 | nl 7 | ru 8 | es 9 | it 10 | arz 11 | pl 12 | ja 13 | vi 14 | war 15 | zh 16 | ar 17 | uk 18 | pt 19 | fa 20 | ca 21 | sr 22 | id 23 | ko 24 | no 25 | fi 26 | hu 27 | cs 28 | tr 29 | ce 30 | sh 31 | zh-min-nan 32 | ro 33 | tt 34 | eu 35 | ms 36 | eo 37 | he 38 | hy 39 | bg 40 | da 41 | azb 42 | sk 43 | kk 44 | et 45 | min 46 | be 47 | hr 48 | el 49 | simple 50 | lt 51 | az 52 | gl 53 | sl 54 | ur 55 | nn 56 | ka 57 | hi 58 | th 59 | ta 60 | uz 61 | la 62 | cy 63 | ast 64 | vo 65 | mk 66 | zh-yue 67 | bn 68 | lv 69 | tg 70 | my 71 | af 72 | mg 73 | bs 74 | oc 75 | sq 76 | mr 77 | nds 78 | ky 79 | ml 80 | be-tarask 81 | te 82 | new 83 | br 84 | sw 85 | vec 86 | jv 87 | pms 88 | pnb 89 | ht 90 | su 91 | lb 92 | ba 93 | ga 94 | szl 95 | is 96 | lmo 97 | ku 98 | cv 99 | fy 100 | tl 101 | wuu 102 | an 103 | ckb 104 | sco 105 | diq 106 | pa 107 | yo 108 | ne 109 | bar 110 | io 111 | gu 112 | als 113 | kn 114 | scn 115 | bpy 116 | ia 117 | qu 118 | mn 119 | avk 120 | nv 121 | xmf 122 | si 123 | crh 124 | bat-smg 125 | or 126 | gd 127 | cdo 128 | frr 129 | os 130 | ilo 131 | yi 132 | sd 133 | am 134 | bug 135 | nap 136 | ha 137 | sah 138 | hsb 139 | map-bms 140 | ps 141 | fo 142 | mai 143 | li 144 | mzn 145 | eml 146 | gor 147 | ban 148 | ace 149 | lld 150 | bcl 151 | sa 152 | wa 153 | zh-classical 154 | lij 155 | shn 156 | zu 157 | mrj 158 | mhr 159 | hif 160 | as 161 | mni 162 | hyw 163 | hak 164 | roa-tara 165 | pam 166 | km 167 | ie 168 | nso 169 | rue 170 | so 171 | bh 172 | sn 173 | se 174 | vls 175 | nds-nl 176 | myv 177 | sat 178 | mi 179 | nah 180 | sc 181 | vep 182 | gan 183 | glk 184 | kab 185 | tk 186 | fiu-vro 187 | co 188 | bo 189 | ab 190 | kv 191 | frp 192 | csb 193 | pcd 194 | kw 195 | ug 196 | gv 197 | udm 198 | ary 199 | ay 200 | nrm 201 | zea 202 | gn 203 | bjn 204 | mt 205 | skr 206 | lez 207 | lfn 208 | smn 209 | stq 210 | lo 211 | mwl 212 | olo 213 | rm 214 | fur 215 | lad 216 | gom 217 | ang 218 | ig 219 | koi 220 | ext 221 | tyv 222 | dsb 223 | dty 224 | ln 225 | cbk-zam 226 | dv 227 | rw 228 | ksh 229 | gag 230 | bxr 231 | pfl 232 | av 233 | pag 234 | pi 235 | haw 236 | awa 237 | tay 238 | pap 239 | krc 240 | xal 241 | szy 242 | za 243 | inh 244 | kaa 245 | pdc 246 | atj 247 | to 248 | arc 249 | kbp 250 | tpi 251 | jam 252 | tw 253 | na 254 | wo 255 | mdf 256 | dag 257 | kbd 258 | tcy 259 | nov 260 | ki 261 | nia 262 | tet 263 | lg 264 | bi 265 | jbo 266 | roa-rup 267 | fj 268 | kg 269 | xh 270 | lbe 271 | ty 272 | nqo 273 | mnw 274 | tum 275 | cu 276 | shi 277 | ks 278 | trv 279 | srn 280 | om 281 | sm 282 | gcr 283 | alt 284 | ltg 285 | chr 286 | pih 287 | ny 288 | got 289 | mad 290 | st 291 | ami 292 | kl 293 | rmy 294 | tn 295 | bm 296 | ts 297 | chy 298 | ve 299 | rn 300 | iu 301 | ak 302 | ss 303 | ch 304 | pnt 305 | ady 306 | ik 307 | ee 308 | ff 309 | din 310 | sg 311 | ti 312 | dz 313 | pwn 314 | cr 315 | ng 316 | cho 317 | mh 318 | kj 319 | ii 320 | ho 321 | lrc 322 | aa 323 | -------------------------------------------------------------------------------- /config/TAGS_HTML_HEADERS.tsv: -------------------------------------------------------------------------------- 1 | h1 2 | h2 3 | h3 4 | h4 5 | h5 6 | h6 7 | -------------------------------------------------------------------------------- /config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phucty/wtabhtml/f4fecc3dcaaa182cf3f2f5ac53be99c8a6172e92/config/__init__.py -------------------------------------------------------------------------------- /config/config.py: -------------------------------------------------------------------------------- 1 | from core.utils import io_worker as iw 2 | 3 | DIR_ROOT = "/Users/phucnguyen/git/wtabhtml" 4 | DUMPS_VERSION_WP_HTML = "20220301" 5 | 6 | # Configuration 7 | ENCODING = "utf-8" 8 | 9 | # Directories 10 | DIR_DUMPS = f"{DIR_ROOT}/data/dump" 11 | DIR_MODELS = f"{DIR_ROOT}/data/models" 12 | DIR_CONFIG = f"{DIR_ROOT}/config" 13 | 14 | # 322 languages of Wikipedia 15 | LANGS = iw.read_tsv_file_first_col(f"{DIR_CONFIG}/LANGS_322.tsv", ENCODING) 16 | 17 | HTML_HEADERS = iw.read_tsv_file_first_col( 18 | f"{DIR_CONFIG}/TAGS_HTML_HEADERS.tsv", ENCODING 19 | ) 20 | 21 | URL_WP_HTML = "https://dumps.wikimedia.org/other/enterprise_html/runs/{wikipedia_version}/{lang}wiki-NS0-{wikipedia_version}-ENTERPRISE-HTML.json.tar.gz" 22 | -------------------------------------------------------------------------------- /core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phucty/wtabhtml/f4fecc3dcaaa182cf3f2f5ac53be99c8a6172e92/core/__init__.py -------------------------------------------------------------------------------- /core/downloader.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | from config import config as cf 4 | import os 5 | from core.utils import io_worker as iw 6 | import requests 7 | 8 | 9 | def download_file(download_url): 10 | dump_file = download_url.split("/")[-1] 11 | downloaded_file = f"{cf.DIR_DUMPS}/{dump_file}" 12 | 13 | if os.path.exists(downloaded_file): 14 | return downloaded_file 15 | iw.create_dir(downloaded_file) 16 | r = requests.get(download_url, stream=True) 17 | if r.status_code != 200: 18 | return None 19 | p_bar = tqdm( 20 | total=int(r.headers.get("content-length", 0)), 21 | unit="B", 22 | unit_scale=True, 23 | desc=dump_file, 24 | ) 25 | with open(f"{cf.DIR_DUMPS}/{dump_file}", "wb") as f: 26 | for data in r.iter_content(10240): 27 | p_bar.update(len(data)) 28 | f.write(data) 29 | p_bar.close() 30 | return downloaded_file 31 | 32 | 33 | def download_wikipedia_html_dump(wikipedia_version=cf.DUMPS_VERSION_WP_HTML, lang="ja"): 34 | # Download Wikipedia dumps 35 | url = cf.URL_WP_HTML.format(wikipedia_version=wikipedia_version, lang=lang) 36 | downloaded_file = download_file(url) 37 | 38 | if downloaded_file: 39 | downloaded_size = iw.get_size_of_file(os.path.getsize(downloaded_file)) 40 | print(f"Downloaded: {downloaded_size} - {downloaded_file}") 41 | else: 42 | print(f"Error: {url}") 43 | return downloaded_file 44 | -------------------------------------------------------------------------------- /core/parse_wikitable_html.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import json 3 | import os.path 4 | import re 5 | from collections import defaultdict 6 | 7 | import bs4 8 | import ujson 9 | from tqdm import tqdm 10 | 11 | from core.utils import io_worker as iw 12 | from config import config as cf 13 | 14 | 15 | def normalize_wikitables_css(soup, table): 16 | has_header = False 17 | end_header = False 18 | thead = soup.new_tag("thead") 19 | for i1, tag_1 in enumerate(table): 20 | if tag_1.name != "tbody": 21 | continue 22 | # tbody ta 23 | for i2, tag2 in enumerate(tag_1): 24 | if tag2.name != "tr": 25 | continue 26 | if not end_header and all( 27 | (col.name in ["th", None] and col.name not in ["td"]) for col in tag2 28 | ): 29 | tag2.extract() 30 | thead.append(tag2) 31 | has_header = True 32 | else: 33 | end_header = True 34 | if has_header: 35 | table.insert(0, thead) 36 | 37 | def filter_attr(bs_obj, white_tags): 38 | bs_obj.attrs = { 39 | attr: v for attr, v in bs_obj.attrs.items() if attr in white_tags 40 | } 41 | 42 | filter_attr(table, ["border", "cellpadding", "style"]) 43 | for a in table.findAll(True): 44 | filter_attr(a, ["colspan", "headers", "rowspan", "cellpadding", "style"]) 45 | 46 | for tag in ["a", "span", "link", "img"]: 47 | for a in table.findAll(tag): 48 | a.unwrap() 49 | 50 | for tag in ["sup"]: 51 | for a in table.findAll(tag): 52 | a.extract() 53 | 54 | # add css 55 | # table.attrs["background-color"] = "#f8f9fa" 56 | # table.attrs["color"] = "#202122" 57 | # table.attrs["margin"] = "1em 0" 58 | table.attrs["border"] = "1" 59 | # table.attrs["border-collapse"] = "collapse" 60 | return table 61 | 62 | 63 | def extract_html_tables_from_html(html_content): 64 | results = [] 65 | if not html_content: 66 | return results 67 | 68 | soup = bs4.BeautifulSoup(html_content, "html.parser") 69 | html_tables = soup.find_all("table", {"class": re.compile("wikitable*")}) 70 | tables = [] 71 | for i, html_table in enumerate(html_tables): 72 | 73 | # Check this table is a nested table or not 74 | # We ignore the nested tables, just process wikitables do not have any wikitable inside 75 | sub_wikitables = html_table.find("table", {"class": re.compile("wikitable*")}) 76 | if sub_wikitables: 77 | continue 78 | 79 | table = {} 80 | # Get table caption 81 | tag_caption = html_table.find("caption") 82 | if tag_caption: 83 | table["caption"] = tag_caption.get_text().strip() 84 | 85 | # Get section hierarchy 86 | cur = html_table 87 | while True: 88 | section = cur.find_parent("section") 89 | if not section: 90 | break 91 | section_name = section.next 92 | if section_name and section_name.name in cf.HTML_HEADERS: 93 | if table.get("aspects") is None: 94 | table["aspects"] = [] 95 | table["aspects"].append(section_name.get_text()) 96 | cur = section 97 | 98 | if table.get("aspects") and len(table["aspects"]) > 1: 99 | table["aspects"] = table["aspects"][::-1] 100 | 101 | html_table = normalize_wikitables_css(soup, html_table) 102 | table["html"] = str(html_table) 103 | tables.append(table) 104 | 105 | return tables 106 | 107 | 108 | def add_css_wikitable(html_source): 109 | """ 110 | Add css of wikitable to the html source 111 | :param html_source: 112 | :type html_source: 113 | :return: 114 | :rtype: 115 | """ 116 | if isinstance(html_source, bytes) or isinstance(html_source, str): 117 | html_source = bs4.BeautifulSoup(html_source, "html.parser") 118 | tables = html_source.find_all("table", {"class": re.compile("wikitable*")}) 119 | else: 120 | tables = [html_source] 121 | for table in tables: 122 | table.attrs["background-color"] = "#f8f9fa" 123 | table.attrs["color"] = "#202122" 124 | table.attrs["margin"] = "1em 0" 125 | table.attrs["border"] = "1px solid #a2a9b1" 126 | table.attrs["border-collapse"] = "collapse" 127 | 128 | return str(html_source) 129 | 130 | 131 | def pool_parse_html_source(line): 132 | if ( 133 | not line 134 | or not line.get("article_body") 135 | or not line["article_body"].get("html") 136 | or "wikitable" not in line["article_body"]["html"] 137 | ): 138 | return None 139 | 140 | if not line.get("main_entity") or not line["main_entity"].get("identifier"): 141 | return None 142 | 143 | wikitables_html = extract_html_tables_from_html(line["article_body"]["html"]) 144 | if not wikitables_html: 145 | return None 146 | table_objs = [] 147 | for i, wikitable in enumerate(wikitables_html): 148 | table_obj = {"index": i} 149 | 150 | def update_dict(attr, value): 151 | if value: 152 | table_obj[attr] = value 153 | 154 | if line.get("main_entity") and line["main_entity"].get("identifier"): 155 | update_dict("wikidata", line["main_entity"]["identifier"]) 156 | if not table_obj.get("wikidata"): 157 | continue 158 | 159 | update_dict("title", line.get("name")) 160 | update_dict("url", line.get("url")) 161 | update_dict("html", wikitable.get("html")) 162 | update_dict("caption", wikitable.get("caption")) 163 | update_dict("aspects", wikitable.get("aspects")) 164 | 165 | # table_obj["html"] = add_css_wikitable(table_obj["html"]) 166 | 167 | table_objs.append(table_obj) 168 | return table_objs 169 | 170 | 171 | def parse_wikitables(input_file=None): 172 | dump_file = iw.read_line_from_file(input_file, mode="rb") 173 | for line in dump_file: 174 | try: 175 | line_obj = ujson.loads(line) 176 | parsed_objs = pool_parse_html_source(line_obj) 177 | if parsed_objs: 178 | yield parsed_objs 179 | except ValueError: 180 | continue 181 | 182 | 183 | def dump_wikitables( 184 | lang="ja", input_file=None, outfile=None, limit=0, step=1000, progress=True 185 | ): 186 | if input_file is None: 187 | input_file = f"{cf.DIR_DUMPS}/{lang}wiki-NS0-{cf.DUMPS_VERSION_WP_HTML}-ENTERPRISE-HTML.json.tar.gz" 188 | if not os.path.exists(input_file): 189 | return 190 | 191 | if not outfile: 192 | if limit: 193 | outfile = ( 194 | f"{cf.DIR_MODELS}/wikitables_html_pubtabnet/{lang}_{limit}.jsonl.bz2" 195 | ) 196 | else: 197 | outfile = f"{cf.DIR_MODELS}/wikitables_html_pubtabnet/{lang}.jsonl.bz2" 198 | if os.path.exists(outfile): 199 | return outfile 200 | 201 | iw.create_dir(outfile) 202 | 203 | if outfile.endswith(".bz2"): 204 | jsonFile = bz2.open(outfile, "wt") 205 | else: 206 | jsonFile = open(outfile, "w") 207 | 208 | parser = parse_wikitables(input_file) 209 | n = 0 210 | i = 0 211 | 212 | def update_desc(i): 213 | return f"Parse Wikitable {lang}. Saved {n:,} tables / {i:,} pages" 214 | 215 | p_bar = None 216 | if progress: 217 | p_bar = tqdm(desc=update_desc(0)) 218 | 219 | for parsed_objs in parser: 220 | if limit and n >= limit: 221 | break 222 | if progress and i and i % step == 0: 223 | p_bar.update(step) 224 | p_bar.set_description(desc=update_desc(i)) 225 | for parsed_obj in parsed_objs: 226 | n += 1 227 | jsonString = ujson.dumps(parsed_obj) 228 | jsonFile.write(jsonString) 229 | jsonFile.write("\n") 230 | i += 1 231 | if progress: 232 | p_bar.set_description(desc=update_desc(i)) 233 | jsonFile.close() 234 | return outfile 235 | 236 | 237 | def func_modify_table_border(table_obj): 238 | table_obj["html"] = table_obj["html"].replace("1px solid #a2a9b1", "1") 239 | return table_obj 240 | 241 | 242 | def modify_json_dump(input_folder, func): 243 | dump_files = iw.get_files_from_dir(input_folder, is_sort=True, reverse=True) 244 | for dump_file in dump_files: 245 | file_name = os.path.basename(dump_file).split(".")[0] 246 | dir_output = dump_file + ".tmp" 247 | 248 | if dump_file.endswith(".bz2"): 249 | output_file = bz2.open(dir_output, "wt") 250 | else: 251 | output_file = open(dir_output, "w") 252 | 253 | iter_obj = iw.read_json_file(dump_file) 254 | for table_obj in tqdm(iter_obj, desc=file_name): 255 | table_obj = func(table_obj) 256 | output_file.write(ujson.dumps(table_obj)) 257 | output_file.write("\n") 258 | 259 | output_file.close() 260 | iw.delete_file(dump_file) 261 | os.rename(dir_output, dump_file) 262 | 263 | 264 | def read_wikitable_dumps(input_file: str, limit: int = 0): 265 | for i, table_obj in enumerate(iw.read_json_file(input_file, limit)): 266 | print(json.dumps(table_obj, indent=2, ensure_ascii=False)) 267 | 268 | 269 | def get_jsonl_size(input_file: str): 270 | size = 0 271 | for _ in iw.read_json_file(input_file): 272 | size += 1 273 | return size 274 | 275 | 276 | def analyze_wikitables(input_folder: str = cf.DIR_MODELS, limit=0, step=1000): 277 | """ 278 | Show stats of tables 279 | """ 280 | dump_files = iw.get_files_from_dir(input_folder, is_sort=True, reverse=False) 281 | stats = defaultdict() 282 | 283 | for dump_file in dump_files: 284 | file_name = os.path.basename(dump_file).split(".")[0] 285 | 286 | n_tables, n_pages, n_caption, n_aspects = 0, 0, 0, 0 287 | 288 | def update_desc(): 289 | if n_tables and n_pages: 290 | return f"{file_name}. {n_pages:,} pages | {n_tables/n_pages:.2f} tab/page | {n_caption/n_tables*100:.2f}% cap/tab" 291 | else: 292 | return "" 293 | 294 | pre_title = None 295 | p_bar = tqdm(desc=update_desc()) 296 | try: 297 | iter_obj = iw.read_json_file(dump_file) 298 | for table_obj in iter_obj: 299 | n_tables += 1 300 | p_bar.update() 301 | if pre_title != table_obj["title"]: 302 | n_pages += 1 303 | pre_title = table_obj["title"] 304 | if n_pages % step == 0: 305 | p_bar.set_description(desc=update_desc()) 306 | 307 | if table_obj.get("caption"): 308 | n_caption += 1 309 | 310 | if table_obj.get("aspects"): 311 | n_aspects += 1 312 | if limit and n_tables >= 1000: 313 | break 314 | p_bar.set_description(desc=update_desc()) 315 | p_bar.close() 316 | except EOFError: 317 | stats[file_name] = [0, 0, 0, 0, 0, 0] 318 | continue 319 | 320 | stats[file_name] = [ 321 | n_pages, 322 | n_tables, 323 | n_caption, 324 | n_aspects, 325 | n_caption / n_tables * 100 if n_tables else 0, 326 | n_aspects / n_tables * 100 if n_tables else 0, 327 | ] 328 | 329 | headers = [ 330 | "No", 331 | "Dump", 332 | "Pages", 333 | "Tables", 334 | "Captions", 335 | "Aspects", 336 | "Captions/Table", 337 | "Aspects/Table", 338 | ] 339 | 340 | iw.print_status("\t".join(headers)) 341 | for i, (file_name, stats_obj) in enumerate(stats.items()): 342 | iw.print_status( 343 | f"{i+1}\t{file_name}\t" 344 | + "\t".join(f"{obj_i:,}" for obj_i in stats_obj[:4]) 345 | + "\t" 346 | + "\t".join(f"{obj_i:.2f}" for obj_i in stats_obj[4:]) 347 | ) 348 | -------------------------------------------------------------------------------- /core/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phucty/wtabhtml/f4fecc3dcaaa182cf3f2f5ac53be99c8a6172e92/core/utils/__init__.py -------------------------------------------------------------------------------- /core/utils/io_worker.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import csv 3 | import fnmatch 4 | import gzip 5 | import logging 6 | import math 7 | import os 8 | import pickle 9 | import shutil 10 | import zlib 11 | 12 | import numpy 13 | import ujson 14 | import tarfile 15 | 16 | 17 | def read_tsv_file_first_col(file_name, encoding): 18 | with open(file_name, encoding=encoding) as f: 19 | first_col = [l[0].rstrip() for l in csv.reader(f, delimiter="\t")] 20 | return first_col 21 | 22 | 23 | def read_line_from_file(file_name, mode="r"): 24 | if ".bz2" in file_name: 25 | reader = bz2.BZ2File(file_name, mode=mode) 26 | elif ".gz" in file_name: 27 | reader = gzip.open(file_name, mode=mode) 28 | else: 29 | reader = open(file_name, mode=mode) 30 | if reader: 31 | for line in reader: 32 | yield line 33 | 34 | 35 | def print_status(message, is_screen=True, is_log=True) -> object: 36 | if isinstance(message, int): 37 | message = f"{message:,}" 38 | 39 | if is_screen: 40 | print(message) 41 | if is_log: 42 | logging.info(message) 43 | 44 | 45 | def print_stats_dicts( 46 | message, print_obj, is_screen=True, is_log=True, delimiter="\t", header=None 47 | ): 48 | print_status(message) 49 | if header: 50 | print_status("\t".join(header)) 51 | for i, (k, v) in enumerate(print_obj.items()): 52 | if isinstance(v, list) or isinstance(v, tuple): 53 | v = f"{delimiter}".join([str(v_i) for v_i in v]) 54 | print_status(f"{i + 1}{delimiter}{k}{delimiter}{v}", is_screen, is_log) 55 | 56 | 57 | def delete_folder(folder_dir): 58 | if os.path.exists(folder_dir): 59 | shutil.rmtree(folder_dir, ignore_errors=False) 60 | return True 61 | 62 | 63 | def delete_file(file_dir): 64 | if os.path.exists(file_dir): 65 | os.remove(file_dir) 66 | return True 67 | 68 | 69 | def create_dir(file_dir): 70 | folder_dir = os.path.dirname(file_dir) 71 | if not os.path.exists(folder_dir): 72 | os.makedirs(folder_dir) 73 | 74 | 75 | def save_obj_pkl(file_name, save_object, is_compress=False, is_message=True): 76 | create_dir(file_name) 77 | save_file = file_name 78 | if ".pkl" not in file_name: 79 | save_file = file_name + ".pkl" 80 | if is_compress and ".zlib" not in file_name: 81 | save_file += ".zlib" 82 | 83 | temp_file = save_file + ".temp" 84 | 85 | # Write temp 86 | with open(temp_file, "wb") as fp: 87 | if is_compress: 88 | save_data = zlib.compress( 89 | pickle.dumps(save_object, pickle.HIGHEST_PROTOCOL) 90 | ) 91 | fp.write(save_data) 92 | else: 93 | pickle.dump(save_object, fp, pickle.HIGHEST_PROTOCOL) 94 | 95 | try: 96 | if os.path.exists(save_file): 97 | os.remove(save_file) 98 | except Exception as message: 99 | print_status(message) 100 | 101 | os.rename(temp_file, save_file) 102 | if is_message: 103 | print_status("Saved: - %d - %s" % (len(save_object), save_file), is_log=False) 104 | return save_file 105 | 106 | 107 | def load_obj_pkl(file_name, is_message=False): 108 | load_obj = None 109 | if not os.path.exists(file_name) and ".pkl" not in file_name: 110 | file_name = file_name + ".pkl" 111 | 112 | if not os.path.exists(file_name) and ".zlib" not in file_name: 113 | file_name = file_name + ".zlib" 114 | with open(file_name, "rb") as fp: 115 | if ".zlib" in file_name: 116 | load_obj = pickle.loads(zlib.decompress(fp.read())) 117 | else: 118 | load_obj = pickle.load(fp) 119 | if is_message and load_obj: 120 | print_status("%d loaded items - %s" % (len(load_obj), file_name)) 121 | return load_obj 122 | 123 | 124 | def get_size_of_file(num, suffix="B"): 125 | """Get human friendly file size 126 | https://gist.github.com/cbwar/d2dfbc19b140bd599daccbe0fe925597#gistcomment-2845059 127 | 128 | Args: 129 | num (int): Bytes value 130 | suffix (str, optional): Unit. Defaults to 'B'. 131 | 132 | Returns: 133 | str: file size0 134 | """ 135 | if num == 0: 136 | return "0" 137 | magnitude = int(math.floor(math.log(num, 1024))) 138 | val = num / math.pow(1024, magnitude) 139 | if magnitude > 7: 140 | return "{:3.1f}{}{}".format(val, "Yi", suffix) 141 | return "{:3.1f}{}{}".format( 142 | val, ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"][magnitude], suffix 143 | ) 144 | 145 | 146 | def load_object_csv(file_name, encoding="utf-8", retries=2): 147 | content = [] 148 | if os.path.exists(file_name): 149 | with open(file_name, "r", encoding=encoding) as f: 150 | reader = csv.reader(f, delimiter=",") 151 | for r in reader: 152 | content.append(r) 153 | return content 154 | 155 | 156 | def save_object_csv(file_name, rows): 157 | create_dir(file_name) 158 | temp_file = "%s.temp" % file_name 159 | with open(temp_file, "w") as f: 160 | try: 161 | writer = csv.writer(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_ALL) 162 | for r in rows: 163 | if ( 164 | isinstance(r, list) 165 | or isinstance(r, tuple) 166 | or isinstance(r, numpy.ndarray) 167 | ): 168 | writer.writerow(r) 169 | else: 170 | writer.writerow([r]) 171 | except Exception as message: 172 | print(message) 173 | if os.path.exists(file_name): 174 | os.remove(file_name) 175 | os.rename(temp_file, file_name) 176 | 177 | 178 | def read_json_file(input_file: str, limit: int = 0): 179 | if input_file.endswith(".bz2"): 180 | jsonFile = bz2.BZ2File(input_file) 181 | else: 182 | jsonFile = open(input_file, "r") 183 | i = 0 184 | limit = int(limit) 185 | while True: 186 | line = jsonFile.readline() 187 | if not line: 188 | break 189 | i += 1 190 | if limit and i > limit: 191 | break 192 | table_obj = ujson.loads(line) 193 | yield table_obj 194 | jsonFile.close() 195 | 196 | 197 | def get_files_from_dir_stream(folder_path, extension="*"): 198 | for root, _, file_dirs in os.walk(folder_path): 199 | for file_dir in fnmatch.filter(file_dirs, "*.%s" % extension): 200 | if ".DS_Store" not in file_dir: 201 | yield os.path.join(root, file_dir) 202 | 203 | 204 | def get_files_from_dir_subdir(folder_path, extension="*"): 205 | all_files = [] 206 | for root, _, file_dirs in os.walk(folder_path): 207 | for file_dir in fnmatch.filter(file_dirs, "*.%s" % extension): 208 | if ".DS_Store" not in file_dir: 209 | all_files.append(os.path.join(root, file_dir)) 210 | return all_files 211 | 212 | 213 | def get_files_from_dir( 214 | folder_path, extension="*", limit_reader=-1, is_sort=False, reverse=False 215 | ): 216 | all_file_dirs = get_files_from_dir_subdir(folder_path, extension) 217 | 218 | if is_sort: 219 | file_with_size = [(f, os.path.getsize(f)) for f in all_file_dirs] 220 | file_with_size.sort(key=lambda f: f[1], reverse=reverse) 221 | all_file_dirs = [f for f, _ in file_with_size] 222 | if limit_reader < 0: 223 | 224 | limit_reader = len(all_file_dirs) 225 | return all_file_dirs[:limit_reader] 226 | 227 | 228 | def compress_folder(input_folder, output_file, delete_org=False): 229 | if not os.path.exists(input_folder): 230 | return 231 | # input_files = get_files_from_dir(input_folder) 232 | 233 | with tarfile.open(output_file, "w:bz2") as tar_handle: 234 | for root, dirs, files in os.walk(input_folder): 235 | for file in files: 236 | tar_handle.add( 237 | os.path.join(root, file), 238 | arcname=os.path.join(root, file).replace(input_folder, ""), 239 | ) 240 | # for file in input_files: 241 | # tar.add(file, arcname=os.path.basename(file), recursive=False) 242 | if delete_org: 243 | delete_folder(input_folder) 244 | 245 | 246 | def merge_jsonl_files(input_folder): 247 | if not os.path.exists(input_folder): 248 | return 249 | output_file = input_folder + ".jsonl.bz2" 250 | 251 | file_writer = bz2.open(output_file, "wt") 252 | n = 0 253 | input_files = get_files_from_dir(input_folder) 254 | for input_file in input_files: 255 | for line in read_json_file(input_file): 256 | file_writer.write(ujson.dumps(line)) 257 | file_writer.write("\n") 258 | n += 1 259 | 260 | file_writer.close() 261 | delete_folder(input_folder) 262 | return n 263 | -------------------------------------------------------------------------------- /core/wikitable_to_image.py: -------------------------------------------------------------------------------- 1 | import bz2 2 | import json 3 | import multiprocessing 4 | import os 5 | import random 6 | import re 7 | import time 8 | from io import BytesIO 9 | 10 | import cv2 11 | import numpy as np 12 | import ujson 13 | from PIL import Image 14 | from bs4 import BeautifulSoup 15 | from selenium import webdriver 16 | from selenium.webdriver.common.by import By 17 | from selenium.webdriver.firefox.options import Options 18 | from selenium.webdriver.support import expected_conditions as EC 19 | from selenium.webdriver.support.ui import WebDriverWait 20 | 21 | from config import config as cf 22 | from core.parse_wikitable_html import get_jsonl_size 23 | from core.utils import io_worker as iw 24 | from core.utils.io_worker import merge_jsonl_files 25 | 26 | 27 | def html_to_img(driver, html_content, id_count): 28 | """converts html to image and bounding boxes of each cell""" 29 | counter = 1 # This counter is to keep track of the exceptions and stop execution after 10 exceptions have occurred 30 | add_border = 2 31 | while True: 32 | try: 33 | driver.get("data:text/html;charset=utf-8," + html_content) 34 | 35 | el = driver.find_element_by_tag_name("table") 36 | png = el.screenshot_as_png 37 | 38 | im = Image.open(BytesIO(png)) 39 | 40 | table_loc = el.location 41 | 42 | bboxes = [] 43 | for id in range(id_count): 44 | # print(id) 45 | e = WebDriverWait(driver, 3).until( 46 | EC.presence_of_element_located((By.ID, str(id))) 47 | ) 48 | txt = e.text.strip() 49 | lentext = len(txt) 50 | loc = e.location 51 | size_ = e.size 52 | xmin = loc["x"] - table_loc["x"] - add_border 53 | ymin = loc["y"] - table_loc["y"] - add_border 54 | xmax = int(size_["width"] + xmin) + add_border * 2 55 | ymax = int(size_["height"] + ymin) + add_border * 2 56 | bboxes.append([lentext, txt, xmin, ymin, xmax, ymax]) 57 | 58 | return im, bboxes 59 | except Exception as e: 60 | counter += 1 61 | return None, None 62 | # if counter==10: 63 | # return im,None 64 | 65 | # continue 66 | 67 | 68 | def html_string2list(html_string): 69 | """this function convert string into list of char and html tag""" 70 | list_ = [] 71 | idx_tag = -1 72 | for i, char in enumerate(html_string): 73 | if char == "<": 74 | idx_tag = i 75 | elif idx_tag != -1 and char == ">": 76 | html_tag = html_string[idx_tag : i + 1] 77 | 78 | # ignore comment inside cell content 79 | if html_tag.startswith("