├── README.md
├── cli
    ├── __init__.py
    ├── downloader.py
    ├── parser.py
    ├── pipeline.py
    └── reader.py
├── config
    ├── LANGS_322.tsv
    ├── TAGS_HTML_HEADERS.tsv
    ├── __init__.py
    └── config.py
├── core
    ├── __init__.py
    ├── downloader.py
    ├── parse_wikitable_html.py
    ├── utils
    │   ├── __init__.py
    │   └── io_worker.py
    └── wikitable_to_image.py
├── data
    └── dump
    │   └── crwiki-NS0-20220301-ENTERPRISE-HTML.json.tar.gz
├── requirements.txt
├── run.py
└── wtabhtml.py


/README.md:
--------------------------------------------------------------------------------
 1 | # WTabHTML: HTML Wikitables extractor 
 2 | 
 3 | ### Input:
 4 | - Wikipedia HTML dump
 5 | - Language
 6 | 
 7 | ### Output:
 8 | File format: JSON list. Each line is a json object of
 9 | ```
10 | {
11 |     title: wikipedia title
12 |     wikidata: wikidata ID
13 |     url: the url that link to Wikipedia page
14 |     index: the index of table in the Wikipedia page
15 |     html: html content of table
16 |     caption: table caption
17 |     aspects: (Hierachy sections of Wikipedia)  
18 | }
19 | ```
20 | 
21 | ### Usage:
22 | #### Download, Extract, and dump wikitables in CR language
23 | ```shell
24 | python wtabhtml.py dump -l cr
25 | ```
26 | 
27 | #### Download, Extract, dump wikitables, and generate table images in CR language 
28 | 
29 | ```shell
30 | python wtabhtml.py gen-images -l cr -n 3
31 | ```
32 | Note: User can download our [preprocessed dumps](https://drive.google.com/drive/folders/1wU5zdHcb3egxpwyluZCqVBIZnSanUwqN?usp=sharing) then, copy all {LANGUAGE}.jsonl.bz2 (the wikitables dump in PubTabNet format) to `wtabhtml/data/models/wikitables_html_pubtabnet` to generate photo images faster.
33 | 
34 | 
35 | If user want to re-run all pipeline, the tool will download Wikipedia HTML dump, extract wikitables, and dump it to `wtabhtml/data/models/wikitables_html_pubtabnet\{LANGUAGE}.jsonl.bz2` file as the following pipeline.
36 | 
37 | #### Pipeline of Wikitable processing in cr language
38 | ```shell
39 | # Download dump
40 | python wtabhtml.py download -l cr
41 | # Parse dump and save json file
42 | python wtabhtml.py parse -l cr
43 | # Read dump
44 | python wtabhtml.py read -l 1 -i ./data/models/cr.jsonl.bz2
45 | # Generate images
46 | python wtabhtml.py gen-images -l cr -n 3
47 | ```
48 | 
49 | ### Contact
50 | Phuc Nguyen (`phucnt@nii.ac.jp`)
51 | 


--------------------------------------------------------------------------------
/cli/__init__.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | from cli import downloader, parser, reader, pipeline
 3 | 
 4 | cli_wikitables = click.CommandCollection(
 5 |     sources=[
 6 |         parser.cli_parser,
 7 |         reader.cli_reader,
 8 |         downloader.cli_downloader,
 9 |         pipeline.cli_pipeline,
10 |     ]
11 | )
12 | 


--------------------------------------------------------------------------------
/cli/downloader.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | from config import config as cf
 3 | from core.downloader import download_wikipedia_html_dump
 4 | 
 5 | 
 6 | @click.group()
 7 | def cli_downloader():
 8 |     pass
 9 | 
10 | 
11 | @cli_downloader.command()
12 | @click.option(
13 |     "-p",
14 |     "--wikipedia_version",
15 |     default=cf.DUMPS_VERSION_WP_HTML,
16 |     show_default=True,
17 |     help="Version of Wikipedia HTML dump. Find at https://dumps.wikimedia.org/other/enterprise_html/runs/",
18 | )
19 | @click.option(
20 |     "-l",
21 |     "--language",
22 |     default="ja",
23 |     show_default=True,
24 |     help="Download the Wikipedia dump of language edition",
25 | )
26 | def download(wikipedia_version, language):
27 |     download_wikipedia_html_dump(wikipedia_version, language)
28 | 


--------------------------------------------------------------------------------
/cli/parser.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | from config import config as cf
 3 | from core import parse_wikitable_html
 4 | 
 5 | 
 6 | @click.group()
 7 | def cli_parser():
 8 |     pass
 9 | 
10 | 
11 | @cli_parser.command()
12 | @click.option(
13 |     "-l",
14 |     "--language",
15 |     default="ja",
16 |     show_default=True,
17 |     help="Parse dump in the langauge",
18 | )
19 | @click.option(
20 |     "-f",
21 |     "--downloaded_file",
22 |     default=None,
23 |     show_default=True,
24 |     help="Directory of the downloaded file (Wikipedia HTML dump)",
25 | )
26 | @click.option(
27 |     "-t", "--limit_table", default=0, show_default=True, help="Save # number of tables",
28 | )
29 | def parse(language, downloaded_file, limit_table):
30 |     parse_wikitable_html.dump_wikitables(
31 |         lang=language, input_file=downloaded_file, limit=limit_table
32 |     )
33 | 


--------------------------------------------------------------------------------
/cli/pipeline.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from contextlib import closing
  3 | from multiprocessing import Pool
  4 | 
  5 | import click
  6 | from config import config as cf
  7 | from core import parse_wikitable_html, downloader
  8 | from core.utils import io_worker as iw
  9 | from core import wikitable_to_image
 10 | 
 11 | 
 12 | @click.group()
 13 | def cli_pipeline():
 14 |     pass
 15 | 
 16 | 
 17 | def pool_run_dump(args):
 18 |     wikipedia_version, language = args
 19 |     downloaded_file = downloader.download_wikipedia_html_dump(
 20 |         wikipedia_version, language
 21 |     )
 22 |     if not downloaded_file:
 23 |         return None
 24 |     dump_file = parse_wikitable_html.dump_wikitables(
 25 |         lang=language, input_file=downloaded_file, progress=True
 26 |     )
 27 |     return dump_file
 28 | 
 29 | 
 30 | def run_dump(wikipedia_version, language, n_threads):
 31 |     if language != "all":
 32 |         languages = [language]
 33 |     else:
 34 |         languages = cf.LANGS
 35 | 
 36 |     args = [[wikipedia_version, l] for l in reversed(languages)]
 37 | 
 38 |     with closing(Pool(processes=n_threads)) as p:
 39 |         for i, dump_file in enumerate(p.imap_unordered(pool_run_dump, args)):
 40 |             if not dump_file:
 41 |                 continue
 42 |             dump_size = iw.get_size_of_file(os.path.getsize(dump_file))
 43 |             print(f"{i + 1}. Dump {language} Saved: {dump_size} - {dump_file}: ")
 44 | 
 45 | 
 46 | @cli_pipeline.command()
 47 | @click.option(
 48 |     "-p",
 49 |     "--wikipedia_version",
 50 |     default=cf.DUMPS_VERSION_WP_HTML,
 51 |     show_default=True,
 52 |     help="Version of Wikipedia HTML dump. Find at https://dumps.wikimedia.org/other/enterprise_html/runs/",
 53 | )
 54 | @click.option(
 55 |     "-l",
 56 |     "--language",
 57 |     default="all",
 58 |     show_default=True,
 59 |     help="Parse the Wikipedia dump of language edition",
 60 | )
 61 | @click.option(
 62 |     "-n", "--n_threads", default=1, show_default=True, help="Run n multiprocessors",
 63 | )
 64 | def dump_json(wikipedia_version, language, n_threads):
 65 |     run_dump(wikipedia_version, language, n_threads)
 66 | 
 67 | 
 68 | @cli_pipeline.command()
 69 | @click.option(
 70 |     "-p",
 71 |     "--wikipedia_version",
 72 |     default=cf.DUMPS_VERSION_WP_HTML,
 73 |     show_default=True,
 74 |     help="Version of Wikipedia HTML dump. Find at https://dumps.wikimedia.org/other/enterprise_html/runs/",
 75 | )
 76 | @click.option(
 77 |     "-l",
 78 |     "--language",
 79 |     default="all",
 80 |     show_default=True,
 81 |     help="Parse the Wikipedia dump of language edition",
 82 | )
 83 | @click.option(
 84 |     "-n", "--n_threads", default=1, show_default=True, help="Run n multiprocessors",
 85 | )
 86 | @click.option(
 87 |     "-c",
 88 |     "--compress",
 89 |     default=False,
 90 |     show_default=True,
 91 |     help="Compress the output dataset or not",
 92 | )
 93 | @click.option(
 94 |     "-d",
 95 |     "--delete_org",
 96 |     default=False,
 97 |     show_default=True,
 98 |     help="Delete the original folder after compressing",
 99 | )
100 | def gen_images(wikipedia_version, language, n_threads, compress, delete_org):
101 |     if language != "all":
102 |         languages = [language]
103 |     else:
104 |         languages = cf.LANGS
105 | 
106 |     iw.print_status(f"No\tLang\tImages\tErrors\tRunTime")
107 |     for i, language in enumerate(reversed(languages)):
108 |         n_errors, n_images, run_time = wikitable_to_image.gen_images(
109 |             wikipedia_version=wikipedia_version,
110 |             lang=language,
111 |             n_threads=n_threads,
112 |             compress=compress,
113 |             delete_org=delete_org,
114 |         )
115 |         iw.print_status(
116 |             f"{i + 1}\t{language}\t{n_images:,}\t{n_errors:,}\t{run_time:.2f}"
117 |         )
118 | 
119 | 
120 | if __name__ == "__main__":
121 |     run_dump(cf.DUMPS_VERSION_WP_HTML, "all", 1)
122 | 


--------------------------------------------------------------------------------
/cli/reader.py:
--------------------------------------------------------------------------------
 1 | import click
 2 | 
 3 | from core import parse_wikitable_html
 4 | 
 5 | 
 6 | @click.group()
 7 | def cli_reader():
 8 |     pass
 9 | 
10 | 
11 | @cli_reader.command()
12 | @click.option(
13 |     "--input_file", "-i", help="Read the JSON dump of Wikipedia tables",
14 | )
15 | @click.option(
16 |     "--limit", "-l", default=0, help="Return first limit tables",
17 | )
18 | def read(input_file, limit):
19 |     parse_wikitable_html.read_wikitable_dumps(input_file, limit)
20 | 
21 | 
22 | @cli_reader.command()
23 | @click.option(
24 |     "--input_file", "-i", help="Read the JSON dump of Wikipedia tables",
25 | )
26 | def size(input_file):
27 |     print(parse_wikitable_html.get_jsonl_size(input_file))
28 | 
29 | 
30 | @cli_reader.command()
31 | @click.option(
32 |     "--input_folder", "-i", help="The folder of Wikitable JSON dumps",
33 | )
34 | def stats(input_folder):
35 |     parse_wikitable_html.read_wikitable_dumps(input_folder)
36 | 


--------------------------------------------------------------------------------
/config/LANGS_322.tsv:
--------------------------------------------------------------------------------
  1 | en
  2 | ceb
  3 | de
  4 | sv
  5 | fr
  6 | nl
  7 | ru
  8 | es
  9 | it
 10 | arz
 11 | pl
 12 | ja
 13 | vi
 14 | war
 15 | zh
 16 | ar
 17 | uk
 18 | pt
 19 | fa
 20 | ca
 21 | sr
 22 | id
 23 | ko
 24 | no
 25 | fi
 26 | hu
 27 | cs
 28 | tr
 29 | ce
 30 | sh
 31 | zh-min-nan
 32 | ro
 33 | tt
 34 | eu
 35 | ms
 36 | eo
 37 | he
 38 | hy
 39 | bg
 40 | da
 41 | azb
 42 | sk
 43 | kk
 44 | et
 45 | min
 46 | be
 47 | hr
 48 | el
 49 | simple
 50 | lt
 51 | az
 52 | gl
 53 | sl
 54 | ur
 55 | nn
 56 | ka
 57 | hi
 58 | th
 59 | ta
 60 | uz
 61 | la
 62 | cy
 63 | ast
 64 | vo
 65 | mk
 66 | zh-yue
 67 | bn
 68 | lv
 69 | tg
 70 | my
 71 | af
 72 | mg
 73 | bs
 74 | oc
 75 | sq
 76 | mr
 77 | nds
 78 | ky
 79 | ml
 80 | be-tarask
 81 | te
 82 | new
 83 | br
 84 | sw
 85 | vec
 86 | jv
 87 | pms
 88 | pnb
 89 | ht
 90 | su
 91 | lb
 92 | ba
 93 | ga
 94 | szl
 95 | is
 96 | lmo
 97 | ku
 98 | cv
 99 | fy
100 | tl
101 | wuu
102 | an
103 | ckb
104 | sco
105 | diq
106 | pa
107 | yo
108 | ne
109 | bar
110 | io
111 | gu
112 | als
113 | kn
114 | scn
115 | bpy
116 | ia
117 | qu
118 | mn
119 | avk
120 | nv
121 | xmf
122 | si
123 | crh
124 | bat-smg
125 | or
126 | gd
127 | cdo
128 | frr
129 | os
130 | ilo
131 | yi
132 | sd
133 | am
134 | bug
135 | nap
136 | ha
137 | sah
138 | hsb
139 | map-bms
140 | ps
141 | fo
142 | mai
143 | li
144 | mzn
145 | eml
146 | gor
147 | ban
148 | ace
149 | lld
150 | bcl
151 | sa
152 | wa
153 | zh-classical
154 | lij
155 | shn
156 | zu
157 | mrj
158 | mhr
159 | hif
160 | as
161 | mni
162 | hyw
163 | hak
164 | roa-tara
165 | pam
166 | km
167 | ie
168 | nso
169 | rue
170 | so
171 | bh
172 | sn
173 | se
174 | vls
175 | nds-nl
176 | myv
177 | sat
178 | mi
179 | nah
180 | sc
181 | vep
182 | gan
183 | glk
184 | kab
185 | tk
186 | fiu-vro
187 | co
188 | bo
189 | ab
190 | kv
191 | frp
192 | csb
193 | pcd
194 | kw
195 | ug
196 | gv
197 | udm
198 | ary
199 | ay
200 | nrm
201 | zea
202 | gn
203 | bjn
204 | mt
205 | skr
206 | lez
207 | lfn
208 | smn
209 | stq
210 | lo
211 | mwl
212 | olo
213 | rm
214 | fur
215 | lad
216 | gom
217 | ang
218 | ig
219 | koi
220 | ext
221 | tyv
222 | dsb
223 | dty
224 | ln
225 | cbk-zam
226 | dv
227 | rw
228 | ksh
229 | gag
230 | bxr
231 | pfl
232 | av
233 | pag
234 | pi
235 | haw
236 | awa
237 | tay
238 | pap
239 | krc
240 | xal
241 | szy
242 | za
243 | inh
244 | kaa
245 | pdc
246 | atj
247 | to
248 | arc
249 | kbp
250 | tpi
251 | jam
252 | tw
253 | na
254 | wo
255 | mdf
256 | dag
257 | kbd
258 | tcy
259 | nov
260 | ki
261 | nia
262 | tet
263 | lg
264 | bi
265 | jbo
266 | roa-rup
267 | fj
268 | kg
269 | xh
270 | lbe
271 | ty
272 | nqo
273 | mnw
274 | tum
275 | cu
276 | shi
277 | ks
278 | trv
279 | srn
280 | om
281 | sm
282 | gcr
283 | alt
284 | ltg
285 | chr
286 | pih
287 | ny
288 | got
289 | mad
290 | st
291 | ami
292 | kl
293 | rmy
294 | tn
295 | bm
296 | ts
297 | chy
298 | ve
299 | rn
300 | iu
301 | ak
302 | ss
303 | ch
304 | pnt
305 | ady
306 | ik
307 | ee
308 | ff
309 | din
310 | sg
311 | ti
312 | dz
313 | pwn
314 | cr
315 | ng
316 | cho
317 | mh
318 | kj
319 | ii
320 | ho
321 | lrc
322 | aa
323 | 


--------------------------------------------------------------------------------
/config/TAGS_HTML_HEADERS.tsv:
--------------------------------------------------------------------------------
1 | h1
2 | h2
3 | h3
4 | h4
5 | h5
6 | h6
7 | 


--------------------------------------------------------------------------------
/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phucty/wtabhtml/f4fecc3dcaaa182cf3f2f5ac53be99c8a6172e92/config/__init__.py


--------------------------------------------------------------------------------
/config/config.py:
--------------------------------------------------------------------------------
 1 | from core.utils import io_worker as iw
 2 | 
 3 | DIR_ROOT = "/Users/phucnguyen/git/wtabhtml"
 4 | DUMPS_VERSION_WP_HTML = "20220301"
 5 | 
 6 | # Configuration
 7 | ENCODING = "utf-8"
 8 | 
 9 | # Directories
10 | DIR_DUMPS = f"{DIR_ROOT}/data/dump"
11 | DIR_MODELS = f"{DIR_ROOT}/data/models"
12 | DIR_CONFIG = f"{DIR_ROOT}/config"
13 | 
14 | # 322 languages of Wikipedia
15 | LANGS = iw.read_tsv_file_first_col(f"{DIR_CONFIG}/LANGS_322.tsv", ENCODING)
16 | 
17 | HTML_HEADERS = iw.read_tsv_file_first_col(
18 |     f"{DIR_CONFIG}/TAGS_HTML_HEADERS.tsv", ENCODING
19 | )
20 | 
21 | URL_WP_HTML = "https://dumps.wikimedia.org/other/enterprise_html/runs/{wikipedia_version}/{lang}wiki-NS0-{wikipedia_version}-ENTERPRISE-HTML.json.tar.gz"
22 | 


--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phucty/wtabhtml/f4fecc3dcaaa182cf3f2f5ac53be99c8a6172e92/core/__init__.py


--------------------------------------------------------------------------------
/core/downloader.py:
--------------------------------------------------------------------------------
 1 | from tqdm import tqdm
 2 | 
 3 | from config import config as cf
 4 | import os
 5 | from core.utils import io_worker as iw
 6 | import requests
 7 | 
 8 | 
 9 | def download_file(download_url):
10 |     dump_file = download_url.split("/")[-1]
11 |     downloaded_file = f"{cf.DIR_DUMPS}/{dump_file}"
12 | 
13 |     if os.path.exists(downloaded_file):
14 |         return downloaded_file
15 |     iw.create_dir(downloaded_file)
16 |     r = requests.get(download_url, stream=True)
17 |     if r.status_code != 200:
18 |         return None
19 |     p_bar = tqdm(
20 |         total=int(r.headers.get("content-length", 0)),
21 |         unit="B",
22 |         unit_scale=True,
23 |         desc=dump_file,
24 |     )
25 |     with open(f"{cf.DIR_DUMPS}/{dump_file}", "wb") as f:
26 |         for data in r.iter_content(10240):
27 |             p_bar.update(len(data))
28 |             f.write(data)
29 |     p_bar.close()
30 |     return downloaded_file
31 | 
32 | 
33 | def download_wikipedia_html_dump(wikipedia_version=cf.DUMPS_VERSION_WP_HTML, lang="ja"):
34 |     # Download Wikipedia dumps
35 |     url = cf.URL_WP_HTML.format(wikipedia_version=wikipedia_version, lang=lang)
36 |     downloaded_file = download_file(url)
37 | 
38 |     if downloaded_file:
39 |         downloaded_size = iw.get_size_of_file(os.path.getsize(downloaded_file))
40 |         print(f"Downloaded: {downloaded_size} - {downloaded_file}")
41 |     else:
42 |         print(f"Error: {url}")
43 |     return downloaded_file
44 | 


--------------------------------------------------------------------------------
/core/parse_wikitable_html.py:
--------------------------------------------------------------------------------
  1 | import bz2
  2 | import json
  3 | import os.path
  4 | import re
  5 | from collections import defaultdict
  6 | 
  7 | import bs4
  8 | import ujson
  9 | from tqdm import tqdm
 10 | 
 11 | from core.utils import io_worker as iw
 12 | from config import config as cf
 13 | 
 14 | 
 15 | def normalize_wikitables_css(soup, table):
 16 |     has_header = False
 17 |     end_header = False
 18 |     thead = soup.new_tag("thead")
 19 |     for i1, tag_1 in enumerate(table):
 20 |         if tag_1.name != "tbody":
 21 |             continue
 22 |         # tbody ta
 23 |         for i2, tag2 in enumerate(tag_1):
 24 |             if tag2.name != "tr":
 25 |                 continue
 26 |             if not end_header and all(
 27 |                 (col.name in ["th", None] and col.name not in ["td"]) for col in tag2
 28 |             ):
 29 |                 tag2.extract()
 30 |                 thead.append(tag2)
 31 |                 has_header = True
 32 |             else:
 33 |                 end_header = True
 34 |     if has_header:
 35 |         table.insert(0, thead)
 36 | 
 37 |     def filter_attr(bs_obj, white_tags):
 38 |         bs_obj.attrs = {
 39 |             attr: v for attr, v in bs_obj.attrs.items() if attr in white_tags
 40 |         }
 41 | 
 42 |     filter_attr(table, ["border", "cellpadding", "style"])
 43 |     for a in table.findAll(True):
 44 |         filter_attr(a, ["colspan", "headers", "rowspan", "cellpadding", "style"])
 45 | 
 46 |     for tag in ["a", "span", "link", "img"]:
 47 |         for a in table.findAll(tag):
 48 |             a.unwrap()
 49 | 
 50 |     for tag in ["sup"]:
 51 |         for a in table.findAll(tag):
 52 |             a.extract()
 53 | 
 54 |     # add css
 55 |     # table.attrs["background-color"] = "#f8f9fa"
 56 |     # table.attrs["color"] = "#202122"
 57 |     # table.attrs["margin"] = "1em 0"
 58 |     table.attrs["border"] = "1"
 59 |     # table.attrs["border-collapse"] = "collapse"
 60 |     return table
 61 | 
 62 | 
 63 | def extract_html_tables_from_html(html_content):
 64 |     results = []
 65 |     if not html_content:
 66 |         return results
 67 | 
 68 |     soup = bs4.BeautifulSoup(html_content, "html.parser")
 69 |     html_tables = soup.find_all("table", {"class": re.compile("wikitable*")})
 70 |     tables = []
 71 |     for i, html_table in enumerate(html_tables):
 72 | 
 73 |         # Check this table is a nested table or not
 74 |         # We ignore the nested tables, just process wikitables do not have any wikitable inside
 75 |         sub_wikitables = html_table.find("table", {"class": re.compile("wikitable*")})
 76 |         if sub_wikitables:
 77 |             continue
 78 | 
 79 |         table = {}
 80 |         # Get table caption
 81 |         tag_caption = html_table.find("caption")
 82 |         if tag_caption:
 83 |             table["caption"] = tag_caption.get_text().strip()
 84 | 
 85 |         # Get section hierarchy
 86 |         cur = html_table
 87 |         while True:
 88 |             section = cur.find_parent("section")
 89 |             if not section:
 90 |                 break
 91 |             section_name = section.next
 92 |             if section_name and section_name.name in cf.HTML_HEADERS:
 93 |                 if table.get("aspects") is None:
 94 |                     table["aspects"] = []
 95 |                 table["aspects"].append(section_name.get_text())
 96 |             cur = section
 97 | 
 98 |         if table.get("aspects") and len(table["aspects"]) > 1:
 99 |             table["aspects"] = table["aspects"][::-1]
100 | 
101 |         html_table = normalize_wikitables_css(soup, html_table)
102 |         table["html"] = str(html_table)
103 |         tables.append(table)
104 | 
105 |     return tables
106 | 
107 | 
108 | def add_css_wikitable(html_source):
109 |     """
110 |     Add css of wikitable to the html source
111 |     :param html_source:
112 |     :type html_source:
113 |     :return:
114 |     :rtype:
115 |     """
116 |     if isinstance(html_source, bytes) or isinstance(html_source, str):
117 |         html_source = bs4.BeautifulSoup(html_source, "html.parser")
118 |         tables = html_source.find_all("table", {"class": re.compile("wikitable*")})
119 |     else:
120 |         tables = [html_source]
121 |     for table in tables:
122 |         table.attrs["background-color"] = "#f8f9fa"
123 |         table.attrs["color"] = "#202122"
124 |         table.attrs["margin"] = "1em 0"
125 |         table.attrs["border"] = "1px solid #a2a9b1"
126 |         table.attrs["border-collapse"] = "collapse"
127 | 
128 |     return str(html_source)
129 | 
130 | 
131 | def pool_parse_html_source(line):
132 |     if (
133 |         not line
134 |         or not line.get("article_body")
135 |         or not line["article_body"].get("html")
136 |         or "wikitable" not in line["article_body"]["html"]
137 |     ):
138 |         return None
139 | 
140 |     if not line.get("main_entity") or not line["main_entity"].get("identifier"):
141 |         return None
142 | 
143 |     wikitables_html = extract_html_tables_from_html(line["article_body"]["html"])
144 |     if not wikitables_html:
145 |         return None
146 |     table_objs = []
147 |     for i, wikitable in enumerate(wikitables_html):
148 |         table_obj = {"index": i}
149 | 
150 |         def update_dict(attr, value):
151 |             if value:
152 |                 table_obj[attr] = value
153 | 
154 |         if line.get("main_entity") and line["main_entity"].get("identifier"):
155 |             update_dict("wikidata", line["main_entity"]["identifier"])
156 |             if not table_obj.get("wikidata"):
157 |                 continue
158 | 
159 |         update_dict("title", line.get("name"))
160 |         update_dict("url", line.get("url"))
161 |         update_dict("html", wikitable.get("html"))
162 |         update_dict("caption", wikitable.get("caption"))
163 |         update_dict("aspects", wikitable.get("aspects"))
164 | 
165 |         # table_obj["html"] = add_css_wikitable(table_obj["html"])
166 | 
167 |         table_objs.append(table_obj)
168 |     return table_objs
169 | 
170 | 
171 | def parse_wikitables(input_file=None):
172 |     dump_file = iw.read_line_from_file(input_file, mode="rb")
173 |     for line in dump_file:
174 |         try:
175 |             line_obj = ujson.loads(line)
176 |             parsed_objs = pool_parse_html_source(line_obj)
177 |             if parsed_objs:
178 |                 yield parsed_objs
179 |         except ValueError:
180 |             continue
181 | 
182 | 
183 | def dump_wikitables(
184 |     lang="ja", input_file=None, outfile=None, limit=0, step=1000, progress=True
185 | ):
186 |     if input_file is None:
187 |         input_file = f"{cf.DIR_DUMPS}/{lang}wiki-NS0-{cf.DUMPS_VERSION_WP_HTML}-ENTERPRISE-HTML.json.tar.gz"
188 |     if not os.path.exists(input_file):
189 |         return
190 | 
191 |     if not outfile:
192 |         if limit:
193 |             outfile = (
194 |                 f"{cf.DIR_MODELS}/wikitables_html_pubtabnet/{lang}_{limit}.jsonl.bz2"
195 |             )
196 |         else:
197 |             outfile = f"{cf.DIR_MODELS}/wikitables_html_pubtabnet/{lang}.jsonl.bz2"
198 |     if os.path.exists(outfile):
199 |         return outfile
200 | 
201 |     iw.create_dir(outfile)
202 | 
203 |     if outfile.endswith(".bz2"):
204 |         jsonFile = bz2.open(outfile, "wt")
205 |     else:
206 |         jsonFile = open(outfile, "w")
207 | 
208 |     parser = parse_wikitables(input_file)
209 |     n = 0
210 |     i = 0
211 | 
212 |     def update_desc(i):
213 |         return f"Parse Wikitable {lang}. Saved {n:,} tables / {i:,} pages"
214 | 
215 |     p_bar = None
216 |     if progress:
217 |         p_bar = tqdm(desc=update_desc(0))
218 | 
219 |     for parsed_objs in parser:
220 |         if limit and n >= limit:
221 |             break
222 |         if progress and i and i % step == 0:
223 |             p_bar.update(step)
224 |             p_bar.set_description(desc=update_desc(i))
225 |         for parsed_obj in parsed_objs:
226 |             n += 1
227 |             jsonString = ujson.dumps(parsed_obj)
228 |             jsonFile.write(jsonString)
229 |             jsonFile.write("\n")
230 |         i += 1
231 |     if progress:
232 |         p_bar.set_description(desc=update_desc(i))
233 |     jsonFile.close()
234 |     return outfile
235 | 
236 | 
237 | def func_modify_table_border(table_obj):
238 |     table_obj["html"] = table_obj["html"].replace("1px solid #a2a9b1", "1")
239 |     return table_obj
240 | 
241 | 
242 | def modify_json_dump(input_folder, func):
243 |     dump_files = iw.get_files_from_dir(input_folder, is_sort=True, reverse=True)
244 |     for dump_file in dump_files:
245 |         file_name = os.path.basename(dump_file).split(".")[0]
246 |         dir_output = dump_file + ".tmp"
247 | 
248 |         if dump_file.endswith(".bz2"):
249 |             output_file = bz2.open(dir_output, "wt")
250 |         else:
251 |             output_file = open(dir_output, "w")
252 | 
253 |         iter_obj = iw.read_json_file(dump_file)
254 |         for table_obj in tqdm(iter_obj, desc=file_name):
255 |             table_obj = func(table_obj)
256 |             output_file.write(ujson.dumps(table_obj))
257 |             output_file.write("\n")
258 | 
259 |         output_file.close()
260 |         iw.delete_file(dump_file)
261 |         os.rename(dir_output, dump_file)
262 | 
263 | 
264 | def read_wikitable_dumps(input_file: str, limit: int = 0):
265 |     for i, table_obj in enumerate(iw.read_json_file(input_file, limit)):
266 |         print(json.dumps(table_obj, indent=2, ensure_ascii=False))
267 | 
268 | 
269 | def get_jsonl_size(input_file: str):
270 |     size = 0
271 |     for _ in iw.read_json_file(input_file):
272 |         size += 1
273 |     return size
274 | 
275 | 
276 | def analyze_wikitables(input_folder: str = cf.DIR_MODELS, limit=0, step=1000):
277 |     """
278 |     Show stats of tables
279 |     """
280 |     dump_files = iw.get_files_from_dir(input_folder, is_sort=True, reverse=False)
281 |     stats = defaultdict()
282 | 
283 |     for dump_file in dump_files:
284 |         file_name = os.path.basename(dump_file).split(".")[0]
285 | 
286 |         n_tables, n_pages, n_caption, n_aspects = 0, 0, 0, 0
287 | 
288 |         def update_desc():
289 |             if n_tables and n_pages:
290 |                 return f"{file_name}. {n_pages:,} pages | {n_tables/n_pages:.2f} tab/page | {n_caption/n_tables*100:.2f}% cap/tab"
291 |             else:
292 |                 return ""
293 | 
294 |         pre_title = None
295 |         p_bar = tqdm(desc=update_desc())
296 |         try:
297 |             iter_obj = iw.read_json_file(dump_file)
298 |             for table_obj in iter_obj:
299 |                 n_tables += 1
300 |                 p_bar.update()
301 |                 if pre_title != table_obj["title"]:
302 |                     n_pages += 1
303 |                     pre_title = table_obj["title"]
304 |                     if n_pages % step == 0:
305 |                         p_bar.set_description(desc=update_desc())
306 | 
307 |                 if table_obj.get("caption"):
308 |                     n_caption += 1
309 | 
310 |                 if table_obj.get("aspects"):
311 |                     n_aspects += 1
312 |                 if limit and n_tables >= 1000:
313 |                     break
314 |             p_bar.set_description(desc=update_desc())
315 |             p_bar.close()
316 |         except EOFError:
317 |             stats[file_name] = [0, 0, 0, 0, 0, 0]
318 |             continue
319 | 
320 |         stats[file_name] = [
321 |             n_pages,
322 |             n_tables,
323 |             n_caption,
324 |             n_aspects,
325 |             n_caption / n_tables * 100 if n_tables else 0,
326 |             n_aspects / n_tables * 100 if n_tables else 0,
327 |         ]
328 | 
329 |     headers = [
330 |         "No",
331 |         "Dump",
332 |         "Pages",
333 |         "Tables",
334 |         "Captions",
335 |         "Aspects",
336 |         "Captions/Table",
337 |         "Aspects/Table",
338 |     ]
339 | 
340 |     iw.print_status("\t".join(headers))
341 |     for i, (file_name, stats_obj) in enumerate(stats.items()):
342 |         iw.print_status(
343 |             f"{i+1}\t{file_name}\t"
344 |             + "\t".join(f"{obj_i:,}" for obj_i in stats_obj[:4])
345 |             + "\t"
346 |             + "\t".join(f"{obj_i:.2f}" for obj_i in stats_obj[4:])
347 |         )
348 | 


--------------------------------------------------------------------------------
/core/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phucty/wtabhtml/f4fecc3dcaaa182cf3f2f5ac53be99c8a6172e92/core/utils/__init__.py


--------------------------------------------------------------------------------
/core/utils/io_worker.py:
--------------------------------------------------------------------------------
  1 | import bz2
  2 | import csv
  3 | import fnmatch
  4 | import gzip
  5 | import logging
  6 | import math
  7 | import os
  8 | import pickle
  9 | import shutil
 10 | import zlib
 11 | 
 12 | import numpy
 13 | import ujson
 14 | import tarfile
 15 | 
 16 | 
 17 | def read_tsv_file_first_col(file_name, encoding):
 18 |     with open(file_name, encoding=encoding) as f:
 19 |         first_col = [l[0].rstrip() for l in csv.reader(f, delimiter="\t")]
 20 |     return first_col
 21 | 
 22 | 
 23 | def read_line_from_file(file_name, mode="r"):
 24 |     if ".bz2" in file_name:
 25 |         reader = bz2.BZ2File(file_name, mode=mode)
 26 |     elif ".gz" in file_name:
 27 |         reader = gzip.open(file_name, mode=mode)
 28 |     else:
 29 |         reader = open(file_name, mode=mode)
 30 |     if reader:
 31 |         for line in reader:
 32 |             yield line
 33 | 
 34 | 
 35 | def print_status(message, is_screen=True, is_log=True) -> object:
 36 |     if isinstance(message, int):
 37 |         message = f"{message:,}"
 38 | 
 39 |     if is_screen:
 40 |         print(message)
 41 |     if is_log:
 42 |         logging.info(message)
 43 | 
 44 | 
 45 | def print_stats_dicts(
 46 |     message, print_obj, is_screen=True, is_log=True, delimiter="\t", header=None
 47 | ):
 48 |     print_status(message)
 49 |     if header:
 50 |         print_status("\t".join(header))
 51 |     for i, (k, v) in enumerate(print_obj.items()):
 52 |         if isinstance(v, list) or isinstance(v, tuple):
 53 |             v = f"{delimiter}".join([str(v_i) for v_i in v])
 54 |         print_status(f"{i + 1}{delimiter}{k}{delimiter}{v}", is_screen, is_log)
 55 | 
 56 | 
 57 | def delete_folder(folder_dir):
 58 |     if os.path.exists(folder_dir):
 59 |         shutil.rmtree(folder_dir, ignore_errors=False)
 60 |     return True
 61 | 
 62 | 
 63 | def delete_file(file_dir):
 64 |     if os.path.exists(file_dir):
 65 |         os.remove(file_dir)
 66 |     return True
 67 | 
 68 | 
 69 | def create_dir(file_dir):
 70 |     folder_dir = os.path.dirname(file_dir)
 71 |     if not os.path.exists(folder_dir):
 72 |         os.makedirs(folder_dir)
 73 | 
 74 | 
 75 | def save_obj_pkl(file_name, save_object, is_compress=False, is_message=True):
 76 |     create_dir(file_name)
 77 |     save_file = file_name
 78 |     if ".pkl" not in file_name:
 79 |         save_file = file_name + ".pkl"
 80 |     if is_compress and ".zlib" not in file_name:
 81 |         save_file += ".zlib"
 82 | 
 83 |     temp_file = save_file + ".temp"
 84 | 
 85 |     # Write temp
 86 |     with open(temp_file, "wb") as fp:
 87 |         if is_compress:
 88 |             save_data = zlib.compress(
 89 |                 pickle.dumps(save_object, pickle.HIGHEST_PROTOCOL)
 90 |             )
 91 |             fp.write(save_data)
 92 |         else:
 93 |             pickle.dump(save_object, fp, pickle.HIGHEST_PROTOCOL)
 94 | 
 95 |     try:
 96 |         if os.path.exists(save_file):
 97 |             os.remove(save_file)
 98 |     except Exception as message:
 99 |         print_status(message)
100 | 
101 |     os.rename(temp_file, save_file)
102 |     if is_message:
103 |         print_status("Saved: - %d - %s" % (len(save_object), save_file), is_log=False)
104 |     return save_file
105 | 
106 | 
107 | def load_obj_pkl(file_name, is_message=False):
108 |     load_obj = None
109 |     if not os.path.exists(file_name) and ".pkl" not in file_name:
110 |         file_name = file_name + ".pkl"
111 | 
112 |     if not os.path.exists(file_name) and ".zlib" not in file_name:
113 |         file_name = file_name + ".zlib"
114 |     with open(file_name, "rb") as fp:
115 |         if ".zlib" in file_name:
116 |             load_obj = pickle.loads(zlib.decompress(fp.read()))
117 |         else:
118 |             load_obj = pickle.load(fp)
119 |     if is_message and load_obj:
120 |         print_status("%d loaded items - %s" % (len(load_obj), file_name))
121 |     return load_obj
122 | 
123 | 
124 | def get_size_of_file(num, suffix="B"):
125 |     """Get human friendly file size
126 |     https://gist.github.com/cbwar/d2dfbc19b140bd599daccbe0fe925597#gistcomment-2845059
127 | 
128 |     Args:
129 |         num (int): Bytes value
130 |         suffix (str, optional): Unit. Defaults to 'B'.
131 | 
132 |     Returns:
133 |         str: file size0
134 |     """
135 |     if num == 0:
136 |         return "0"
137 |     magnitude = int(math.floor(math.log(num, 1024)))
138 |     val = num / math.pow(1024, magnitude)
139 |     if magnitude > 7:
140 |         return "{:3.1f}{}{}".format(val, "Yi", suffix)
141 |     return "{:3.1f}{}{}".format(
142 |         val, ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"][magnitude], suffix
143 |     )
144 | 
145 | 
146 | def load_object_csv(file_name, encoding="utf-8", retries=2):
147 |     content = []
148 |     if os.path.exists(file_name):
149 |         with open(file_name, "r", encoding=encoding) as f:
150 |             reader = csv.reader(f, delimiter=",")
151 |             for r in reader:
152 |                 content.append(r)
153 |     return content
154 | 
155 | 
156 | def save_object_csv(file_name, rows):
157 |     create_dir(file_name)
158 |     temp_file = "%s.temp" % file_name
159 |     with open(temp_file, "w") as f:
160 |         try:
161 |             writer = csv.writer(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_ALL)
162 |             for r in rows:
163 |                 if (
164 |                     isinstance(r, list)
165 |                     or isinstance(r, tuple)
166 |                     or isinstance(r, numpy.ndarray)
167 |                 ):
168 |                     writer.writerow(r)
169 |                 else:
170 |                     writer.writerow([r])
171 |         except Exception as message:
172 |             print(message)
173 |     if os.path.exists(file_name):
174 |         os.remove(file_name)
175 |     os.rename(temp_file, file_name)
176 | 
177 | 
178 | def read_json_file(input_file: str, limit: int = 0):
179 |     if input_file.endswith(".bz2"):
180 |         jsonFile = bz2.BZ2File(input_file)
181 |     else:
182 |         jsonFile = open(input_file, "r")
183 |     i = 0
184 |     limit = int(limit)
185 |     while True:
186 |         line = jsonFile.readline()
187 |         if not line:
188 |             break
189 |         i += 1
190 |         if limit and i > limit:
191 |             break
192 |         table_obj = ujson.loads(line)
193 |         yield table_obj
194 |     jsonFile.close()
195 | 
196 | 
197 | def get_files_from_dir_stream(folder_path, extension="*"):
198 |     for root, _, file_dirs in os.walk(folder_path):
199 |         for file_dir in fnmatch.filter(file_dirs, "*.%s" % extension):
200 |             if ".DS_Store" not in file_dir:
201 |                 yield os.path.join(root, file_dir)
202 | 
203 | 
204 | def get_files_from_dir_subdir(folder_path, extension="*"):
205 |     all_files = []
206 |     for root, _, file_dirs in os.walk(folder_path):
207 |         for file_dir in fnmatch.filter(file_dirs, "*.%s" % extension):
208 |             if ".DS_Store" not in file_dir:
209 |                 all_files.append(os.path.join(root, file_dir))
210 |     return all_files
211 | 
212 | 
213 | def get_files_from_dir(
214 |     folder_path, extension="*", limit_reader=-1, is_sort=False, reverse=False
215 | ):
216 |     all_file_dirs = get_files_from_dir_subdir(folder_path, extension)
217 | 
218 |     if is_sort:
219 |         file_with_size = [(f, os.path.getsize(f)) for f in all_file_dirs]
220 |         file_with_size.sort(key=lambda f: f[1], reverse=reverse)
221 |         all_file_dirs = [f for f, _ in file_with_size]
222 |     if limit_reader < 0:
223 | 
224 |         limit_reader = len(all_file_dirs)
225 |     return all_file_dirs[:limit_reader]
226 | 
227 | 
228 | def compress_folder(input_folder, output_file, delete_org=False):
229 |     if not os.path.exists(input_folder):
230 |         return
231 |     # input_files = get_files_from_dir(input_folder)
232 | 
233 |     with tarfile.open(output_file, "w:bz2") as tar_handle:
234 |         for root, dirs, files in os.walk(input_folder):
235 |             for file in files:
236 |                 tar_handle.add(
237 |                     os.path.join(root, file),
238 |                     arcname=os.path.join(root, file).replace(input_folder, ""),
239 |                 )
240 |         # for file in input_files:
241 |         #     tar.add(file, arcname=os.path.basename(file), recursive=False)
242 |     if delete_org:
243 |         delete_folder(input_folder)
244 | 
245 | 
246 | def merge_jsonl_files(input_folder):
247 |     if not os.path.exists(input_folder):
248 |         return
249 |     output_file = input_folder + ".jsonl.bz2"
250 | 
251 |     file_writer = bz2.open(output_file, "wt")
252 |     n = 0
253 |     input_files = get_files_from_dir(input_folder)
254 |     for input_file in input_files:
255 |         for line in read_json_file(input_file):
256 |             file_writer.write(ujson.dumps(line))
257 |             file_writer.write("\n")
258 |             n += 1
259 | 
260 |     file_writer.close()
261 |     delete_folder(input_folder)
262 |     return n
263 | 


--------------------------------------------------------------------------------
/core/wikitable_to_image.py:
--------------------------------------------------------------------------------
  1 | import bz2
  2 | import json
  3 | import multiprocessing
  4 | import os
  5 | import random
  6 | import re
  7 | import time
  8 | from io import BytesIO
  9 | 
 10 | import cv2
 11 | import numpy as np
 12 | import ujson
 13 | from PIL import Image
 14 | from bs4 import BeautifulSoup
 15 | from selenium import webdriver
 16 | from selenium.webdriver.common.by import By
 17 | from selenium.webdriver.firefox.options import Options
 18 | from selenium.webdriver.support import expected_conditions as EC
 19 | from selenium.webdriver.support.ui import WebDriverWait
 20 | 
 21 | from config import config as cf
 22 | from core.parse_wikitable_html import get_jsonl_size
 23 | from core.utils import io_worker as iw
 24 | from core.utils.io_worker import merge_jsonl_files
 25 | 
 26 | 
 27 | def html_to_img(driver, html_content, id_count):
 28 |     """converts html to image and bounding boxes of each cell"""
 29 |     counter = 1  # This counter is to keep track of the exceptions and stop execution after 10 exceptions have occurred
 30 |     add_border = 2
 31 |     while True:
 32 |         try:
 33 |             driver.get("data:text/html;charset=utf-8," + html_content)
 34 | 
 35 |             el = driver.find_element_by_tag_name("table")
 36 |             png = el.screenshot_as_png
 37 | 
 38 |             im = Image.open(BytesIO(png))
 39 | 
 40 |             table_loc = el.location
 41 | 
 42 |             bboxes = []
 43 |             for id in range(id_count):
 44 |                 # print(id)
 45 |                 e = WebDriverWait(driver, 3).until(
 46 |                     EC.presence_of_element_located((By.ID, str(id)))
 47 |                 )
 48 |                 txt = e.text.strip()
 49 |                 lentext = len(txt)
 50 |                 loc = e.location
 51 |                 size_ = e.size
 52 |                 xmin = loc["x"] - table_loc["x"] - add_border
 53 |                 ymin = loc["y"] - table_loc["y"] - add_border
 54 |                 xmax = int(size_["width"] + xmin) + add_border * 2
 55 |                 ymax = int(size_["height"] + ymin) + add_border * 2
 56 |                 bboxes.append([lentext, txt, xmin, ymin, xmax, ymax])
 57 | 
 58 |             return im, bboxes
 59 |         except Exception as e:
 60 |             counter += 1
 61 |             return None, None
 62 |             # if counter==10:
 63 |             #     return im,None
 64 | 
 65 |             # continue
 66 | 
 67 | 
 68 | def html_string2list(html_string):
 69 |     """this function convert string into list of char and html tag"""
 70 |     list_ = []
 71 |     idx_tag = -1
 72 |     for i, char in enumerate(html_string):
 73 |         if char == "<":
 74 |             idx_tag = i
 75 |         elif idx_tag != -1 and char == ">":
 76 |             html_tag = html_string[idx_tag : i + 1]
 77 | 
 78 |             # ignore comment inside cell content
 79 |             if html_tag.startswith("<!--") or html_tag.startswith("<!["):
 80 |                 idx_tag = -1
 81 |                 continue
 82 | 
 83 |             list_.append(html_tag)
 84 |             idx_tag = -1
 85 |         elif idx_tag == -1:
 86 |             list_.append(char)
 87 | 
 88 |     return list_
 89 | 
 90 | 
 91 | def create_style(border_cat):
 92 |     """This function will dynamically create stylesheet of tables"""
 93 | 
 94 |     style = "<head><style>"
 95 |     style += "html{background-color: white;}table{"
 96 | 
 97 |     # random center align
 98 |     if random.randint(0, 1) == 1:
 99 |         style += "text-align:center;"
100 | 
101 |     style += """border-collapse:collapse;}td,th{padding:6px;padding-left: 6px;padding-right: 6px;"""
102 | 
103 |     if border_cat == 0:
104 |         style += """ border:1px solid black;} """
105 |     elif border_cat == 2:
106 |         style += """border-bottom:1px solid black;}"""
107 |     elif border_cat == 3:
108 |         style += """border-left: 1px solid black;}
109 |                    th{border-bottom: 1px solid black;} table tr td:first-child, 
110 |                    table tr th:first-child {border-left: 0;}"""
111 |     else:
112 |         style += """}"""
113 | 
114 |     style += "</style></head>"
115 |     return style
116 | 
117 | 
118 | def draw_matrices(img, bboxes, output_file_name):
119 |     """ This function draws visualizations of cell bounding boxes on a table image """
120 |     bboxes = bboxes[:, 2:]
121 | 
122 |     img = img.astype(np.uint8)
123 |     img = np.dstack((img, img, img))
124 | 
125 |     im = img.copy()
126 |     pad_ = 3
127 |     for box in bboxes:
128 |         cv2.rectangle(
129 |             im,
130 |             (int(box[0]) + pad_, int(box[1]) + pad_),
131 |             (int(box[2]) - pad_, int(box[3]) - pad_),
132 |             (0, 0, 255),
133 |             1,
134 |         )
135 |     img_name = os.path.join("bboxes/", output_file_name)
136 |     cv2.imwrite(img_name, im)
137 | 
138 | 
139 | def check_int_span(s):
140 |     # check span col/row is between 2~20
141 |     if not s.isdigit():
142 |         return False
143 |     if int(s) < 2 or int(s) > 20:
144 |         return False
145 |     return True
146 | 
147 | 
148 | def convert_html_to_pubtabnet(table):
149 |     soup = BeautifulSoup(table, "html.parser")
150 | 
151 |     tables = soup.find_all("table", {"class": re.compile("wikitable*")})
152 |     for table in tables:
153 |         has_header = False
154 |         end_header = False
155 |         thead = soup.new_tag("thead")
156 |         for i1, tag_1 in enumerate(table):
157 |             if tag_1.name != "tbody":
158 |                 continue
159 |             # tbody ta
160 |             for i2, tag2 in enumerate(tag_1):
161 |                 if tag2.name != "tr":
162 |                     continue
163 |                 if not end_header and all(
164 |                     (col.name in ["th", None] and col.name not in ["td"])
165 |                     for col in tag2
166 |                 ):
167 |                     tag2.extract()
168 |                     thead.append(tag2)
169 |                     has_header = True
170 |                 else:
171 |                     end_header = True
172 |         if has_header:
173 |             table.insert(0, thead)
174 |         table.attrs["border"] = 1
175 | 
176 |         def filter_attr(bs_obj, white_tags):
177 |             bs_obj.attrs = {
178 |                 attr: v for attr, v in bs_obj.attrs.items() if attr in white_tags
179 |             }
180 | 
181 |         filter_attr(table, ["border", "cellpadding", "style"])
182 |         for a in table.findAll(True):
183 |             filter_attr(a, ["colspan", "headers", "rowspan", "cellpadding", "style"])
184 | 
185 |         for tag in ["a", "span", "link", "img", "div"]:
186 |             for a in table.findAll(tag):
187 |                 a.unwrap()
188 | 
189 |         for tag in ["sup"]:
190 |             for a in table.findAll(tag):
191 |                 a.extract()
192 |         return str(table)
193 | 
194 | 
195 | def transform_html_id_text(html_input):
196 |     """
197 |     change all <th> tag inside <thead> to <td> tag
198 |     add <span id=''> to content of <td> tag (generate location of cell content)
199 |     """
200 | 
201 |     # html_input = convert_html_to_pubtabnet(html_input)
202 |     # html_input = html_input.replace("1px solid #a2a9b1", "1")
203 |     html = """<html>"""
204 |     html += create_style(1)
205 |     html += """<body>"""
206 |     html += html_input
207 |     html += """</body></html>"""
208 | 
209 |     html = html.replace("> </td>", "></td>")
210 |     html = html.replace("> </th>", "></th>")
211 |     html = html.replace("\n", "")
212 |     # html = html.replace('border="1"', '')
213 | 
214 |     # print(html)
215 | 
216 |     idx_count = 0
217 | 
218 |     table_ = BeautifulSoup(html, "lxml")
219 | 
220 |     struc_tokens = []
221 |     list_cell_contents = []
222 | 
223 |     # ############ Remove caption ##############
224 |     # will be changed to remain the table title in a table image
225 |     caption_ = table_.find_all("caption")
226 |     if len(caption_) > 0:
227 |         for cap_ in caption_:
228 |             cap_.string = ""
229 | 
230 |     # #################get thead and tbody#################
231 |     thead = table_.find_all("thead")
232 |     if len(thead) != 1:
233 |         return None, None, None, idx_count
234 | 
235 |     tbody = table_.find_all("tbody")
236 |     if len(tbody) != 1:
237 |         return None, None, None, idx_count
238 | 
239 |     thead_tbody = thead + tbody
240 |     for tag_ in thead_tbody:
241 |         if tag_.name == "thead":
242 |             struc_tokens.append("<thead>")
243 |         else:
244 |             struc_tokens.append("<tbody>")
245 | 
246 |         #  get tr and td
247 |         for tr in tag_.find_all("tr"):
248 |             if len(tr.find_all("td") + tr.find_all("th")) == 0:
249 |                 continue
250 |             struc_tokens.append("<tr>")
251 | 
252 |             for td in tr.find_all("td") + tr.find_all("th"):
253 |                 if td.find_all("table"):
254 |                     # if there is a table inside the cell, then ignore this pattern
255 |                     return None, None, None, idx_count
256 |                 if len(td.contents) == 0:
257 |                     list_cell_contents.append([])
258 |                     struc_tokens.append("<td>")
259 |                     struc_tokens.append("</td>")
260 |                     continue
261 |                 if td.text.strip() == "":
262 |                     list_cell_contents.append([])
263 |                     struc_tokens.append("<td>")
264 |                     struc_tokens.append("</td>")
265 |                     continue
266 | 
267 |                 # print(''.join(str(el) for el in td.contents))
268 |                 # print(html_string2list(''.join(str(el) for el in td.contents)))
269 | 
270 |                 # store the content of this cell
271 |                 list_cell_contents.append(
272 |                     html_string2list("".join(str(el) for el in td.contents))
273 |                 )
274 |                 # add <span id=''> to content of <td> tag to generate location of cell content
275 |                 td.string = (
276 |                     "<span id="
277 |                     + str(idx_count)
278 |                     + ">"
279 |                     + "".join(str(el) for el in td.contents)
280 |                     + "</span>"
281 |                 )
282 |                 idx_count = idx_count + 1
283 | 
284 |                 if (not td.has_attr("colspan")) and (not td.has_attr("rowspan")):
285 |                     struc_tokens.append("<td>")
286 |                     struc_tokens.append("</td>")
287 |                 else:
288 |                     struc_tokens.append("<td")
289 |                     if td.has_attr("colspan"):
290 |                         if not check_int_span(td["colspan"]):
291 |                             return None, None, None, idx_count
292 | 
293 |                         struc_tokens.append(' colspan="' + td["colspan"] + '"')
294 |                     if td.has_attr("rowspan"):
295 |                         if not check_int_span(td["rowspan"]):
296 |                             return None, None, None, idx_count
297 | 
298 |                         struc_tokens.append(' rowspan="' + td["rowspan"] + '"')
299 | 
300 |                     struc_tokens.append(">")
301 |                     struc_tokens.append("</td>")
302 | 
303 |             struc_tokens.append("</tr>")
304 | 
305 |         if tag_.name == "thead":
306 |             struc_tokens.append("</thead>")
307 |         else:
308 |             struc_tokens.append("</tbody>")
309 | 
310 |     return struc_tokens, table_.prettify(formatter=None), list_cell_contents, idx_count
311 | 
312 | 
313 | def get_chunks(start_id, end_id, n_chunks=1):
314 |     if end_id < 0:
315 |         raise ValueError()
316 | 
317 |     counts = end_id - start_id + 1
318 |     nums_per_chunk = counts // n_chunks
319 |     chunks = []
320 |     for n in range(n_chunks):
321 |         if n == n_chunks - 1:
322 |             chunks.append([n * nums_per_chunk, end_id])
323 |         else:
324 |             chunks.append([n * nums_per_chunk, (n + 1) * nums_per_chunk])
325 |     return chunks
326 | 
327 | 
328 | def parse_wiki_tables_mp(input_file, error_file, save_dir, split_name, chunks):
329 |     """
330 |         multiprocessing to parse raw data.
331 |         One process to do one chunk parsing.
332 |         :param table_chunks:
333 |         :return:
334 |         """
335 |     # self.read_json_tables(table_chunks[1], 1)
336 | 
337 |     p = multiprocessing.Pool(len(chunks))
338 |     for chunk in chunks:
339 |         args = (input_file, error_file, save_dir, split_name, chunk)
340 |         p.apply_async(read_json_tables, args)
341 |     p.close()
342 |     p.join()
343 | 
344 | 
345 | def read_json_tables(input_file, error_file, save_dir, split_name, chunk):
346 |     """
347 |         Read Japanese Wikipedia tables from one table chunk.
348 |         :param: this_chunk
349 |         :param: chunks_idx
350 |         """
351 | 
352 |     opts = Options()
353 |     opts.add_argument("--headless")
354 | 
355 |     driver = webdriver.Firefox(options=opts)
356 | 
357 |     if input_file.endswith(".bz2"):
358 |         input_file_reader = bz2.BZ2File(input_file)
359 |     else:
360 |         input_file_reader = open(input_file, "r")
361 | 
362 |     start_id, end_id = chunk
363 | 
364 |     error_file = f"{error_file}{start_id}_{end_id}.jsonl.bz2"
365 | 
366 |     if error_file.endswith(".bz2"):
367 |         errors_file_writer = bz2.open(error_file, "wt")
368 |     else:
369 |         errors_file_writer = open(error_file, "w")
370 | 
371 |     i = 0
372 |     while True:
373 |         line = input_file_reader.readline()
374 |         if not line:
375 |             break
376 |         i += 1
377 |         if i < chunk[0] or i >= chunk[1]:
378 |             continue
379 | 
380 |         table_obj = ujson.loads(line)
381 |         # iw.print_status(
382 |         #     "[%d, %d] | Table:%d | WD:%s | Index:%d | URL:%s"
383 |         #     % (
384 |         #         chunk[0], chunk[1],
385 |         #         i,
386 |         #         str(table_obj.get("wikidata")),
387 |         #         table_obj["index"],
388 |         #         str(table_obj.get("url")),
389 |         #     ),
390 |         #     is_screen=False,
391 |         # )
392 | 
393 |         (
394 |             struc_tokens,
395 |             html_with_id,
396 |             list_cell_contents,
397 |             idx_count,
398 |         ) = transform_html_id_text(table_obj["html"])
399 | 
400 |         if struc_tokens is None:
401 |             # save error patterns to folder
402 |             errors_file_writer.write(ujson.dumps(table_obj))
403 |             errors_file_writer.write("\n")
404 |             continue
405 | 
406 |         im, bboxes = html_to_img(driver, html_with_id, idx_count)
407 |         if bboxes is None:
408 |             # save error patterns to folder
409 |             errors_file_writer.write(ujson.dumps(table_obj))
410 |             errors_file_writer.write("\n")
411 |             continue
412 |         # Save photo
413 |         dir_sample = f"{save_dir}/{i}"
414 |         im.save(f"{dir_sample}.png", dpi=(600, 600))
415 | 
416 |         # Save ground truth json
417 |         cells = []
418 |         idx_ = 0
419 |         for cell_token_ in list_cell_contents:
420 |             if len(cell_token_) == 0:
421 |                 cell_ = {"tokens": cell_token_}
422 |             else:
423 |                 cell_ = {"tokens": cell_token_, "bbox": bboxes[idx_][2:]}
424 |                 idx_ += 1
425 | 
426 |             cells.append(cell_)
427 | 
428 |         html_json = {"structure": {"tokens": struc_tokens}, "cells": cells}
429 | 
430 |         # save to folder
431 |         table_sample = {
432 |             "filename": str(i) + ".png",
433 |             "split": split_name,
434 |             "imgid": i,
435 |             "html": html_json,
436 |         }
437 | 
438 |         with open(f"{dir_sample}.json", "w") as s_json:
439 |             json.dump(table_sample, s_json)
440 | 
441 |         # # ##########debug
442 |         # with open('bboxes/' + str(i) + '.txt', 'w') as f:
443 |         #     f.write(html_with_id)
444 |         #     f.write(str(table_sample))
445 |         #
446 |         # img = np.asarray(im, np.int64)[:, :, 0]
447 |         # draw_matrices(img, np.array(bboxes), str(i) + '.jpg')
448 |         # # #########################
449 | 
450 |     driver.quit()
451 | 
452 | 
453 | def gen_images(
454 |     wikipedia_version=cf.DUMPS_VERSION_WP_HTML,
455 |     lang="ja",
456 |     start_id=None,
457 |     end_id=None,
458 |     n_threads=8,
459 |     split_name="train",
460 |     compress=False,
461 |     delete_org=False,
462 | ):
463 |     start = time.time()
464 |     input_file = f"{cf.DIR_MODELS}/wikitables_html_pubtabnet/{lang}.jsonl.bz2"
465 |     if not os.path.exists(input_file):
466 |         iw.print_status(f"Missing jsonl file: {input_file}")
467 |         from cli.pipeline import run_dump
468 | 
469 |         run_dump(
470 |             wikipedia_version=wikipedia_version, language=lang, n_threads=n_threads,
471 |         )
472 |         if not os.path.exists(input_file):
473 | 
474 |             return 0, 0, 0
475 |     save_root = f"{cf.DIR_MODELS}/wikitables_images/"
476 |     save_lang = f"{save_root}{lang}/"
477 |     save_split_name = f"{save_lang}{split_name}/"
478 |     save_errors = f"{save_lang}errors/"
479 |     iw.create_dir(save_root)
480 |     iw.create_dir(save_lang)
481 |     iw.create_dir(save_split_name)
482 |     iw.create_dir(save_errors)
483 | 
484 |     if start_id is None and end_id is None:
485 |         end_id = get_jsonl_size(input_file)
486 |         start_id = 0
487 | 
488 |     chunks = get_chunks(start_id=start_id, end_id=end_id, n_chunks=n_threads)
489 | 
490 |     parse_wiki_tables_mp(
491 |         input_file=input_file,
492 |         error_file=save_errors,
493 |         save_dir=save_split_name,
494 |         split_name=split_name,
495 |         chunks=chunks,
496 |     )
497 | 
498 |     n_errors = merge_jsonl_files(save_errors[:-1])
499 |     n_images = len(os.listdir(save_split_name)) // 2
500 | 
501 |     if compress:
502 |         output_file_compressed = f"{save_root}{lang}.tar.bz2"
503 |         iw.compress_folder(
504 |             input_folder=save_lang,
505 |             output_file=output_file_compressed,
506 |             delete_org=delete_org,
507 |         )
508 |         iw.print_status(f"Output file: {output_file_compressed}")
509 |     else:
510 |         iw.print_status(f"Output dataset folder: {save_lang}")
511 |     return n_errors, n_images, time.time() - start
512 | 


--------------------------------------------------------------------------------
/data/dump/crwiki-NS0-20220301-ENTERPRISE-HTML.json.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phucty/wtabhtml/f4fecc3dcaaa182cf3f2f5ac53be99c8a6172e92/data/dump/crwiki-NS0-20220301-ENTERPRISE-HTML.json.tar.gz


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # Automatically generated by https://github.com/damnever/pigar.
 2 | 
 3 | # wtabhtml/core/parse_wikitable_html.py: 5
 4 | bs4 == 0.0.1
 5 | 
 6 | # wtabhtml/cli/__init__.py: 1
 7 | # wtabhtml/cli/downloader.py: 1
 8 | # wtabhtml/cli/parser.py: 1
 9 | # wtabhtml/cli/reader.py: 1
10 | click == 8.1.3
11 | 
12 | # wtabhtml/core/utils/io_worker.py: 11
13 | numpy == 1.22.4
14 | 
15 | # wtabhtml/core/downloader.py: 6
16 | requests == 2.27.1
17 | 
18 | # wtabhtml/core/downloader.py: 1
19 | # wtabhtml/core/parse_wikitable_html.py: 7
20 | tqdm == 4.64.0
21 | 
22 | # wtabhtml/core/parse_wikitable_html.py: 6
23 | # wtabhtml/core/utils/io_worker.py: 12
24 | ujson == 5.3.0
25 | 
26 | zlib~=1.2.12
27 | opencv-python~=4.6.0.66
28 | selenium~=4.2.0
29 | beautifulsoup4~=4.11.1
30 | 


--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
 1 | from cli import pipeline
 2 | from config import config as cf
 3 | from core import downloader
 4 | from core import parse_wikitable_html
 5 | from core.parse_wikitable_html import (
 6 |     analyze_wikitables,
 7 |     modify_json_dump,
 8 |     func_modify_table_border,
 9 | )
10 | from core.utils.io_worker import merge_jsonl_files
11 | from core.wikitable_to_image import gen_images
12 | 
13 | 
14 | def run_pipeline_dumps_to_jsonl_bz2(wikipedia_version, langs):
15 |     for lang in langs:
16 |         # 1. Download Wikipedia HTML dump
17 |         downloaded_file = downloader.download_wikipedia_html_dump(
18 |             wikipedia_version, lang
19 |         )
20 |         # 2. Parse HTML dump
21 |         dump_file = parse_wikitable_html.dump_wikitables(
22 |             lang=lang, input_file=downloaded_file
23 |         )
24 |         # 3. Read the first three tables
25 |         # parse_wikitable_html.read_wikitable_dumps(dump_file, limit=0)
26 |     return
27 | 
28 | 
29 | if __name__ == "__main__":
30 |     # pipeline.run_dump(cf.DUMPS_VERSION_WP_HTML, "cr", 1)
31 |     # run_pipeline_dumps_to_jsonl_bz2(cf.DUMPS_VERSION_WP_HTML, cf.LANGS)
32 |     # analyze_wikitables(cf.DIR_MODELS + "1")
33 |     gen_images(lang="cr", n_threads=5)
34 |     # convert_dumps_to_PubTabnet(cf.DIR_MODELS)
35 |     # modify_json_dump(
36 |     #     "/Users/phucnguyen/git/wtabhtml/data/models/wikitables_html",
37 |     #     func=func_modify_table_border,
38 |     # )
39 |     # merge_jsonl_files(
40 |     #     "/Users/phucnguyen/git/wtabhtml/data/models/wikitables_images/cr/errors"
41 |     # )
42 | 


--------------------------------------------------------------------------------
/wtabhtml.py:
--------------------------------------------------------------------------------
1 | import cli
2 | 
3 | if __name__ == "__main__":
4 |     cli.cli_wikitables()
5 | 


--------------------------------------------------------------------------------