├── .gitignore ├── LICENSE ├── README.md ├── alphabet ├── fintabnet │ ├── character_alphabet.txt │ └── structure_alphabet.txt └── pubtabnet │ ├── character_alphabet.txt │ └── structure_alphabet.txt ├── configs ├── fintabnet.py ├── pubtab250.py ├── pubtabfin.py └── pubtabnet.py ├── datasets ├── FinTabNet.yaml ├── FinTabSub.yaml ├── PubTab250.yaml ├── PubTabNet.yaml └── PubTabSub.yaml ├── mutab ├── __init__.py ├── apis │ ├── __init__.py │ ├── test.py │ └── train.py ├── datasets │ ├── __init__.py │ ├── dataset.py │ ├── loader.py │ └── pipeline.py ├── metrics │ ├── __init__.py │ ├── metric.py │ ├── sample_pred.json │ ├── sample_real.json │ └── sample_test.json ├── models │ ├── __init__.py │ ├── backbone.py │ ├── decoder.py │ ├── encoder.py │ ├── factory.py │ ├── handler.py │ ├── loss.py │ ├── revisor.py │ └── scanner.py ├── optimizer │ ├── __init__.py │ └── factory.py └── utils │ ├── __init__.py │ └── utils.py ├── pyproject.toml ├── test.py ├── train.py └── train.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | __pycache__/ 3 | *.egg-info/ 4 | *.egg 5 | *.pth 6 | *.xz 7 | build/ 8 | !.gitignore 9 | !.github/ 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 JG1VPP 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of 
the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MuTabNet 2 | 3 | End-to-End table OCR model using a hierarchical Transformer that outputs HTML tags and cell contents. 4 | 5 | ## Usage 6 | 7 | ### Install 8 | 9 | ```sh 10 | pip install -e . 11 | ``` 12 | 13 | ### Datasets 14 | 15 | Download the following datasets: 16 | 17 | - [FinTabNet](https://developer.ibm.com/data/fintabnet) 18 | - [PubTabNet](https://developer.ibm.com/exchanges/data/all/pubtabnet) 19 | - [ICDAR Task-B Test Data](https://github.com/ajjimeno/icdar-task-b) 20 | 21 | ### Preprocess 22 | 23 | Follow [MTL-TabNet instructions](https://github.com/namtuanly/MTL-TabNet#data-preprocess). 
24 | The datasets must be placed in `data` directory as follows: 25 | 26 | ```sh 27 | $ ls ~/data 28 | fintabnet/ 29 | img_tables/ 30 | train/ 31 | 100000_61623.png 32 | 100001_61624.png 33 | 100002_61625.png 34 | 100003_61626.png 35 | 100004_61627.png 36 | val/ 37 | ground_truth_fintabnet.json 38 | ground_truth_pubtabnet.json 39 | icdar-task-b/ 40 | final_eval/ 41 | 000221630ba33f9118f2671a715d6962e08d6b76a5a0c77a9fe26c291df763b0.png 42 | 0005e8fe1b3ba14982336837219f285921af7c152cfc81ac88bcf52809299279.png 43 | 002b1bf2bbb7dd7ec6201174e68df6346f448cd3951e861c3f940711c769f25f.png 44 | 002bfeebe20be2e97fab46b99ce68321afb8972f6d8f131f0c1f5392819d3a23.png 45 | 002c7215e95cd4bfebffb13dc0db32ab229a6674f4f1add84518ae52b75ac0da.png 46 | final_eval.json 47 | mmocr_fintabnet/ 48 | train/ 49 | 100000_61623.txt 50 | 100001_61624.txt 51 | 100002_61625.txt 52 | 100003_61626.txt 53 | 100004_61627.txt 54 | val/ 55 | mmocr_pubtabnet/ 56 | train/ 57 | PMC1064074_007_00.txt 58 | PMC1064076_003_00.txt 59 | PMC1064076_004_00.txt 60 | PMC1064080_002_00.txt 61 | PMC1064094_007_00.txt 62 | val/ 63 | pubtabnet/ 64 | PubTabNet_2.0.0.jsonl 65 | train/ 66 | PMC1064074_007_00.png 67 | PMC1064076_003_00.png 68 | PMC1064076_004_00.png 69 | PMC1064080_002_00.png 70 | PMC1064094_007_00.png 71 | val/ 72 | ``` 73 | 74 | ### Training 75 | 76 | Run the following command to start training using four GPUs: 77 | 78 | ```sh 79 | name=pubtab250 80 | save=~/work/$name 81 | 82 | CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 ./train.sh ./configs/$name.py $save 4 83 | ``` 84 | 85 | ### Evaluation 86 | 87 | Run the following command to evaluate the model and calculate TEDS score: 88 | 89 | ```sh 90 | path=~/data/icdar-task-b/final_eval 91 | json=~/data/icdar-task-b/final_eval.json 92 | 93 | python test.py --conf ./configs/$name.py --ckpt $save/latest.pth --path $path --json $json 94 | ``` 95 | 96 | For FinTabNet, we use validation set including 10,656 tables as test set in imitation of the previous work. 
97 | 98 | ## Requirements 99 | 100 | We recommend that you use at least four V100 32GB GPUs or two A100 80GB GPU. 101 | 102 | ## License 103 | 104 | This project is licensed under the MIT License. 105 | See LICENSE for more details. 106 | 107 | ## Citation 108 | 109 | ```latex 110 | @inproceedings{ICDAR24KAT, 111 | author={Takaya Kawakatsu}, 112 | title={Multi-Cell Decoder and Mutual Learning for Table Structure and Character Recognition}, 113 | booktitle={Document Analysis and Recognition - ICDAR 2024}, 114 | publisher={Springer Nature Switzerland}, 115 | year={2024}, 116 | pages={389--405}, 117 | } 118 | ``` 119 | -------------------------------------------------------------------------------- /alphabet/fintabnet/character_alphabet.txt: -------------------------------------------------------------------------------- 1 | L 2 | o 3 | c 4 | a 5 | t 6 | i 7 | n 8 | 9 | C 10 | u 11 | r 12 | y 13 | S 14 | e 15 | q 16 | F 17 | ( 18 | h 19 | s 20 | d 21 | ) 22 | 1 23 | , 24 | 0 25 | 9 26 | 8 27 | 5 28 | M 29 | 6 30 | P 31 | R 32 | 3 33 | x 34 | 7 35 | 2 36 | l 37 | f 38 | 4 39 | I 40 | T 41 | D 42 | m 43 | p 44 | b 45 | A 46 | z 47 | w 48 | / 49 | . 50 | $ 51 | & 52 | H 53 | E 54 | Y 55 | 56 | 57 | O 58 | g 59 | : 60 | N 61 | v 62 | - 63 | — 64 | G 65 | % 66 | B 67 | W 68 | k 69 | ’ 70 | U 71 | V 72 | j 73 | J 74 | K 75 | – 76 | Q 77 | * 78 | ' 79 | + 80 | X 81 | ; 82 | “ 83 | ” 84 | Z 85 | ≥ 86 | < 87 | = 88 | # 89 | 90 | 91 | 92 | 93 | " 94 | ¢ 95 | ó 96 | [ 97 | ] 98 | í 99 | á 100 | ¨ 101 | ™ 102 | ¥ 103 | ` 104 | ö 105 | ü 106 | † 107 | é 108 | ¸ 109 | ý 110 | ‑ 111 | ® 112 | þ 113 | § 114 | ‘ 115 | ñ 116 | ½ 117 | ! 118 | > 119 | ⁄ 120 | © 121 | ☑ 122 | ☐ 123 | ? 
124 | € 125 | £ 126 | ‡ 127 | ⅞ 128 | ☒ 129 | ē 130 | − 131 | _ 132 | ¼ 133 | ¾ 134 | @ 135 | À 136 | à 137 | ~ 138 | \ 139 | } 140 | ● 141 | · 142 | ä 143 | ¤ 144 | • 145 | ç 146 | ã 147 | √ 148 |   149 | Ÿ 150 | ú 151 | ˆ 152 | ≤ 153 | ï 154 | ­ 155 | 156 | … 157 | ê 158 | ô 159 | ― 160 | ^ 161 | İ 162 | Ş 163 | è 164 | ² 165 | č 166 | ë 167 | ∙ 168 | È 169 |  170 | ³ 171 | ø 172 | å 173 | ¹ 174 | ō 175 | × 176 | -------------------------------------------------------------------------------- /alphabet/fintabnet/structure_alphabet.txt: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | 5 | 7 | 8 | 9 | colspan="10" 10 | colspan="11" 11 | colspan="12" 12 | colspan="13" 13 | colspan="14" 14 | colspan="15" 15 | colspan="16" 16 | colspan="18" 17 | colspan="19" 18 | colspan="2" 19 | colspan="25" 20 | colspan="3" 21 | colspan="4" 22 | colspan="5" 23 | colspan="6" 24 | colspan="7" 25 | colspan="8" 26 | colspan="9" 27 | 28 | rowspan="10" 29 | rowspan="13" 30 | rowspan="15" 31 | rowspan="16" 32 | rowspan="2" 33 | rowspan="3" 34 | rowspan="4" 35 | rowspan="5" 36 | rowspan="6" 37 | rowspan="7" 38 | rowspan="8" 39 | rowspan="9" 40 | -------------------------------------------------------------------------------- /alphabet/pubtabnet/character_alphabet.txt: -------------------------------------------------------------------------------- 1 | V 2 | a 3 | r 4 | i 5 | b 6 | l 7 | e 8 | H 9 | z 10 | d 11 | 12 | t 13 | o 14 | 9 15 | 5 16 | % 17 | C 18 | I 19 | 20 | p 21 | 22 | v 23 | u 24 | * 25 | A 26 | g 27 | ( 28 | m 29 | n 30 | ) 31 | 0 32 | . 33 | 7 34 | 1 35 | 6 36 | ≤ 37 | > 38 | 8 39 | 3 40 | – 41 | 2 42 | G 43 | 4 44 | M 45 | F 46 | T 47 | y 48 | f 49 | s 50 | L 51 | w 52 | c 53 | U 54 | h 55 | D 56 | S 57 | Q 58 | R 59 | x 60 | P 61 | - 62 | E 63 | O 64 | / 65 | k 66 | , 67 | + 68 | N 69 | K 70 | q 71 | ′ 72 | [ 73 | ] 74 | < 75 | ≥ 76 | 77 | − 78 | 79 | μ 80 | ± 81 | J 82 | j 83 | W 84 | _ 85 | Δ 86 | B 87 | “ 88 | : 89 | Y 90 | α 91 | λ 92 | ; 93 | 94 | 95 | ? 
96 | ∼ 97 | 98 | 99 | = 100 | ° 101 | # 102 | ̊ 103 | ̈ 104 | ̂ 105 | ’ 106 | Z 107 | X 108 | ∗ 109 | — 110 | β 111 | ' 112 | † 113 | ~ 114 | @ 115 | " 116 | γ 117 | ↓ 118 | ↑ 119 | & 120 | ‡ 121 | χ 122 | ” 123 | σ 124 | § 125 | | 126 | ¶ 127 | ‐ 128 | × 129 | $ 130 | → 131 | √ 132 | ✓ 133 | ‘ 134 | \ 135 | ∞ 136 | π 137 | • 138 | ® 139 | ^ 140 | ∆ 141 | ≧ 142 | 143 | 144 | ́ 145 | ♀ 146 | ♂ 147 | ‒ 148 | ⁎ 149 | ▲ 150 | · 151 | £ 152 | φ 153 | Ψ 154 | ß 155 | △ 156 | ☆ 157 | ▪ 158 | η 159 | € 160 | ∧ 161 | ̃ 162 | Φ 163 | ρ 164 | ̄ 165 | δ 166 | ‰ 167 | ̧ 168 | Ω 169 | ♦ 170 | { 171 | } 172 | ̀ 173 | ∑ 174 | ∫ 175 | ø 176 | κ 177 | ε 178 | ¥ 179 | ※ 180 | ` 181 | ω 182 | Σ 183 | ➔ 184 | ‖ 185 | Β 186 | ̸ 187 | ─ 188 | ● 189 | ⩾ 190 | Χ 191 | Α 192 | ⋅ 193 | ◆ 194 | ★ 195 | ■ 196 | ψ 197 | ǂ 198 | □ 199 | ζ 200 | ! 201 | Γ 202 | ↔ 203 | θ 204 | ⁄ 205 | 〈 206 | 〉 207 | ― 208 | υ 209 | 
 210 | τ 211 | ⋆ 212 | Ø 213 | © 214 | ∥ 215 | С 216 | ˂ 217 | ➢ 218 | ɛ 219 | ⁡ 220 | ✗ 221 | ← 222 | ○ 223 | ¢ 224 | ⩽ 225 | ∖ 226 | ˃ 227 | ­ 228 | ≈ 229 | Π 230 | ̌ 231 | ≦ 232 | ∅ 233 | ᅟ 234 | 235 | 236 | ∣ 237 | ¤ 238 | ♯ 239 | ̆ 240 | ξ 241 | ÷ 242 | ▼ 243 |  244 | ι 245 | ν 246 | ║ 247 | 248 | 249 | ◦ 250 | ​ 251 | ◊ 252 | ∙ 253 | « 254 | » 255 | ł 256 | ı 257 | Θ 258 | ∈ 259 | „ 260 | ∘ 261 | ✔ 262 | ̇ 263 | æ 264 | ʹ 265 | ˆ 266 | ♣ 267 | ⇓ 268 | ∩ 269 | ⊕ 270 | ⇒ 271 | ⇑ 272 | ̨ 273 | Ι 274 | Λ 275 | ⋯ 276 | А 277 | ⋮ 278 | -------------------------------------------------------------------------------- /alphabet/pubtabnet/structure_alphabet.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | colspan="2" 23 | colspan="3" 24 | colspan="4" 25 | colspan="5" 26 | colspan="6" 27 | colspan="7" 28 | colspan="8" 29 | colspan="9" 30 | colspan="10" 31 | rowspan="2" 32 | rowspan="3" 33 | rowspan="4" 34 | rowspan="5" 35 | rowspan="6" 36 | rowspan="7" 37 | rowspan="8" 38 | rowspan="9" 39 | rowspan="10" 40 | -------------------------------------------------------------------------------- /configs/fintabnet.py: -------------------------------------------------------------------------------- 1 | _base_ = "pubtabnet.py" 2 | 3 | 4 | model = dict( 5 | handler=dict( 6 | html_dict_file="alphabet/fintabnet/structure_alphabet.txt", 7 | cell_dict_file="alphabet/fintabnet/character_alphabet.txt", 8 | ) 9 | ) 10 | 11 | train_pipeline = [ 12 | dict(type="LoadImageFromFile"), 13 | dict(type="TableResize", size=520), 14 | dict( 15 | type="TablePad", 16 | size=(520, 520), 17 | ), 18 | dict(type="TableBboxFlip"), 19 | dict(type="TableBboxEncode"), 20 | dict(type="ToTensorOCR"), 21 | dict( 22 | type="NormalizeOCR", 23 | mean=[0.5, 0.5, 0.5], 24 | std=[0.5, 0.5, 0.5], 25 | ), 26 | dict( 27 | type="Collect", 28 | keys=["img"], 29 | meta_keys=[ 30 | 
"filename", 31 | "ori_shape", 32 | "img_shape", 33 | "pad_shape", 34 | "img_scale", 35 | "html", 36 | "cell", 37 | "bbox", 38 | ], 39 | ), 40 | ] 41 | 42 | data = dict( 43 | train=dict( 44 | img_prefix="../data/fintabnet/img_tables/train/", 45 | ann_file="../data/mmocr_fintabnet/train/", 46 | pipeline=train_pipeline, 47 | ), 48 | val=dict( 49 | img_prefix="../data/fintabnet/img_tables/test/", 50 | ann_file="../data/mmocr_fintabsub/test/", 51 | pipeline=train_pipeline, 52 | ), 53 | test=dict( 54 | img_prefix="../data/fintabnet/img_tables/val/", 55 | ann_file="../data/mmocr_fintabsub/val/", 56 | ), 57 | ) 58 | -------------------------------------------------------------------------------- /configs/pubtab250.py: -------------------------------------------------------------------------------- 1 | _base_ = "pubtabnet.py" 2 | 3 | 4 | data = dict(train=dict(ann_file="../data/mmocr_pubtab250/train/")) 5 | -------------------------------------------------------------------------------- /configs/pubtabfin.py: -------------------------------------------------------------------------------- 1 | _base_ = "pubtabnet.py" 2 | 3 | 4 | template = "{}
" 5 | 6 | model = dict(handler=dict(revisor=dict(template=template))) 7 | 8 | ignore = ["b"] # in all elements 9 | -------------------------------------------------------------------------------- /configs/pubtabnet.py: -------------------------------------------------------------------------------- 1 | max_len_html = 800 2 | max_len_cell = 8000 3 | 4 | seed = None 5 | 6 | eb_tokens = { 7 | "": "", 8 | "": " ", 9 | "": " ", 10 | "": "\u2028\u2028", 11 | "": " ", 12 | "": "", 13 | "": " ", 14 | "": "", 15 | "": " ", 16 | "": "", 17 | "": " \u2028 \u2028 ", 18 | } 19 | 20 | revisions = { 21 | "^.*$": eb_tokens, 22 | "(.*?)": { 23 | r'()(.*?)': r"\g<1>\g<4>", 24 | "": "", 25 | "": "", 26 | "": "", 27 | }, 28 | } 29 | 30 | cell_tokens = ["", "", "", ""], 99 | revisor=dict( 100 | template="{}", 101 | patterns=revisions, 102 | ), 103 | ), 104 | ) 105 | 106 | train_pipeline = [ 107 | dict(type="LoadImageFromFile"), 108 | dict(type="TableResize", size=520), 109 | dict( 110 | type="TablePad", 111 | size=(520, 520), 112 | ), 113 | dict(type="TableBboxEncode"), 114 | dict(type="ToTensorOCR"), 115 | dict( 116 | type="NormalizeOCR", 117 | mean=[0.5, 0.5, 0.5], 118 | std=[0.5, 0.5, 0.5], 119 | ), 120 | dict( 121 | type="Collect", 122 | keys=["img"], 123 | meta_keys=[ 124 | "filename", 125 | "ori_shape", 126 | "img_shape", 127 | "pad_shape", 128 | "img_scale", 129 | "html", 130 | "cell", 131 | "bbox", 132 | ], 133 | ), 134 | ] 135 | 136 | test_pipeline = [ 137 | dict(type="LoadImageFromFile"), 138 | dict(type="TableResize", size=520), 139 | dict( 140 | type="TablePad", 141 | size=(520, 520), 142 | ), 143 | dict(type="ToTensorOCR"), 144 | dict( 145 | type="NormalizeOCR", 146 | mean=[0.5, 0.5, 0.5], 147 | std=[0.5, 0.5, 0.5], 148 | ), 149 | dict( 150 | type="Collect", 151 | keys=["img"], 152 | meta_keys=[ 153 | "filename", 154 | "ori_shape", 155 | "img_shape", 156 | "pad_shape", 157 | "img_scale", 158 | ], 159 | ), 160 | ] 161 | 162 | loader = dict( 163 | type="TableHardDiskLoader", 
164 | max_len_html=max_len_html, 165 | parser=dict( 166 | type="TableStrParser", 167 | cell_tokens=cell_tokens, 168 | ), 169 | ) 170 | 171 | data = dict( 172 | samples_per_gpu=2, 173 | workers_per_gpu=2, 174 | train=dict( 175 | type="TableDataset", 176 | img_prefix="../data/pubtabnet/train/", 177 | ann_file="../data/mmocr_pubtabnet/train/", 178 | pipeline=train_pipeline, 179 | loader=loader, 180 | test_mode=False, 181 | ), 182 | val=dict( 183 | type="TableDataset", 184 | img_prefix="../data/pubtabnet/val/", 185 | ann_file="../data/mmocr_pubtabsub/val/", 186 | pipeline=train_pipeline, 187 | loader=loader, 188 | test_mode=True, 189 | ), 190 | test=dict( 191 | type="TableDataset", 192 | img_prefix="../data/pubtabnet/val/", 193 | ann_file="../data/mmocr_pubtabsub/val/", 194 | pipeline=test_pipeline, 195 | loader=loader, 196 | test_mode=True, 197 | ), 198 | ) 199 | 200 | # optimizer 201 | optimizer = dict(type="Ranger", lr=1e-3) 202 | optimizer_config = dict(grad_clip=dict(max_norm=30, norm_type=2)) 203 | 204 | # learning policy 205 | lr_config = dict( 206 | policy="step", 207 | warmup="linear", 208 | warmup_iters=50, 209 | warmup_ratio=1.0 / 3, 210 | step=[25, 28], 211 | ) 212 | 213 | # runner 214 | runner = dict(type="EpochBasedRunner", max_epochs=30) 215 | 216 | # evaluation 217 | ignore = None 218 | evaluation = dict(interval=1, metric="acc") 219 | 220 | # fp16 221 | fp16 = dict(loss_scale="dynamic") 222 | 223 | # checkpoint setting 224 | checkpoint_config = dict(interval=1) 225 | 226 | # log_config 227 | log_config = dict(interval=100, hooks=[dict(type="TextLoggerHook")]) 228 | 229 | # logger 230 | log_level = "INFO" 231 | 232 | # yapf:enable 233 | dist_params = dict(backend="nccl") 234 | 235 | # pretrained 236 | load_from = None 237 | resume_from = None 238 | 239 | # workflow 240 | workflow = [("train", 1)] 241 | -------------------------------------------------------------------------------- /datasets/FinTabNet.yaml: 
-------------------------------------------------------------------------------- 1 | type: FinTabNet 2 | load: 3 | dir: ~/data/fintabnet/img_tables/ 4 | jsonl: 5 | - ~/data/fintabnet/img_tables/FinTabNet_1.0.0_table_train.jsonl 6 | - ~/data/fintabnet/img_tables/FinTabNet_1.0.0_table_val.jsonl 7 | - ~/data/fintabnet/img_tables/FinTabNet_1.0.0_table_test.jsonl 8 | dump: 9 | dir: ~/data/mmocr_fintabnet/ 10 | json: ~/data/ground_truth_fintabnet.json 11 | split: val 12 | replace: 13 | []: "" 14 | [' ']: "" 15 | ['', ' ', '']: "" 16 | ['', ' ', '']: "" 17 | ['', ' ', '']: "" 18 | ['', '']: "" 19 | ['', '']: "" 20 | [' ', '', '']: "" 21 | [' ', '', '']: "" 22 | [' ', ' ']: "" 23 | [' ', '', '', '', '', ' ', '', '']: "" 24 | [' ', '', '', ' ']: "" 25 | [' ', '', '', '', '', '', ' ', '', ' ']: "" 26 | ['', '', '', '']: "" 27 | [' ', ' ', ' ']: "" 28 | [' ', ' ', '', '', '', '', ' ']: "" 29 | ['', '', '', '']: "" 30 | ['', '', '', '']: "" 31 | [' ', '', '', '', '', '', '']: "" 32 | [' ', '', ' ', '']: "" 33 | ['', ' ', '', '', ' ', '']: "" 34 | ['', ' ', '', '', '']: "" 35 | ['', '', '', '']: "" 36 | [' ', ' ', ' ', ' ', '', '', ' ']: "" 37 | ['', '', '', '', '', '', '', '']: "" 38 | [' ', ' ', ' ', ' ']: "" 39 | [' ', ' ', ' ', ' ', ' ']: "" 40 | [' ', ' ', ' ', ' ', ' ', ' ']: "" 41 | ['', '', ' ', '', '', '', '']: "" 42 | [' ', ' ', '', '']: "" 43 | [' ', '', '', '', '', '', ' ', ' ', '', ' ']: "" 44 | [' ', '', '', '', '']: "" 45 | ['', '', '', '', '', '']: "" 46 | [' ', ' ', ' ', '', '']: "" 47 | [' ', ' ', ' ', '', '', ' ']: "" 48 | [' ', '', '', ' ', ' ', '', '']: "" 49 | ['', '', ' ', '', '']: "" 50 | [' ', ' ', ' ', ' ', '', '']: "" 51 | [' ', ' ', ' ', '', '']: "" 52 | [' ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']: "" 53 | ['', '', '', '', '', '', '', '', '', '']: "" 54 | [' ', '', ' ', '']: "" 55 | ['', '', '', '', '', '', ' ']: "" 56 | ['', '', '', '', ' ']: "" 57 | ['', '', ' ']: "" 58 | [' ', ' ', '', '', ' ']: "" 59 | [' ', ' ', 
'', '']: "" 60 | -------------------------------------------------------------------------------- /datasets/FinTabSub.yaml: -------------------------------------------------------------------------------- 1 | type: FinTabNet 2 | load: 3 | dir: ~/data/fintabnet/img_tables/ 4 | jsonl: 5 | - ~/data/fintabnet/img_tables/FinTabNet_1.0.0_table_train.jsonl 6 | - ~/data/fintabnet/img_tables/FinTabNet_1.0.0_table_val.jsonl 7 | - ~/data/fintabnet/img_tables/FinTabNet_1.0.0_table_test.jsonl 8 | dump: 9 | dir: ~/data/mmocr_fintabsub/ 10 | json: ~/data/ground_truth_fintabsub.json 11 | split: val 12 | samples: 64 13 | replace: 14 | []: "" 15 | [' ']: "" 16 | ['', ' ', '']: "" 17 | ['', ' ', '']: "" 18 | ['', ' ', '']: "" 19 | ['', '']: "" 20 | ['', '']: "" 21 | [' ', '', '']: "" 22 | [' ', '', '']: "" 23 | [' ', ' ']: "" 24 | [' ', '', '', '', '', ' ', '', '']: "" 25 | [' ', '', '', ' ']: "" 26 | [' ', '', '', '', '', '', ' ', '', ' ']: "" 27 | ['', '', '', '']: "" 28 | [' ', ' ', ' ']: "" 29 | [' ', ' ', '', '', '', '', ' ']: "" 30 | ['', '', '', '']: "" 31 | ['', '', '', '']: "" 32 | [' ', '', '', '', '', '', '']: "" 33 | [' ', '', ' ', '']: "" 34 | ['', ' ', '', '', ' ', '']: "" 35 | ['', ' ', '', '', '']: "" 36 | ['', '', '', '']: "" 37 | [' ', ' ', ' ', ' ', '', '', ' ']: "" 38 | ['', '', '', '', '', '', '', '']: "" 39 | [' ', ' ', ' ', ' ']: "" 40 | [' ', ' ', ' ', ' ', ' ']: "" 41 | [' ', ' ', ' ', ' ', ' ', ' ']: "" 42 | ['', '', ' ', '', '', '', '']: "" 43 | [' ', ' ', '', '']: "" 44 | [' ', '', '', '', '', '', ' ', ' ', '', ' ']: "" 45 | [' ', '', '', '', '']: "" 46 | ['', '', '', '', '', '']: "" 47 | [' ', ' ', ' ', '', '']: "" 48 | [' ', ' ', ' ', '', '', ' ']: "" 49 | [' ', '', '', ' ', ' ', '', '']: "" 50 | ['', '', ' ', '', '']: "" 51 | [' ', ' ', ' ', ' ', '', '']: "" 52 | [' ', ' ', ' ', '', '']: "" 53 | [' ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']: "" 54 | ['', '', '', '', '', '', '', '', '', '']: "" 55 | [' ', '', ' ', '']: "" 
56 | ['', '', '', '', '', '', ' ']: "" 57 | ['', '', '', '', ' ']: "" 58 | ['', '', ' ']: "" 59 | [' ', ' ', '', '', ' ']: "" 60 | [' ', ' ', '', '']: "" 61 | -------------------------------------------------------------------------------- /datasets/PubTab250.yaml: -------------------------------------------------------------------------------- 1 | type: PubTabNet 2 | load: 3 | dir: ~/data/pubtabnet/ 4 | jsonl: 5 | - ~/data/pubtabnet/PubTabNet_2.0.0.jsonl 6 | dump: 7 | dir: ~/data/mmocr_pubtab250/ 8 | json: ~/data/ground_truth_pubtab250.json 9 | split: val 10 | seq_len: 11 | html: 12 | min: 250 13 | replace: 14 | []: "" 15 | [' ']: "" 16 | ['', ' ', '']: "" 17 | ["\u2028", "\u2028"]: "" 18 | ['', ' ', '']: "" 19 | ['', '']: "" 20 | ['', ' ', '']: "" 21 | ['', '', '', '']: "" 22 | ['', '', ' ', '', '']: "" 23 | ['', '']: "" 24 | ['', ' ', "\u2028", ' ', "\u2028", ' ', '']: "" 25 | -------------------------------------------------------------------------------- /datasets/PubTabNet.yaml: -------------------------------------------------------------------------------- 1 | type: PubTabNet 2 | load: 3 | dir: ~/data/pubtabnet/ 4 | jsonl: 5 | - ~/data/pubtabnet/PubTabNet_2.0.0.jsonl 6 | dump: 7 | dir: ~/data/mmocr_pubtabnet/ 8 | json: ~/data/ground_truth_pubtabnet.json 9 | split: val 10 | replace: 11 | []: "" 12 | [' ']: "" 13 | ['', ' ', '']: "" 14 | ["\u2028", "\u2028"]: "" 15 | ['', ' ', '']: "" 16 | ['', '']: "" 17 | ['', ' ', '']: "" 18 | ['', '', '', '']: "" 19 | ['', '', ' ', '', '']: "" 20 | ['', '']: "" 21 | ['', ' ', "\u2028", ' ', "\u2028", ' ', '']: "" 22 | -------------------------------------------------------------------------------- /datasets/PubTabSub.yaml: -------------------------------------------------------------------------------- 1 | type: PubTabNet 2 | load: 3 | dir: ~/data/pubtabnet/ 4 | jsonl: 5 | - ~/data/pubtabnet/PubTabNet_2.0.0.jsonl 6 | dump: 7 | dir: ~/data/mmocr_pubtabsub/ 8 | json: ~/data/ground_truth_pubtabsub.json 9 | split: val 10 | 
seq_len: 11 | html: 12 | min: 250 13 | samples: 64 14 | replace: 15 | []: "" 16 | [' ']: "" 17 | ['', ' ', '']: "" 18 | ["\u2028", "\u2028"]: "" 19 | ['', ' ', '']: "" 20 | ['', '']: "" 21 | ['', ' ', '']: "" 22 | ['', '', '', '']: "" 23 | ['', '', ' ', '', '']: "" 24 | ['', '']: "" 25 | ['', ' ', "\u2028", ' ', "\u2028", ' ', '']: "" 26 | -------------------------------------------------------------------------------- /mutab/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | -------------------------------------------------------------------------------- /mutab/apis/__init__.py: -------------------------------------------------------------------------------- 1 | from mutab import datasets, models, optimizer 2 | 3 | from .test import evaluate 4 | from .train import train 5 | 6 | __all__ = ["datasets", "models", "optimizer", "evaluate", "train"] 7 | -------------------------------------------------------------------------------- /mutab/apis/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from concurrent.futures import ProcessPoolExecutor 3 | from functools import partial 4 | from typing import List, Optional 5 | 6 | from mmdet.apis import init_detector 7 | from tqdm import tqdm 8 | 9 | from mutab.metrics import TEDS 10 | 11 | 12 | def score(item, truth, ignore: Optional[List[str]]): 13 | teds_full = TEDS(ignore, struct_only=False) 14 | teds_html = TEDS(ignore, struct_only=True) 15 | file_name = os.path.basename(item["path"]) 16 | if file_name not in truth: 17 | return None 18 | item.update(real=truth[file_name]["html"]) 19 | item.update(type=truth[file_name]["type"]) 20 | scores = {} 21 | scores.update(full=teds_full.evaluate(**item)) 22 | scores.update(html=teds_html.evaluate(**item)) 23 | item.update(TEDS=scores) 24 | return (file_name, item) 25 | 26 | 27 | def worker(n: int, paths: List[str], cfg: str, ckpt: str, truth): 28 | model 
= init_detector(config=cfg, checkpoint=ckpt, device=n) 29 | items = map(model.predict, tqdm(list(paths), disable=n > 0)) 30 | final = partial(score, truth=truth, ignore=model.cfg.ignore) 31 | with ProcessPoolExecutor() as pool: 32 | return list(pool.map(final, items)) 33 | 34 | 35 | def evaluate(paths: List[List[str]], cfg: str, ckpt: str, truth): 36 | with ProcessPoolExecutor(len(paths)) as pool: 37 | process = partial(worker, cfg=cfg, ckpt=ckpt, truth=truth) 38 | results = list(pool.map(process, *zip(*enumerate(paths)))) 39 | return dict(filter(None, sum(results, []))) 40 | -------------------------------------------------------------------------------- /mutab/apis/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime as dt 3 | 4 | from mmcv import Config, mkdir_or_exist 5 | from mmcv.runner import get_dist_info 6 | from mmdet.apis import train_detector 7 | from mmdet.utils import get_device 8 | 9 | from mutab.datasets import build_dataset 10 | from mutab.models import build_detector 11 | from mutab.utils import collect_env, get_logger, pretty_env 12 | 13 | 14 | def train(cfg: Config, cfg_file: str): 15 | mkdir_or_exist(cfg.work_dir) 16 | _, devices = get_dist_info() 17 | cfg.gpu_ids = range(devices) 18 | cfg.device = get_device() 19 | 20 | # prepare log 21 | time = dt.now().strftime("%Y%m%d_%H%M%S") 22 | log = os.path.join(cfg.work_dir, "{}.log".format(time)) 23 | log = get_logger(log_file=log, log_level=cfg.log_level) 24 | metas = dict(env=collect_env(), config=cfg.pretty_text) 25 | 26 | # dump environmental information 27 | log.info(pretty_env(bar="-" * 64)) 28 | log.info("\n{}".format(cfg.pretty_text)) 29 | 30 | # build model and dataset 31 | model = build_detector(cfg.model) 32 | dataset = build_dataset(cfg.data.train) 33 | 34 | # dump configuration 35 | os.environ.update(LOCAL_RANK=os.getenv("LOCAL_RANK", "0")) 36 | cfg.dump(os.path.join(cfg.work_dir, 
os.path.basename(cfg_file))) 37 | 38 | # start training 39 | cfg.checkpoint_config.meta = dict(env=collect_env(), CLASSES=int(dataset.CLASSES)) 40 | train_detector(model, dataset, cfg, devices > 1, True, timestamp=time, meta=metas) 41 | -------------------------------------------------------------------------------- /mutab/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from mmdet.datasets.builder import build_dataset 2 | from mmocr.datasets.pipelines import NormalizeOCR, ResizeOCR, ToTensorOCR 3 | 4 | from .dataset import TableDataset 5 | from .loader import TableHardDiskLoader, TableStrParser 6 | from .pipeline import TableBboxEncode, TablePad, TableResize 7 | 8 | __all__ = [ 9 | "NormalizeOCR", 10 | "ResizeOCR", 11 | "TableBboxEncode", 12 | "TableDataset", 13 | "TableHardDiskLoader", 14 | "TablePad", 15 | "TableResize", 16 | "TableStrParser", 17 | "ToTensorOCR", 18 | "build_dataset", 19 | ] 20 | -------------------------------------------------------------------------------- /mutab/datasets/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mmdet.datasets.builder import DATASETS 3 | from mmocr.datasets import BaseDataset 4 | 5 | from mutab.metrics import TEDS 6 | from mutab.utils import get_logger 7 | 8 | 9 | @DATASETS.register_module() 10 | class TableDataset(BaseDataset): 11 | def evaluate(self, results, **kwargs): 12 | metric = TEDS(struct_only=False) 13 | scores = [] 14 | logger = get_logger() 15 | for idx, info in enumerate(self.data_infos): 16 | score = metric.evaluate(**results[idx]) 17 | logger.info("%s score: %s", info["filename"], score) 18 | scores.append(score) 19 | 20 | return dict(TEDS=np.mean(scores)) 21 | -------------------------------------------------------------------------------- /mutab/datasets/loader.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 
@PARSERS.register_module()
class TableStrParser:
    """Aligns bounding boxes with the HTML tokens that represent table cells.

    Non-cell tokens keep an all-zero box; boxes are consumed in reading order
    by the cell tokens listed in ``cell_tokens``.
    """

    def __init__(self, cell_tokens):
        assert isinstance(cell_tokens, list)
        assert len(cell_tokens)
        self.cell_tokens = cell_tokens

    def align(self, html, bbox, **info):
        """Return *info* with ``bbox`` expanded to one row per HTML token."""
        supply = iter(bbox)
        boxes = np.zeros((len(html), 4))
        for pos, token in enumerate(html):
            if token in self.cell_tokens:
                boxes[pos] = next(supply)
        return dict(html=html, bbox=boxes, **info)

    def __call__(self, info):
        return self.align(**info)
@PIPELINES.register_module()
class TableResize:
    """Resizes the image so its longer side equals ``size``; rescales boxes."""

    def __init__(self, size: int):
        self.size = size

    def __call__(self, results):
        self.resize_img(results)
        self.resize_box(results)
        return results

    def resize_img(self, results):
        """Resize keeping aspect ratio and record the (y, x) scale factors."""
        src = results["img"]
        old_h, old_w, _ = src.shape
        if old_w < old_h:
            new_h = int(self.size)
            new_w = int(self.size / old_h * old_w)
        else:
            new_w = int(self.size)
            new_h = int(self.size / old_w * old_h)
        scale = (new_h / old_h, new_w / old_w)
        resized = cv2.resize(src, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
        results.update(img=resized, img_shape=resized.shape, img_scale=scale)

    def resize_box(self, results):
        """Rescale annotation boxes in place, clamped to the new image bounds."""
        height, width = results["img_shape"][:2]
        info = results.get("img_info")
        # Boxes are only present during the train and val phases.
        if info is None or info.get("bbox") is None:
            return
        boxes = info["bbox"]
        sy, sx = results["img_scale"]
        boxes[..., 0::2] = np.clip(boxes[..., 0::2] * sx, 0, width - 1)
        boxes[..., 1::2] = np.clip(boxes[..., 1::2] * sy, 0, height - 1)
        info.update(bbox=boxes)
@PIPELINES.register_module()
class TableBboxEncode:
    """Converts (x1, y1, x2, y2) boxes to normalized (cx, cy, w, h).

    Also hoists ``html``, ``cell`` and ``bbox`` from ``img_info`` to the top
    level of the results dict.
    """

    def __call__(self, results):
        info = results["img_info"]
        size = results["img"].shape
        bbox = self.xyxy_to_xywh(info["bbox"])
        bbox = self.normalize_bbox(bbox, size)
        # Sanity check: normalized boxes must lie inside the image.
        assert np.all(bbox >= 0)
        assert np.all(bbox <= 1)
        info.update(bbox=bbox)
        self.adjust_key(results)
        return results

    def xyxy_to_xywh(self, bbox):
        """Return boxes as (center x, center y, width, height)."""
        bb = np.empty_like(bbox)
        # xy center
        bb[..., 0] = bbox[..., 0::2].mean(axis=-1)
        bb[..., 1] = bbox[..., 1::2].mean(axis=-1)
        # width and height; np.ptp() function instead of the ndarray method,
        # which was removed in NumPy 2.0 (identical result on older NumPy).
        bb[..., 2] = np.ptp(bbox[..., 0::2], axis=-1)
        bb[..., 3] = np.ptp(bbox[..., 1::2], axis=-1)
        return bb

    def normalize_bbox(self, bbox, size):
        """Scale coordinates in place into [0, 1] by image width/height."""
        bbox[..., 0::2] /= size[1]
        bbox[..., 1::2] /= size[0]
        return bbox

    def adjust_key(self, results):
        """Move annotation entries from img_info to the results top level."""
        results.update(html=results["img_info"].pop("html"))
        results.update(cell=results["img_info"].pop("cell"))
        results.update(bbox=results["img_info"].pop("bbox"))
class TEDS:
    """Tree-Edit-Distance-based Similarity (TEDS) between two HTML tables.

    A score of 1.0 means the predicted and reference tables are identical;
    0.0 means maximally different.
    """

    def __init__(self, ignore_tags=None, struct_only=False):
        # ignore_tags: tag names stripped from both trees before scoring.
        self.ignore_tags = ignore_tags
        # struct_only: when True, cell text is ignored and only the
        # table structure contributes to the score.
        self.struct_only = struct_only

    def tokenize(self, node, tokens):
        """Flatten *node* into a stream of tag tokens and single characters."""
        tokens.append("<%s>" % node.tag)
        if node.text is not None:
            tokens += list(node.text)
        for n in node.getchildren():
            self.tokenize(n, tokens)
        if node.tag != "unk":
            # Closing tag for every element except the "unk" placeholder.
            # (Restored: the literal had been mangled to "" % node.tag,
            # which raises TypeError the moment it executes.)
            tokens.append("</%s>" % node.tag)
        if node.tag != "td" and node.tail is not None:
            tokens += list(node.tail)

    def load_html_tree(self, node, parent=None):
        """Convert an lxml element into a TableTree node for APTED.

        ``td`` elements are leaves carrying colspan/rowspan and (unless
        ``struct_only``) their tokenized text content.
        """
        if node.tag == "td":
            if self.struct_only:
                cell = []
            else:
                tokens = []
                self.tokenize(node, tokens)
                # Drop the enclosing <td> and </td> tokens.
                cell = tokens[1:-1].copy()
            col = int(node.attrib.get("colspan", "1"))
            row = int(node.attrib.get("rowspan", "1"))
            sub = TableTree(node.tag, col, row, cell)
        else:
            sub = TableTree(node.tag)
        if parent is not None:
            parent.children.append(sub)
        if node.tag != "td":
            for n in node.getchildren():
                self.load_html_tree(n, sub)
        return sub

    def evaluate(self, pred, real, **kwargs):
        """Return the TEDS score of *pred* against *real* (HTML strings).

        The first <table> in each document is compared; the edit distance is
        normalized by the larger tree's node count.
        """
        parser = html.HTMLParser(remove_comments=True, encoding="utf-8")
        pred = html.fromstring(pred, parser=parser)
        real = html.fromstring(real, parser=parser)
        pred = next(iter(pred.xpath("//table")), pred)
        real = next(iter(real.xpath("//table")), real)
        assert pred.tag in ("div", "table")
        assert real.tag in ("div", "table")
        if self.ignore_tags:
            etree.strip_tags(pred, *self.ignore_tags)
            etree.strip_tags(real, *self.ignore_tags)
        n_nodes_pred = len(pred.xpath(".//*"))
        n_nodes_real = len(real.xpath(".//*"))
        pred = self.load_html_tree(pred)
        real = self.load_html_tree(real)
        result = APTED(pred, real, Custom()).compute_edit_distance()
        result = 1 - float(result) / max(n_nodes_pred, n_nodes_real)
        return result
WeekDuration (min)Intensity (% HRR)Intensity (RPE)
12050 \u2013 609 \u2013 11
22050 \u2013 609 \u2013 11
3 \u2013 52560 \u2013 7011
6 \u2013 83060 \u2013 7011
9 \u2013 113070 \u2013 8011 \u2013 13
12 \u2013 143570 \u2013 8011 \u2013 13
15 & 164075 \u2013 8513 \u2013 15
\n \n ", "PMC2871264_002_00.png": "\n \n \n \n \n \n \n \n
Name of algorithmNotable features
MACS [23]Uses both a control library and local statistics to minimize bias
SICER [14]Designed for detecting diffusely enriched regions; for example, histone modification
PeakSeq [24]Corrects for reference genome mappability and local statistics
SISSRs [25]High resolution, precise identification of binding-site location
F-seq [26]Uses kernel density estimation
\n \n ", "PMC2915972_003_00.png": "\n \n \n \n \n \n \n \n
No of patients
Gender:
Men24
Women26
Age (years):
30-392
40-498
50-5915
60-6916
70-796
\u2265 803
Tumor site:
Bladder4
Breast10
Colorectal4
Exophageal9
Gynecological7
Lung6
Prostate10
Length of interval between baseline and follow-up interview (median)
< 50 days22
\u2265 50 days28
\n \n ", "PMC3160368_005_00.png": "\n \n \n \n \n \n \n \n
Methods (n-mers used)Average Sensitivity of 5-fold cross validation (%)Average Specificity of 5-fold cross validation (%)
FDAFSA (hexamers)84*86*
PromMachine (tetramers)86+81+
\n \n ", "PMC3568059_003_00.png": "\n \n \n \n \n \n \n \n
Participants during the period;
0 to 3 months3 to 6 months6 to 12 months
Characteristicsn=72n=71n=65
Age, years, median (range)73 (50\u201394)73 (47\u201392)73 (47\u201390)
Patients, n (%)
Female33 (46)27 (38)26 (40)
Male39 (54)44 (62)39 (60)
Stroke classification (TOAST), n (%)
Large vessel disease17 (24)18 (25)17 (26)
Small vessel disease21 (29)21 (30)17 (26)
Cardioembolic stroke15 (21)11 (15)11 (17)
Cryptogenic stroke13 (18)14 (20)12 (19)
Intracerebral haemorrhage6 (8)7 (10)8 (12)
Side of feision, n (%)
Right side lesion35 (49)32 (45)28 (43)
Left side lesion37 (51)39 (53)37 (57)
Hypertension47 (65)44 (62)41 (63)
Diabetes mellitus17 (24)18 (25)17 (26)
Results from clinical scales 1\u20137 days after stroke onset
BBS median (range) (n)35 (0\u201356) (n=71)41 (0\u201356) (n=70)41 (0\u201356) (n=46)
M-MAS UAS-IS median (range)45 (12\u201355) (n=65)47 (12\u201355) (n=65)50 (16\u201355) (n=56)
\n \n ", "PMC3707453_006_00.png": "\n \n \n \n \n \n \n \n
Star Magnitude 1Star Magnitude 6Saturation Charge [%]Capacitanc e Linearity [%]
Noise (g)SN at 10Signal (g)Noise (g)SN at 10 No AD [d]
121200498471882358105000099.2
1439604265016101991327223298.6
1552204185017131471919710998.1
1599504185017591301917201897.8
1624004195017841221915957597.6
164550420501801151914925497.5
\n \n ", "PMC3765162_003_01.png": "\n \n \n \n \n \n \n \n
Men (n = 359)Women (n = 412)
Metabolic syndromeMetabolic syndrome
Baseline characteristicsYes (n = 163)No (n = 196)P-valueYes (n = 96)No (n = 316)P value
Age (years)*61.86 (\u00b10.83)60.32 (\u00b10.77)0.1764.96 (\u00b10.88)58.52 (\u00b10.55)<0.001
Sitting Systolic BP (mmHg)*141.34 (\u00b11.27)132.26 (\u00b11.15)<0.001151.82 (\u00b11.16)137.4( (\u00b10.96)<0.001
Stitting Diastolic BP (mmHg)*85.69 (\u00b10.77)80.79 (\u00b10.73)<0.00189.27 (\u00b10.92)82.67 (\u00b10.51)<0.001
Antitypertensive Therapy (%)50.9%28.4%<0.00160.4%29.4%<0.001
Total Cholesterol (mmol/L)*5.61 (\u00b10.08)5.70 (\u00b10.08)0.566.04 (\u00b10.1)5.99 (\u00b10.06)0.67
LDL cholesterol (mmol/L)*3.44 (\u00b10.06)3.49 (\u00b10.06)0.523.58 (\u00b1 0.06)3.54 (\u00b1 0.04)0.66
HDL cholesterol (mmol/L)*1.03 (\u00b10.63)1.27 (\u00b10.02)<0.0011.20 (\u00b1 0.02)1.48 (\u00b10.016)<0.001
Triglycerides (mmol/L)*2.10 (1.63; 2.64)1.32 (0.98; 1.57)<0.0012.15 (1.78; 2.83)1.24 (0.97; 1.56)<0.001
Diabetes mellitus (%)30.7%6.3%<0.00133.3%2.3%<0.001
BMI (kg/m2)*29.88 (\u00b10.35)26.06 (\u00b10.2)<0.00122.39 (\u00b10.47)26.95 (\u00b10.25)<0.001
ApoA1 Ig/L*1.29 (\u00b10.013)1.40 (\u00b10.017)<0.0011.44 (\u00b10.02)1.55 (\u00b10.001)<0.001
ApoB (g/L)*1.21 (\u00b10.02)1.19 (\u00b10.02)0.481.23 (\u00b10.02)1.18 (\u00b10.014)0.044
Homa index*2.25(1.15; 4.18)0.94(0.51; 1.8)<0.0012.51 (1.67, 3.86)1.14 (0.72; 1.7)<0.001
MITCoffean (mm)*0.79 (\u00b10.15)0.76 (\u00b10.12)0.0840.77 (\u00b10.16)0.69 (\u00b10.13)<0.001
Sum of total plaque area (mm2)*53 (25; 100)42 (10/27)0.00216 (1; 44)8 (1;32)0.01
Sum of plaque area carotids (mm2)*22 (1; 39)12 (1; 27.5)0.0118.75 (1.25.75)1 (1; 19)0.013
Sum of plaque area femoral (mm3)*33(10 6,0)23(1, 49)0.01110 (-17.75)1(1; 6)0.012
\n \n ", "PMC3872294_001_00.png": "\n \n \n \n \n \n \n \n
HC (N = 20)FASD (N = 15)
Age (years)16.3 (2.1)15.3 (2.1)
IQ108 (15)*80 (15)*
Male/female (%male)12/8 (60%)10/5 (67%)
FASD sub diagnosis\u20138 FAS, 7 ARND
\n \n ", "PMC4196076_004_00.png": "\n \n \n \n \n \n \n \n
miRNAChange relative to controlsDirection of regulationChromosomemiRNAChange relative to controlsDirection of regulationChromosome
hsa-miR-11812.13Up19hsa-miR-8742.97Up5
hsa-miR-125a-5p5.04Up19hsa-miR-8902.83UpX
hsa-miR-21-3p2.82Up17hsa-miR-9392.59Up8
hsa-miR-29b-1-pp3.12Up7hsa-miR-1290\u22127.56Down1
hsa-miR-3665-3p2.19Up10hsa-miR-191-3-p\u22122.63Down10
hsa-miR-1327-5p2.01Up2hsa-miR-2861\u22123.31Down9
hsa-miR-3665-3p2.03Up10hsa-miR-3665\u22122.37Down13
hsa-miR-371a-5p3.14Up19hsa-miR-4357\u22123.62Down1
hsa-miR-43272.95Up21hsa-miR-452-5p\u22122.54DownX
hsa-miR-584-5p2.31Up5hsa-miR-513a-5p\u22123.15DownX
hsa-miR-6025.74Up9hsa-miR-572\u22125.80Down4
hsa-miR-629-3p2.71Up15hsa-miR-629-3p\u22123.03Down15
hsa-miR-642b-3p2.10Up19hsa-miR-165\u22127.18Down1
hsa-miR-6513.91UpXhsa-miR-875-5p\u22123.91Down8
hsa-miR-7622.84Up16hsa-miR-940\u22122.31Down16
\n \n ", "PMC4219599_004_00.png": "\n \n \n \n \n \n \n \n
SBE (n = 24)MEA 7n = 24Evele N = 24
Ethnopositive data
Age (yrs)0.1 (0)0.1 (0)43.9 (8)
Male (%)0.3 (0.0)0.1 (0.0)8.1 (10%)
Married0.1 (0.9)0.9 (0%)8.9 (11)
Married29.6 (4.3)27.0 (0.0)27.9 (161)
Preventions Fathers
1 + 11.0 (1%)5 (21%)5.2 (2.8)
1 + 15 (5.9%)1 (1.9%)8 (18%)
4 + 15 (5.9%)11 (5%)21 (69%)
4 + 13 (33%)1 (4%)3 (19%)
41 + 10 (0%)1 (4%)1 (1%)
Others increase stage
CT14 (6.4%)11 (54%)11 (52%)
-715 (5%)0 (0%)0 (0%)
CT25 (5%)0 (0%)0 (0%)
Private wound with schools0 (0%)0 (0%)0 (0%)
Non-sensitive factors40.2 (11.4)41.2 (13.3)45.0 (12.0)
Non-sensitive factors
None1 (11%)1 (13%)6 (18%)
None2 (2.9%)2 (9%)4 (1.7%)
None2 (2.9%)0 (0%)4 (1.9%)
Total survivor0 (0%)0 (0%)0 (0%)
Primary experience8 (9%)23 (80%)*0.0 (0%)
Postoperative followsors
1 + 01 (11%)1 (13%)4 (12%)
1 + 06 (6.7%)15 (57%)21 (61%)
4 + 12 (2%)5 (37%)2 (2%)
4 + 18 (29%)8 (29%)0 (0%)
Pathological survour stage
PT38 (37%)16 (39%)24 (17%)
PT38 (37%)6 (3%)5 (17%)
PT30 (0%)6 (3%)4 (3%)
Positive17 (14%)6 (3%)1 (0.1%)
Positive nempl nodes17 (14%)0.9 (0%)1.0 (1%)
Positive reference in complete hospital stay (n)2.0 (0.4)2.0 (0.2)2.2 (0.3)
Position of pressoreation compression (%)10.5 (10)4.4 (14)8.9 (2.2)
Duration of pressoreation collectivation (%)10.5 (10)8.4 (14)8.9 (9.2)
\n \n ", "PMC4297392_007_00.png": "\n \n \n \n \n \n \n \n
Treatment phaseAdverse eventNo. of patients
T1Swelling1
Itching1
Fever4
Throat infection1
Chest Congestion2
Total9
T2Diarrhea1
Body Pain1
Total2
T3Diarrhea1
Total1
T4Nil-
\n \n ", "PMC4311460_007_00.png": "\n \n \n \n \n \n \n \n
Number PatientsPatients
CategoryType CHP%(N = 4,560)%
IInflammation 6,98711.33,53777.6
IIInfection 3,6295.92,45153.8
IIIInjury 5,5569.03,40174.6
IVSpecific conditions 32,01651.9n.c.
VNeoplasms 3,5925.82,461#54
Maligne 1,219 (27%)
O,ther-benign2,148 1,758 (39%)
VICongenital 4900.8n.c.
VIIOtherwise 9,38315.2n.c.
TotalALL-types 100
\n \n ", "PMC4357206_002_00.png": "\n \n \n \n \n \n \n \n
N = 121
Demographics
Age (yr) - median (IQR)62 (56-73)
Female sex (%)46 (38)
White race (%)112 (93)
Comorbidities (%)
Hypertension64 (53)
Chronic lung disease37 (31)
Active malignancy34 (28)
Diabetes mellitus29 (24)
Chronic kidney disease7 (6)
Congestive heart failure4 (3)
Chronic liver disease2 (2)
Severity of illness
APACHE II score - median (IQR)*14 (10-16)
Chanlson Comorbidity Index - median (IQR)\u20202 (1-4)
ICU type
Surgical102 (84)
SICU66 (54)
TICU36 (30)
Nonsurgical19 (16)
CCU11 (9)
MICU8 (7)
Status of procedure (for surgical patients) (%)
Elective41 (34)
Urgent57 (47)
Dops in hospital prior to enrollment \u2013 median (IQR)1 (1-3)
\n \n ", "PMC4445578_009_01.png": "\n \n \n \n \n \n \n \n
Reactive astrogliossChanges in astrocytes morphologyChanges in molecules expression
Upregulated moleculesUpregulated or downregulated molecules
Mild to moderate astroglosis\u2022 Hypertrophy of cell body\u2022 Structural elements GFAP, nestin, virenetin\u2022 Inflammatory cell regulators, cytokines, growth factors, glutathione
\u2022 Astrocytes processes are are numeroca and thicker\u2022 Transcriptional regulators STAT3, NFASI (Pechem 1076, cAnP6 Chiga, SOX9 [61-65].Trassopteres and purprs; AQP4 and No YK+ transporters [26,64-69]
\u2022 Glutamate transporter [76-73]
\u2022 The non-overlapping domains of individual astrocytes are preserved\u2022 Vascular regulators: PGE, NO [74,75]
\u2022 Energy provision: lactate [76]
\u2022 Molecules implicated in synapse formation and
\u2022 Remodeling thrombospondin and Complement C1q [77,78]
- Significant extension of processes\u2022 Molecules implicated in ovidative stress, and providing protection from oxidative stress: NO, NOS, SOX, Glutathione [67,68,79]
\u2022 Proliferation
\u2022 Overlapping of individual domains
\u2022 Substantial reorganization of tissue activitecute [50]
\n \n ", "PMC4969833_016_01.png": "\n \n \n \n \n \n \n \n
HorizontalNormalVerticalTotal Object
Horizontal383546 (83%)
Normal154762 (87%)
Vertical22111401163 (98%)
\n \n ", "PMC5303243_003_00.png": "\n \n \n \n \n \n \n \n
CharacteristicsTotal (N = 613)MSSA (N = 508)MRSA (N = 105)OR (95%CI)P-value
Age (years) (median, quartiles)72 (66,79)75 (6731)72 (67,78)N/A0.0048
Gender322 (100.0)214 (82.3)57 (17.7)1.4 (0.93\u20132.16)0.5909
Male291 (100.0)255 (83.5)48 (16.5)
Step aging n (%)0,0849
Young Old311 (100.0)267 (85.9)44 (14.1)1.5 (1.00\u20132.35)
O6: O&272 (100.0)219 (80.5)53 (19.5)0.7 (0.49\u20131.13)
Longevity30 (100.0)22 (73.3)8 (26.7)0.6 (0.24\u20131.27)
Disease n (%)<0.0001
PNU47 (100.0)28 (59.6)19 (40.4)0.3 (0.14\u20130.49)
BSI37 (100.0)27 (73.0)10 (27.0)0.5 (0.25\u20131.14)
SSTI416 (100.0)350 (84.1)66 (15.9)1.3 (0.85\u20132.03)
EI62 (100.0)56 (90.3)6 (9.7)1.7 (0.72\u20134.06)
Others51 (100.0)47 (92.2)4 (7.8)2.6 (0.91\u20137.31)
Place of the treatment infections n (%)0.0033
INPATBENTS430 (100.0)352 (81.4)78 (18.1)0.8 (0.49\u20131.26)
LTCF16 (100.0)9 (56.3)7 (43.8)0.3 (0.09\u20130.69)
OUTPATIENTS167 (100.0)147 (88.0)20 (12.0)1.7 (1.03\u20132.92)
Infections treated in hospitals (NPATIENTS N = 430, n (%))
ICU19 (100.0)12 (63.2)7 (36.8)2.8 (1.06\u20137.34)0.014
non-ICU411 (100.0)340 (82.7)71 (17.3)
\n \n ", "PMC5451934_004_00.png": "\n \n \n \n \n \n \n \n
ConditionPre Well-BeingPost Well-BeingPre-Post-Change
TP (handler & dog interaction)46.33 \u00b1 7.41 148.69 \u00b1 7.22+2.36
DO (dog only interaction)49.78 \u00b1 7.9151.56 \u00b1 6.99+1.78 **
HO (handler only interaction)47.37 \u00b1 7.5746.43 \u00b1 8.03\u22120.94 **
\n \n ", "PMC5755158_010_01.png": "\n \n \n \n \n \n \n \n
WeaningWeek 15Off-test
Weaning\u2013\u2013\u2013
Week 15\u20130.17 \u00b1 0.080.16 \u00b1 0.03
Off-test\u20130.80 \u00b1 0.240.19 \u00b1 0.09
\n \n ", "PMC5849724_006_00.png": "\n \n \n \n \n \n \n \n
AnalytesGC-HRMSGC-MS/MSGC-MS
LOQ (ng/CIPP)Estimated LOQ, (ng/cig)LOQ, (ng/CPP)Estimated LOQ, (ng/cig)LOQ (ng/CIPP)Estimated LOQ, (ng/cig)
Naphthalene0.510.0261178.7158.94108.175.41
Benzolylphenamthene0.040.002NDND66.803.34
Benzolylanthracene0.030.00238.571.9338.111.91
Chrysene0.040.00250.132.5149.612.48
Cyclopentid,culysyner0.020.00148.842.4460.043.00
S-Methylchrysene0.040.002NDND2.480.12
Benzo[p]Iluonarthene0.040.00211.440.575.080.25
Benzol[Illicuranthene0.050.00312.410.625.070.25
Benzo[[aceanthrylene]0.090.005NDNDNDND
Benzoliglyreene0.040.0025.010.253.030.15
Indeno(1,2,1-cultypnee0.020.0015.460.271.540.08
Dibenodju/lipinthe cere0.070.0040.830.041.480.07
Dibenzolip/lyprene0.050.003NDNDNDND
Dibenzolyadyprene0.040.0020.800.040.280.01
Dibenzolyuloyene0.060.0031.330.07NDND
Dibenzolya/hyperene0.070.0042.990.15NDND
\n \n ", "PMC6022086_007_00.png": "\n \n \n \n \n \n \n \n
MethodData TypeMean (m)RMSE (m)P90% (m)PGSD (%)
Improved FCMGaofen-35.775.8910.0794.37
Sentinel-16.305.8314.0380.00
Original FCMGaofen-36.977.6613.8790.70
Sentinel-18.534.8113.1490.00
\n \n "} -------------------------------------------------------------------------------- /mutab/metrics/sample_real.json: -------------------------------------------------------------------------------- 1 | {"PMC5755158_010_01.png": {"html": "
WeaningWeek 15Off-test
Weaning\u2013\u2013\u2013
Week 15\u20130.17 \u00b1 0.080.16 \u00b1 0.03
Off-test\u20130.80 \u00b1 0.240.19 \u00b1 0.09
", "tag_len": 44, "cell_len_max": 11, "width": 238, "height": 59, "type": "simple"}, "PMC4445578_009_01.png": {"html": "
Reactive astroglioisChanges in astrocytes morphologyChanges in molecules expression
Upregulated moleculesUpregulated or downregulated molecules
Mild to moderate astrogliosis\u2022 Hypertrophy of cell body\u2022 Structural elements: GFAP, nestin, vimentin\u2022 Inflammatory cell regulators: cytokines, growth factors, glutathione
\u2022 Astrocytes processes are are numerous and thicker\u2022 Transcriptional regulators: STAT3, NF\u03baB, Rheb-m TOR, cAMP, Olig2, SOX9 [61\u201365].\u2022 Transporters and pumps: AQP4 and Na+/K+ transporters [61, 66\u201369]
\u2022 Glutamate transporter [70\u201373]
\u2022 The non-overlapping domains of individual astrocytes are preserved\u2022 Vascular regulators: PGE, NO [74, 75]
\u2022 Energy provision: lactate [76]
\u2022 Molecules implicated in synapse formation and
Severe astrogliosis and glial scar\u2022 Intense hypertrophy of cell body\u2022 Remodeling: thrombospondin and Complement C1q [77, 78]
\u2022 Significant extension of processes\u2022 Molecules implicated in oxidative stress and providing protection from oxidative stress: NO, NOS, SOD, Glutathione [67, 68, 79]
\u2022 Proliferation
\u2022 Overlapping of individual domains
\u2022 Substantial reorganization of tissue architecture [60]
", "tag_len": 116, "cell_len_max": 129, "width": 486, "height": 248, "type": "complex"}, "PMC2871264_002_00.png": {"html": "
Name of algorithmNotable features
MACS [23]Uses both a control library and local statistics to minimize bias
SICER [14]Designed for detecting diffusely enriched regions; for example, histone modification
PeakSeq [24]Corrects for reference genome mappability and local statistics
SISSRs [25]High resolution, precise identification of binding-site location
F-seq [26]Uses kernel density estimation
", "tag_len": 40, "cell_len_max": 84, "width": 238, "height": 124, "type": "simple"}, "PMC3872294_001_00.png": {"html": "
HC (N = 20)FASD (N = 15)
Age (years)16.3 (2.1)15.3 (2.1)
IQ108 (15)*80 (15)*
Male/female (%male)12/8 (60%)10/5 (67%)
FASD sub diagnosis\u20138 FAS, 7 ARND
", "tag_len": 44, "cell_len_max": 19, "width": 251, "height": 88, "type": "simple"}, "PMC2915972_003_00.png": {"html": "
No of patients
Gender:
Men24
Women26
Age (years):
30-392
40-498
50-5915
60-6916
70-796
\u2265 803
Tumor site:
Bladder4
Breast10
Colorectal4
Esophageal9
Gynecological7
Lung6
Prostate10
Length of interval between baseline and follow-up interview
(median)
< 50 days22
\u2265 50 days28
", "tag_len": 142, "cell_len_max": 59, "width": 238, "height": 287, "type": "complex"}, "PMC4196076_004_00.png": {"html": "
miRNAChange relative to controlsDirection of regulationChromosomemiRNAChange relative to controlsDirection of regulationChromosome
hsa-miR-11812.13Up19hsa-miR-8742.97Up5
hsa-miR-125a-5p5.04Up19hsa-miR-8902.83UpX
hsa-miR-21-3p2.82Up17hsa-miR-9392.59Up8
hsa-miR-29b-1-5p3.12Up7hsa-miR-1290\u22127.56Down1
hsa-miR-3663-3p2.19Up10hsa-miR-1915-3p\u22122.63Down10
hsa-miR-3127-5p2.01Up2hsa-miR-2861\u22123.31Down9
hsa-miR-3663-3p2.03Up10hsa-miR-3665\u22122.37Down13
hsa-miR-371a-5p3.14Up19hsa-miR-4257\u22123.62Down1
hsa-miR-43272.95Up21hsa-miR-452-5p\u22122.54DownX
hsa-miR-584-5p2.31Up5hsa-miR-513a-5p\u22123.15DownX
hsa-miR-6025.74Up9hsa-miR-572\u22125.80Down4
hsa-miR-629-3p2.71Up15hsa-miR-629-3p\u22123.03Down15
hsa-miR-642b-3p2.10Up19hsa-miR-765\u22127.18Down1
hsa-miR-6513.91UpXhsa-miR-875-5p\u22123.91Down8
hsa-miR-7622.84Up16hsa-miR-940\u22122.31Down16
", "tag_len": 292, "cell_len_max": 29, "width": 486, "height": 236, "type": "simple"}, "PMC3160368_005_00.png": {"html": "
Methods (n-mers used)Average Sensitivity of 5-fold cross validation (%)Average Specificity of 5-fold cross validation (%)
FDAFSA(hexamers)84*86*
PromMachine(tetramers)86+81+
", "tag_len": 28, "cell_len_max": 52, "width": 238, "height": 71, "type": "simple"}, "PMC3707453_006_00.png": {"html": "
TFC Layer Thickness [\u03bcm]Star Magnitude 1Star Magnitude 6Saturation Charge [e-]Capacitance Linearity [%]
Signal @ 0.1s integr. [e-]Noise @ 0.1s integr. [e-]S/N at 10 bit A/D [dB]Signal @ 0.1s integr. [e-]Noise @ 0.1s integr. [e-]S/N at 10 bit A/D [dB]
0.51212004984718823581050000099.2
1.01439604265016101991327223298.6
1.51552204185017131471919710998.1
1.81599504185017591301917201897.8
2.01624004195017841221915957597.6
2.21645504205018071151914925497.5
", "tag_len": 160, "cell_len_max": 30, "width": 446, "height": 184, "type": "complex"}, "PMC4311460_007_00.png": {"html": "
NumberPatients
CategoryTypeCHP%(N = 4,560)%
IInflammation6,98711.33,53777.6
IIInfection3,6295.92,45153.8
IIIInjury5,5569.03,40174.6
IVSpecific conditions32,01651.9n.c.
VNeoplasms3,5925.82,461#54
Maligne1,4441,219 (27%)
Other-benign2,1481,758 (39%)
VICongenital4900.8n.c.
VIIOtherwise9,38315.2n.c.
TotalALL-types61,653100
", "tag_len": 220, "cell_len_max": 19, "width": 486, "height": 170, "type": "complex"}, "PMC5451934_004_00.png": {"html": "
ConditionPre Well-BeingPost Well-BeingPre-Post Change
TP (handler & dog interaction)46.33 \u00b1 7.41 148.69 \u00b1 7.22+2.36
DO (dog only interaction)49.78 \u00b1 7.9151.56 \u00b1 6.99+1.78 **
HO (handler only interaction)47.37 \u00b1 7.5746.43 \u00b1 8.03\u22120.94 **
", "tag_len": 44, "cell_len_max": 30, "width": 389, "height": 56, "type": "simple"}, "PMC5849724_006_00.png": {"html": "
AnalytesGC\u2013HRMSGC\u2013MS/MSGC\u2013MS
LOQ, (ng/CFPa)Estimated LOQ, (ng/cig)LOQ, (ng/CFPa)Estimated LOQ, (ng/cig)LOQ, (ng/CFPa)Estimated LOQ, (ng/cig)
Naphthalene0.510.0261178.7158.94108.175.41
Benzo[c]phenanthrene0.040.002NDND66.803.34
Benzo[a]anthracene0.030.00238.571.9338.111.91
Chrysene0.040.00250.132.5149.612.48
Cyclopenta[c,d]pyrene0.020.00148.842.4460.043.00
5-Methylchrysene0.040.002NDND2.480.12
Benzo[b]fluoranthene0.040.00211.440.575.080.25
Benzo[k]fluoranthene0.050.00312.410.625.070.25
Benzo[j]aceanthrylene0.090.005NDNDNDND
Benzo[a]pyrene0.040.0025.010.253.030.15
Indeno[1,2,3-c,d]pyrene0.020.0015.460.271.540.08
Dibenzo[a,h]anthracene0.070.0040.830.041.480.07
Dibenzo[a,l]pyrene0.050.003NDNDNDND
Dibenzo[a,e]pyrene0.040.0020.800.040.280.01
Dibenzo[a,i]pyrene0.060.0031.330.07NDND
Dibenzo[a,h]pyrene0.070.0042.990.15NDND
", "tag_len": 292, "cell_len_max": 27, "width": 486, "height": 253, "type": "complex"}, "PMC6022086_007_00.png": {"html": "
MethodData TypeMean (m)RMSE (m)P90% (m)PGSD (%)
Improved FCMGaofen-35.775.8910.0794.37
Sentinel-16.305.8314.0380.00
Original FCMGaofen-36.977.6613.8790.70
Sentinel-18.534.8113.1490.00
", "tag_len": 74, "cell_len_max": 12, "width": 409, "height": 77, "type": "complex"}, "PMC4297392_007_00.png": {"html": "
Treatment phaseAdverse eventNo. of patients
T1Swelling1
Itching1
Fever4
Throat infection1
Chest Congestion2
Total9
T2Diarrhea1
Body Pain1
Total2
T3Diarrhea1
Total1
T4Nil-
", "tag_len": 98, "cell_len_max": 17, "width": 238, "height": 185, "type": "complex"}, "PMC2094709_004_00.png": {"html": "
WeekDuration (min)Intensity (% HRR)Intensity (RPE)
12050 \u2013 609 \u2013 11
22050 \u2013 609 \u2013 11
3 \u2013 52560 \u2013 7011
6 \u2013 83060 \u2013 7011
9 \u2013 113070 \u2013 8011 \u2013 13
12 \u2013 143570 \u2013 8011 \u2013 13
15 & 164075 \u2013 8513 \u2013 15
", "tag_len": 84, "cell_len_max": 19, "width": 503, "height": 107, "type": "simple"}, "PMC3568059_003_00.png": {"html": "
Participants during the period;
0 to 3 months3 to 6 months6 to 12 months
Characteristicsn=72n=71n=65
Age, years, median (range)73 (50\u201394)73 (47\u201392)73 (47\u201390)
Patients, n (%)
Female33 (46)27 (38)26 (40)
Male39 (54)44 (62)39 (60)
Stroke classification (TOAST), n (%)
Large vessel disease17 (24)18 (25)17 (26)
Small vessel disease21 (29)21 (30)17 (26)
Cardioembolic stroke15 (21)11 (15)11 (17)
Cryptogenic stroke13 (18)14 (20)12 (19)
Intracerebral haemorrhage6 (8)7 (10)8 (12)
Side of lesion, n (%)
Right side lesion35 (49)32 (45)28 (43)
Left side lesion37 (51)39 (55)37 (57)
Hypertension47 (65)44 (62)41 (63)
Diabetes mellitus17 (24)18 (25)17 (26)
Results from clinical scales 1\u20137 days after stroke onset
BBS median (range) (n)35 (0\u201356) (n=71)41 (0\u201356) (n=70)41 (0\u201356) (n=64)
M-MAS UAS-95 median (range)45 (12\u201355) (n=65)47 (12\u201355) (n=65)50 (16\u201355) (n=59)
", "tag_len": 208, "cell_len_max": 56, "width": 486, "height": 296, "type": "complex"}, "PMC4357206_002_00.png": {"html": "
N = 121
Demographics
Age (yr) - median (IQR)62 (56-73)
Female sex (%)46 (38)
White race (%)112 (93)
Comorbidities (%)
Hypertension64 (53)
Chronic lung disease37 (31)
Active malignancy34 (28)
Diabetes mellitus29 (24)
Chronic kidney disease7 (6)
Congestive heart failure4 (3)
Chronic liver disease2 (2)
Severity of illness
APACHE II score - median (IQR)*14 (10-16)
Charlson Comorbidity Index - median (IQR)\u20202 (1-4)
ICU type
Surgical102 (84)
SICU66 (54)
TICU36 (30)
Nonsurgical19 (16)
CCU11 (9)
MICU8 (7)
Status of procedure (for surgical patients) (%)
Elective41 (34)
Urgent57 (47)
Days in hospital prior to enrollment \u2013 median (IQR)1 (1-3)
", "tag_len": 166, "cell_len_max": 51, "width": 238, "height": 381, "type": "simple"}, "PMC4219599_004_00.png": {"html": "
ORP (n = 9)RALP (n = 24)Total (n = 33)
Anthropometric data
Age (yr)60 (7)63 (6)62 (6)
Height (m)1.76 (0.07)1.75 (0.05)1.75 (0.06)
Weight (kg)92 (12)83 (10)86 (11)
BMI (kg.m-2)29.6 (4.5)27.3 (3.0)27.9 (3.6)
Preoperative factors
PSA (ng/mL)5.8 (4.2)5.0 (2.1)5.2 (2.8)
Preoperative Gleason score
3 + 31 (11%)5 (21%)6 (18%)
3 + 45 (56%)16 (67%)21 (64%)
4 + 33 (33%)2 (9%)5 (15%)
4 + 40 (0%)1 (4%)1 (3%)
Clinical tumour stage
cT14 (44%)13 (54%)17 (52%)
cT25 (56%)11 (46%)16 (48%)
cT30 (0%)0 (0%)0 (0%)
cT40 (0%)0 (0%)0 (0%)
Prostate volume (cc)40.2 (13.4)41.2 (12.5)40.9 (12.6)
Intraoperative factors
Nerve sparing
None3 (33%)3 (13%)6 (18%)
One bundle2 (22%)2 (9%)4 (12%)
Two bundles4 (44%)19 (79%)23 (70%)
Pelvic lymph node dissection7 (78%)2 (9%)a9 (27%)
Bladder neck preservation0 (0%)23 (96%)a23 (70%)
Postoperative factors
Postoperative Gleason score
3 + 31 (11%)3 (13%)4 (12%)
3 + 46 (67%)16 (67%)22 (67%)
4 + 32 (22%)5 (21%)7 (21%)
4 + 40 (0%)0 (0%)0 (0%)
Pathological tumour stage
pT26 (67%)18 (75%)24 (73%)
pT33 (33%)6 (25%)9 (27%)
pT40 (0%)0 (0%)0 (0%)
Positive lymph nodes1/7 (14%)0/2 (0%)1/9 (11%)
Positive margins2 (22%)2 (9%)4 (12%)
Duration of postoperative hospital stay (d)2.9 (0.3)2.0 (0.2)a2.3 (0.5)
Duration of postoperative catheterization (d)10.2 (3.0)8.4 (1.6)8.9 (2.2)
Anastomic structure0 (0%)1 (4%)1 (3%)
", "tag_len": 414, "cell_len_max": 45, "width": 486, "height": 577, "type": "simple"}, "PMC3765162_003_01.png": {"html": "
Men (n = 359)Women (n = 412)
Metabolic syndromeMetabolic syndrome
Baseline characteristicsYes (n = 163)No (n = 196)P valueYes (n = 96)No (n = 316)P value
Age (years)*61.86 (\u00b10.83)60.32 (\u00b10.77)0.1764.96 (\u00b10.88)58.52 (\u00b10.55)<0.001
Sitting Systolic BP (mmHg)*141.34 (\u00b11.27)132.26 (\u00b11.15)<0.001151.82 (\u00b11.16)137.49 (\u00b10.96)<0.001
Sitting Diastolic BP (mmHg)*85.69 (\u00b10.77)80.79 (\u00b10.73)<0.00189.27 (\u00b10.92)82.67 (\u00b10.51)<0.001
Antihypertensive Therapy (%)50.9%28.4%<0.00160.4%29.4%<0.001
Total Cholesterol (mmol/L)*5.61 (\u00b10.08)5.70 (\u00b10.08)0.566.04 (\u00b10.1)5.99 (\u00b10.06)0.67
LDL cholesterol (mmol/L)*3.44 (\u00b10.06)3.49 (\u00b10.06)0.523.58 (\u00b1 0.06)3.54 (\u00b1 0.04)0.66
HDL cholesterol (mmol/L)*1.03 (\u00b10.63)1.27 (\u00b10.02)<0.0011.20 (\u00b1 0.02)1.48 (\u00b10.016)<0.001
Triglycerides (mmol/L)\u20202.10 (1.63; 2.64)1.32 (0.98; 1.57)<0.0012.15 (1.78; 2.83)1.24 (0.97; 1.56)<0.001
Diabetes mellitus (%)30.7%6.3%<0.00133.3%2.3%<0.001
BMI (kg/m2)*29.88 (\u00b10.35)26.06 (\u00b10.2)<0.00132.39 (\u00b10.47)26.95 (\u00b10.25)<0.001
ApoA1 (g/L)*1.29 (\u00b10.013)1.40 (\u00b10.017)<0.0011.44 (\u00b10.02)1.55 (\u00b10.001)<0.001
ApoB (g/L)*1.21 (\u00b10.02)1.19 (\u00b10.02)0.481.23 (\u00b10.02)1.18 (\u00b10.014)0.044
Homa index\u20202.25(1.15; 4.18)0.94(0.51; 1.8)<0.0012.51 (1.67; 3.86)1.14 (0.72; 1.7)<0.001
IMTccMean (mm)*0.79 (\u00b10.15)0.76 (\u00b10.12)0.0840.77 (\u00b10.16)0.69 (\u00b10.13)<0.001
Sum of total plaque area (mm2)\u202053 (25; 103)42 (10;72)0.00216 (1; 44)8 (1;32)0.01
Sum of plaque area carotids (mm2)\u202022 (1; 39)12 (1; 27.5)0.0118.75 (1;25.75)1 (1; 19)0.013
Sum of plaque area femoral (mm2)\u202033(10; 62)23(1; 49)0.0111(1; 17.75)1(1; 6)0.012
", "tag_len": 316, "cell_len_max": 42, "width": 486, "height": 282, "type": "complex"}, "PMC5303243_003_00.png": {"html": "
CharacteristicsTotal (N = 613)MSSA(N = 508)MRSA (N = 105)OR (95%CI)P-value
Age (years)(median, quartiles)72 (66;79)75 (67;81)72 (65;78)N/A0.0048
Gender:Female322 (100.0)214 (82.3)57 (17.7)1.4 (0.93\u20132.16)0.5909
Male291 (100.0)255 (83.5)48 (16.5)
Step aging n (%)0,0849
Young Old311 (100.0)267 (85.9)44 (14.1)1.5 (1.00\u20132.35)
Old Old272 (100.0)219 (80.5)53 (19.5)0.7 (0.49\u20131.13)
Longevity30 (100.0)22 (73.3)8 (26.7)0.6 (0.24\u20131.27)
Disease n (%)<0.0001
PNU47 (100.0)28 (59.6)19 (40.4)0.3 (0.14\u20130.49)
BSI37 (100.0)27 (73.0)10 (27.0)0.5 (0.25\u20131.14)
SSTI416 (100.0)350 (84.1)66 (15.9)1.3 (0.85\u20132.03)
EI62 (100.0)56 (90.3)6 (9.7)1.7 (0.72\u20134.06)
Others51 (100.0)47 (92.2)4 (7.8)2.6 (0.91\u20137.31)
Place of the treatment infections n (%)0.0033
INPATIENTS430 (100.0)352 (81.4)78 (18.1)0.8 (0.49\u20131.26)
LTCF16 (100.0)9 (56.3)7 (43.8)0.3 (0.09\u20130.69)
OUTPATIENTS167 (100.0)147 (88.0)20 (12.0)1.7 (1.03\u20132.92)
Infections treated in hospitals (INPATIENTS N = 430, n (%))
ICU19 (100.0)12 (63.2)7 (36.8)2.8 (1.06\u20137.34)0.014
non-ICU411 (100.0)340 (82.7)71 (17.3)
", "tag_len": 290, "cell_len_max": 63, "width": 486, "height": 316, "type": "complex"}, "PMC4969833_016_01.png": {"html": "
HorizontalNormalVerticalTotal Object
Horizontal383546 (83%)
Normal154762 (87%)
Vertical22111401163 (98%)
", "tag_len": 52, "cell_len_max": 14, "width": 264, "height": 58, "type": "simple"}} -------------------------------------------------------------------------------- /mutab/metrics/sample_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "PMC2094709_004_00.png": 1.0, 3 | "PMC2871264_002_00.png": 1.0, 4 | "PMC2915972_003_00.png": 0.9298260149130074, 5 | "PMC3160368_005_00.png": 0.994615695248351, 6 | "PMC3568059_003_00.png": 0.9609420535891124, 7 | "PMC3707453_006_00.png": 0.8538903625110521, 8 | "PMC3765162_003_01.png": 0.9867342100509474, 9 | "PMC3872294_001_00.png": 0.9863636363636363, 10 | "PMC4196076_004_00.png": 0.9958653089334908, 11 | "PMC4219599_004_00.png": 0.6029978075326913, 12 | "PMC4297392_007_00.png": 0.8070175438596492, 13 | "PMC4311460_007_00.png": 0.6576923076923077, 14 | "PMC4357206_002_00.png": 0.9295181638546892, 15 | "PMC4445578_009_01.png": 0.6754965084868096, 16 | "PMC4969833_016_01.png": 1.0, 17 | "PMC5303243_003_00.png": 0.6494374120956399, 18 | "PMC5451934_004_00.png": 0.9978213507625272, 19 | "PMC5755158_010_01.png": 1.0, 20 | "PMC5849724_006_00.png": 0.9653439200120101, 21 | "PMC6022086_007_00.png": 1.0 22 | } 23 | -------------------------------------------------------------------------------- /mutab/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbone import TableResNet 2 | from .decoder import TableDecoder 3 | from .encoder import PositionalEncoding2D 4 | from .factory import build_detector 5 | from .handler import TableHandler 6 | from .loss import BBLoss, CELoss, KLLoss 7 | from .scanner import TableScanner 8 | 9 | __all__ = [ 10 | "BBLoss", 11 | "CELoss", 12 | "KLLoss", 13 | "PositionalEncoding2D", 14 | "TableDecoder", 15 | "TableHandler", 16 | "TableResNet", 17 | "TableScanner", 18 | "build_detector", 19 | ] 20 | -------------------------------------------------------------------------------- 
/mutab/models/backbone.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Mapping 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from mutab.models.factory import BACKBONES, GC_MODULES 8 | from mutab.models.factory import build_gc_module as build 9 | 10 | 11 | class BN(nn.BatchNorm2d): 12 | def __init__(self, d: int, mom=0.1): 13 | super().__init__(d, momentum=mom) 14 | 15 | 16 | class Conv(nn.Conv2d): 17 | def __init__(self, d: int, h: int, k: int): 18 | super().__init__(d, h, k, padding=k // 2, bias=False) 19 | 20 | 21 | class ConvBn(nn.Sequential): 22 | def __init__(self, d: int, h: int, k: int, mom=0.1): 23 | super().__init__(Conv(d, h, k), BN(h, mom=mom)) 24 | 25 | 26 | class ConvBnReLU(nn.Sequential): 27 | def __init__(self, d: int, h: int, k: int, mom=0.1): 28 | super().__init__(ConvBn(d, h, k, mom=mom), nn.ReLU()) 29 | 30 | 31 | @GC_MODULES.register_module() 32 | class GCA(nn.Module): 33 | def __init__(self, d: int, ratio: float, heads: int): 34 | super().__init__() 35 | neck = int(ratio * d) 36 | assert d % heads == 0 37 | self.size = d // heads 38 | self.prob = nn.Softmax(dim=2) 39 | self.mask = nn.Conv2d(self.size, 1, 1) 40 | self.norm = nn.LayerNorm([neck, 1, 1]) 41 | self.c1 = nn.Conv2d(d, neck, 1) 42 | self.c2 = nn.Conv2d(neck, d, 1) 43 | 44 | def forward(self, x): 45 | n, c, h, w = x.size() 46 | mask = self.mask(x.reshape(-1, self.size, h, w)) 47 | mask = self.prob(mask.flatten(-2).unsqueeze(-1)) 48 | y = x.reshape(-1, self.size, h * w).unsqueeze(1) 49 | y = torch.matmul(y, mask).reshape(n, c, 1, 1) 50 | return self.c2(F.relu(self.norm(self.c1(y)))).add(x) 51 | 52 | 53 | class ResidualBlock(nn.Module): 54 | def __init__(self, d: int, h: int, gca: List[str] = [], **gcb): 55 | super().__init__() 56 | self.cv1 = nn.Sequential() 57 | self.cv1.append(ConvBn(d, h, 3, mom=0.9)) 58 | self.cv1.append(nn.ReLU()) 59 | self.cv1.append(ConvBn(h, h, 3, 
mom=0.9)) 60 | self.cv1.extend(build(gcb, type=gc, d=h) for gc in gca) 61 | self.cv2 = ConvBn(d, h, 1) if d != h else nn.Identity() 62 | 63 | def forward(self, x): 64 | return F.relu(self.cv2(x).add(self.cv1(x))) 65 | 66 | 67 | class ResidualGroup(nn.Sequential): 68 | def __init__(self, d: int, h: int, depth: int, **gcb): 69 | super().__init__() 70 | self.append(ResidualBlock(d, h, **gcb)) 71 | self.extend(ResidualBlock(h, h) for _ in range(1, depth)) 72 | 73 | 74 | @BACKBONES.register_module() 75 | class TableResNet(nn.Sequential): 76 | def __init__( 77 | self, 78 | dim: int, 79 | out: int, 80 | gcb1: Mapping[str, Any], 81 | gcb2: Mapping[str, Any], 82 | gcb3: Mapping[str, Any], 83 | gcb4: Mapping[str, Any], 84 | ): 85 | super().__init__() 86 | 87 | ch1 = out // 8 88 | ch2 = out // 4 89 | ch3 = out // 2 90 | 91 | # group1 92 | self.append(ConvBnReLU(dim, ch1, 3)) 93 | self.append(ConvBnReLU(ch1, ch2, 3)) 94 | 95 | # group2 96 | self.append(nn.MaxPool2d(2, ceil_mode=True)) 97 | self.append(ResidualGroup(ch2, ch3, **gcb1)) 98 | 99 | # group3 100 | self.append(nn.MaxPool2d(2, ceil_mode=True)) 101 | self.append(ResidualGroup(ch3, ch3, **gcb2)) 102 | 103 | # group4 104 | self.append(nn.MaxPool2d(2, ceil_mode=True)) 105 | self.append(ResidualGroup(ch3, out, **gcb3)) 106 | self.append(ResidualGroup(out, out, **gcb4)) 107 | -------------------------------------------------------------------------------- /mutab/models/decoder.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import math 3 | from functools import partial 4 | from typing import List 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from positional_encodings import torch_encodings as pos 10 | from rotary_embedding_torch import RotaryEmbedding 11 | 12 | from mutab.models.factory import ATTENTIONS, DECODERS, build_attention 13 | 14 | 15 | class PositionalEncodingAdd(pos.PositionalEncoding1D): 16 | def forward(self, x): 17 
| return super().forward(x).add(x) 18 | 19 | 20 | class Mask(nn.Module): 21 | def forward(self, x, mask): 22 | return x.where(mask, torch.finfo(x.dtype).min) 23 | 24 | 25 | class Linear(nn.Sequential): 26 | def __init__(self, d: int, h: int, *, act=nn.Identity): 27 | super().__init__(nn.LayerNorm(d), nn.Linear(d, h), act()) 28 | 29 | 30 | class Attention(nn.Module, abc.ABC): 31 | def __init__(self, heads: int, d_model: int, **kwargs): 32 | super().__init__() 33 | assert d_model % heads == 0 34 | self.dim = int(d_model // heads) 35 | self.lhd = (-1, heads, self.dim) 36 | self.q = Linear(d_model, d_model) 37 | self.k = Linear(d_model, d_model) 38 | self.v = Linear(d_model, d_model) 39 | self.w = Linear(d_model, d_model) 40 | 41 | def forward(self, q, k, v, **kwargs): 42 | q = self.q(q).view(len(q), *self.lhd).transpose(1, 2) 43 | k = self.k(k).view(len(k), *self.lhd).transpose(1, 2) 44 | v = self.v(v).view(len(v), *self.lhd).transpose(1, 2) 45 | x = self.attention(q, k, v, **kwargs).transpose(1, 2) 46 | return self.w(x.contiguous().flatten(start_dim=2)) 47 | 48 | @property 49 | @abc.abstractmethod 50 | def causal(self) -> bool: 51 | raise NotImplementedError 52 | 53 | @abc.abstractmethod 54 | def attention(self, q, k, v, **kwargs): 55 | raise NotImplementedError 56 | 57 | 58 | @ATTENTIONS.register_module() 59 | class GlobalAttention(Attention): 60 | def __init__(self, dropout: float, **kwargs): 61 | super().__init__(**kwargs) 62 | self.drop = nn.Dropout(dropout) 63 | self.mask = Mask() 64 | 65 | @property 66 | def causal(self): 67 | return False 68 | 69 | def attention(self, q, k, v, mask=None, **kwargs): 70 | p = q.matmul(k.mT.div(math.sqrt(v.size(-1)))) 71 | p = p if mask is None else self.mask(p, mask) 72 | return self.drop(p.softmax(dim=-1)).matmul(v) 73 | 74 | 75 | @ATTENTIONS.register_module() 76 | class WindowAttention(GlobalAttention): 77 | def __init__(self, window: int, **kwargs): 78 | super().__init__(**kwargs) 79 | self.rotary = RotaryEmbedding(self.dim) 
80 | self.window = window 81 | 82 | @property 83 | def causal(self): 84 | return True 85 | 86 | def attention(self, q, k, v, **kwargs): 87 | # buckets 88 | bq = self.bucket(q) 89 | bk = self.unfold(self.bucket(k)) 90 | bv = self.unfold(self.bucket(v)) 91 | 92 | # indices 93 | n = int(bq.shape[-3:-1].numel()) 94 | i = torch.arange(n).to(q.device) 95 | i = self.bucket(i.unsqueeze(-1)) 96 | j = self.unfold(i).mT 97 | 98 | # masking 99 | mask = i.ge(j).logical_and(j.ne(-1)) 100 | 101 | # rotary embedding 102 | bq = self.rotary.rotate_queries_or_keys(bq) 103 | bk = self.rotary.rotate_queries_or_keys(bk) 104 | 105 | # global attention 106 | out = super().attention(q=bq, k=bk, v=bv, mask=mask) 107 | return out.flatten(-3, -2).narrow(-2, 0, q.size(-2)) 108 | 109 | def bucket(self, x): 110 | n = self.window * math.ceil(x.size(-2) / self.window) 111 | x = F.pad(x, pad=(0, 0, 0, n - x.size(-2)), value=-1) 112 | x = torch.stack(x.split(self.window, dim=-2), dim=-3) 113 | return x 114 | 115 | def unfold(self, x): 116 | pad = F.pad(x, pad=(0, 0, 0, 0, 1, 0), value=-1) 117 | pad = pad.narrow(-3, start=0, length=x.size(-3)) 118 | return torch.cat([pad, x], dim=-2) 119 | 120 | 121 | @ATTENTIONS.register_module() 122 | class AbsentAttention(nn.Module): 123 | def __init__(self, **kwargs): 124 | super().__init__() 125 | 126 | def forward(self, q, k, v, **kwargs): 127 | return torch.zeros_like(q) 128 | 129 | 130 | class FeedForward(nn.Sequential): 131 | def __init__(self, d_model: int, **kwargs): 132 | super().__init__() 133 | self.append(Linear(d_model, d_model, act=nn.ReLU)) 134 | self.append(Linear(d_model, d_model, act=nn.Identity)) 135 | 136 | 137 | class Block(nn.Module): 138 | def __init__(self, att1, att2, **kwargs): 139 | super().__init__() 140 | self.att1 = build_attention(att1, **kwargs) 141 | self.att2 = build_attention(att2, **kwargs) 142 | self.feed = FeedForward(**kwargs) 143 | 144 | def forward(self, kwargs): 145 | kwargs.update(**self.perform(**kwargs)) 146 | return 
kwargs 147 | 148 | def perform(self, x, y, mask=None, **kwargs): 149 | x = x.add(self.att1(x, x, x, mask=mask)) 150 | x = x.add(self.att2(x, y, y, mask=None)) 151 | x = x.add(self.feed(x)) 152 | return dict(x=x) 153 | 154 | 155 | class Blocks(nn.Sequential): 156 | def __init__(self, blocks, **kwargs): 157 | block = lambda args: Block(**args, **kwargs) 158 | super().__init__(*tuple(map(block, blocks))) 159 | 160 | def forward(self, **kwargs): 161 | return super().forward(kwargs).get("x") 162 | 163 | 164 | class Fetcher(nn.Module): 165 | def __init__(self, SOC: int, EOS: int, **kwargs): 166 | super().__init__() 167 | 168 | # special tokens 169 | self.register_buffer("SOC", torch.tensor(SOC)) 170 | self.register_buffer("EOS", torch.tensor(EOS)) 171 | 172 | def extract(self, x, mask, size): 173 | return F.pad(x[mask], pad=(0, 0, 0, size - sum(mask))) 174 | 175 | def forward(self, img, hid, seq): 176 | assert hid.ndim == 3 177 | assert seq.ndim == 2 178 | 179 | # masking 180 | soc = torch.isin(seq, self.SOC).unsqueeze(2) 181 | eos = torch.isin(seq, self.EOS).unsqueeze(2) 182 | 183 | # padding 184 | soc = soc.logical_and(eos.cumsum(dim=1).logical_not()) 185 | pad = partial(self.extract, size=soc.sum(dim=1).max()) 186 | 187 | # extract 188 | ext = torch.stack(list(map(pad, hid, soc.squeeze(2)))) 189 | 190 | return hid, ext 191 | 192 | 193 | class Decoder(nn.Module): 194 | def __init__( 195 | self, 196 | d_input: int, 197 | d_model: int, 198 | num_emb: int, 199 | max_len: int, 200 | SOS: int, 201 | EOS: int, 202 | SEP: int, 203 | **kwargs, 204 | ): 205 | super().__init__() 206 | 207 | # special tokens 208 | self.register_buffer("SOS", torch.tensor(SOS)) 209 | self.register_buffer("EOS", torch.tensor(EOS)) 210 | self.register_buffer("SEP", torch.tensor(SEP)) 211 | 212 | # embedding 213 | self.emb = nn.Embedding(num_emb, d_model) 214 | self.pos = PositionalEncodingAdd(d_model) 215 | 216 | # blocks 217 | self.dec = Blocks(d_model=d_model, **kwargs) 218 | self.cat = 
Linear(d_input, d_model) 219 | self.out = Linear(d_model, num_emb) 220 | 221 | # prediction length 222 | self.max_len = max_len 223 | 224 | def predict(self, img, aux): 225 | seq = self.SOS.expand(len(img), 1) 226 | eos = self.EOS.expand(len(img), 1) 227 | for _ in range(self.max_len + 1): 228 | h, out = self(img, seq, aux, argmax=True) 229 | seq = torch.cat([seq[:, :1], out], dim=1) 230 | end = seq.eq(eos).sum(dim=1).bool().sum() 231 | if end.item() == len(img): 232 | break 233 | 234 | return h, out 235 | 236 | def forward(self, img, seq, aux, argmax=False): 237 | # alignment 238 | idx = torch.eq(seq, self.SEP).cumsum(dim=1).unsqueeze(-1) 239 | mat = torch.zeros(*seq.shape, aux.size(1)).to(aux.device) 240 | mat = mat.scatter_(-1, idx.clip_(max=aux.size(1) - 1), 1) 241 | mix = torch.cat([self.emb(seq), mat.matmul(aux)], dim=-1) 242 | 243 | # prediction 244 | hid = self.dec(x=self.pos(self.cat(mix)), y=img, mask=None) 245 | out = self.out(hid).argmax(-1) if argmax else self.out(hid) 246 | 247 | return hid, out 248 | 249 | 250 | @DECODERS.register_module() 251 | class TableDecoder(nn.Module): 252 | def __init__( 253 | self, 254 | d_model: int, 255 | html_decoder, 256 | cell_decoder, 257 | html_fetcher, 258 | num_emb_html: int, 259 | num_emb_cell: int, 260 | max_len_html: int, 261 | max_len_cell: int, 262 | SOC_HTML: List[int], 263 | SOS_HTML: int, 264 | EOS_HTML: int, 265 | SOS_CELL: int, 266 | EOS_CELL: int, 267 | SEP_CELL: int, 268 | **kwargs, 269 | ): 270 | super().__init__() 271 | 272 | # parameters 273 | html_decoder.update(d_model=d_model) 274 | cell_decoder.update(d_model=d_model) 275 | 276 | # alphabet 277 | html_decoder.update(num_emb=num_emb_html) 278 | cell_decoder.update(num_emb=num_emb_cell) 279 | 280 | # capacity 281 | html_decoder.update(max_len=max_len_html) 282 | cell_decoder.update(max_len=max_len_cell) 283 | 284 | # special tokens 285 | html_decoder.update(SOS=SOS_HTML) 286 | html_decoder.update(EOS=EOS_HTML) 287 | html_decoder.update(SEP=EOS_HTML) 
288 | 289 | cell_decoder.update(SOS=SOS_CELL) 290 | cell_decoder.update(EOS=EOS_CELL) 291 | cell_decoder.update(SEP=SEP_CELL) 292 | 293 | html_fetcher.update(SOC=SOC_HTML) 294 | html_fetcher.update(EOS=EOS_HTML) 295 | 296 | # input channels 297 | html_decoder.update(d_input=d_model + 2) 298 | cell_decoder.update(d_input=d_model * 2) 299 | 300 | # other parameters 301 | html_decoder.update(**kwargs) 302 | cell_decoder.update(**kwargs) 303 | 304 | # en/decoders 305 | self.html = Decoder(**html_decoder) 306 | self.cell = Decoder(**cell_decoder) 307 | self.grid = Fetcher(**html_fetcher) 308 | 309 | # bbox 310 | self.bbox = Linear(d_model, 4, act=nn.Sigmoid) 311 | 312 | # LtoR or RtoL 313 | self.register_buffer("LtoR", torch.eye(2)[0]) 314 | self.register_buffer("RtoL", torch.eye(2)[1]) 315 | 316 | def forward(self, img, html, back, cell, **kwargs): 317 | # ground truth 318 | html = html.to(img.device) 319 | back = back.to(img.device) 320 | cell = cell.to(img.device) 321 | 322 | # remove [EOS] 323 | s_html = html[:, :-1] 324 | e_back = back[:, :-1] 325 | s_cell = cell[:, :-1] 326 | 327 | # remove [SOS] 328 | e_html = html[:, 1::] 329 | 330 | # LtoR or RtoL 331 | h_LtoR = self.LtoR.expand(len(img), 1, 2) 332 | h_RtoL = self.RtoL.expand(len(img), 1, 2) 333 | 334 | # structure prediction 335 | h_html, o_html = self.html(img, s_html, h_LtoR) 336 | h_back, o_back = self.html(img, e_back, h_RtoL) 337 | 338 | # character prediction 339 | h_html, h_grid = self.grid(img, h_html, e_html) 340 | h_cell, o_cell = self.cell(img, s_cell, h_grid) 341 | 342 | return dict( 343 | html=o_html, 344 | back=o_back, 345 | cell=o_cell, 346 | bbox=self.bbox(h_html), 347 | ) 348 | 349 | def predict(self, img): 350 | # LtoR 351 | h_LtoR = self.LtoR.expand(len(img), 1, 2) 352 | 353 | # structure prediction 354 | h_html, o_html = self.html.predict(img, h_LtoR) 355 | 356 | # character prediction 357 | h_html, h_grid = self.grid(img, h_html, o_html) 358 | h_cell, o_cell = self.cell.predict(img, 
h_grid) 359 | 360 | return dict(html=o_html, cell=o_cell, bbox=self.bbox(h_html)) 361 | -------------------------------------------------------------------------------- /mutab/models/encoder.py: -------------------------------------------------------------------------------- 1 | from positional_encodings import torch_encodings as pos 2 | 3 | from mutab.models.factory import ENCODERS 4 | 5 | 6 | @ENCODERS.register_module() 7 | class PositionalEncoding2D(pos.PositionalEncodingPermute2D): 8 | def forward(self, img): 9 | return super().forward(img).add(img).flatten(2).mT 10 | -------------------------------------------------------------------------------- /mutab/models/factory.py: -------------------------------------------------------------------------------- 1 | from mmcv.utils import Registry, build_from_cfg 2 | from mmdet.models.builder import BACKBONES, DETECTORS, LOSSES 3 | 4 | HANDLERS = Registry("handler") 5 | ENCODERS = Registry("encoder") 6 | DECODERS = Registry("decoder") 7 | ATTENTIONS = Registry("attentions") 8 | GC_MODULES = Registry("gc-modules") 9 | 10 | 11 | def build_from_dict(cfg, registry, **kwargs): 12 | return build_from_cfg(dict(**cfg, **kwargs), registry) 13 | 14 | 15 | def build_detector(cfg, **kwargs): 16 | return build_from_dict(cfg, DETECTORS, **kwargs) 17 | 18 | 19 | def build_backbone(cfg, **kwargs): 20 | return build_from_dict(cfg, BACKBONES, **kwargs) 21 | 22 | 23 | def build_encoder(cfg, **kwargs): 24 | return build_from_dict(cfg, ENCODERS, **kwargs) 25 | 26 | 27 | def build_decoder(cfg, **kwargs): 28 | return build_from_dict(cfg, DECODERS, **kwargs) 29 | 30 | 31 | def build_handler(cfg, **kwargs): 32 | return build_from_dict(cfg, HANDLERS, **kwargs) 33 | 34 | 35 | def build_loss(cfg, **kwargs): 36 | return build_from_dict(cfg, LOSSES, **kwargs) 37 | 38 | 39 | def build_gc_module(cfg, **kwargs): 40 | return build_from_dict(cfg, GC_MODULES, **kwargs) 41 | 42 | 43 | def build_attention(cfg, **kwargs): 44 | return build_from_dict(cfg, 
ATTENTIONS, **kwargs) 45 | -------------------------------------------------------------------------------- /mutab/models/handler.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from functools import cached_property 3 | from itertools import product 4 | from typing import Dict, List 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | from more_itertools import flatten, split_at 10 | 11 | from mutab.models.factory import HANDLERS 12 | from mutab.models.revisor import Revisor 13 | 14 | 15 | @HANDLERS.register_module() 16 | class TableHandler(nn.Module): 17 | def __init__( 18 | self, 19 | html_dict_file: str, 20 | cell_dict_file: str, 21 | SOC: List[str], 22 | EOC: List[str], 23 | revisor: Dict[str, str], 24 | ): 25 | super().__init__() 26 | 27 | assert isinstance(html_dict_file, str) 28 | assert isinstance(cell_dict_file, str) 29 | 30 | assert isinstance(SOC, list) 31 | assert isinstance(EOC, list) 32 | 33 | self.SOC = SOC 34 | 35 | self.char2idx_html, self.idx2char_html = self.load(html_dict_file) 36 | self.char2idx_cell, self.idx2char_cell = self.load(cell_dict_file) 37 | 38 | self.SOS_HTML = self.add(self.char2idx_html, self.idx2char_html, "") 39 | self.EOS_HTML = self.add(self.char2idx_html, self.idx2char_html, "") 40 | self.PAD_HTML = self.add(self.char2idx_html, self.idx2char_html, "") 41 | self.UKN_HTML = self.add(self.char2idx_html, self.idx2char_html, "") 42 | 43 | self.SOS_CELL = self.add(self.char2idx_cell, self.idx2char_cell, "") 44 | self.EOS_CELL = self.add(self.char2idx_cell, self.idx2char_cell, "") 45 | self.PAD_CELL = self.add(self.char2idx_cell, self.idx2char_cell, "") 46 | self.SEP_CELL = self.add(self.char2idx_cell, self.idx2char_cell, "") 47 | self.UKN_CELL = self.add(self.char2idx_cell, self.idx2char_cell, "") 48 | 49 | assert len(self.char2idx_html) == len(self.idx2char_html) 50 | assert len(self.char2idx_cell) == 
len(self.idx2char_cell) 51 | 52 | self.char2idx_html = defaultdict(lambda: self.UKN_HTML, self.char2idx_html) 53 | self.char2idx_cell = defaultdict(lambda: self.UKN_CELL, self.char2idx_cell) 54 | 55 | self.revisor = Revisor(**revisor, SOC=SOC, EOC=EOC) 56 | 57 | def load(self, dict_file: str, enc="utf-8"): 58 | with open(dict_file, encoding=enc) as f: 59 | idx2char = list(filter(None, f.read().splitlines())) 60 | char2idx = dict(zip(idx2char, range(len(idx2char)))) 61 | return char2idx, idx2char 62 | 63 | def add(self, char2idx, idx2char, token: str): 64 | idx = len(idx2char) 65 | idx2char.append(token) 66 | char2idx[token] = idx 67 | return idx 68 | 69 | @property 70 | def num_class_html(self): 71 | return len(self.idx2char_html) 72 | 73 | @property 74 | def num_class_cell(self): 75 | return len(self.idx2char_cell) 76 | 77 | @cached_property 78 | def SOC_HTML(self): 79 | return list(self.char2idx_html[v] for v in self.SOC) 80 | 81 | def str2idx(self, strings, char2idx): 82 | return list([char2idx[v] for v in string] for string in strings) 83 | 84 | def idx2str(self, indices, idx2char, join=lambda tokens: tokens): 85 | return list(join([idx2char[i] for i in idx]) for idx in indices) 86 | 87 | def pad_tensor(self, batch, value): 88 | pad = lambda seq, size: F.pad(seq, (0, size - len(seq)), value=value) 89 | return torch.stack([pad(seq, max(map(len, batch))) for seq in batch]) 90 | 91 | def encode_html(self, batch): 92 | samples = [] 93 | for idx in self.str2idx(batch, self.char2idx_html): 94 | idx = (self.SOS_HTML, *idx, self.EOS_HTML) 95 | samples.append(torch.tensor(idx)) 96 | return self.pad_tensor(samples, self.PAD_HTML) 97 | 98 | def encode_cell(self, batch): 99 | samples = [] 100 | sos = self.SOS_CELL 101 | eos = self.EOS_CELL 102 | sep = self.SEP_CELL 103 | for sample in batch: 104 | item = self.str2idx(sample, self.char2idx_cell) 105 | item = flatten(flatten(product(item, [[sep]]))) 106 | samples.append(torch.tensor([sos, *item, eos])) 107 | return 
self.pad_tensor(samples, self.PAD_CELL) 108 | 109 | def decode_html(self, batch): 110 | strip = lambda it: next(split_at(it, lambda n: n == self.EOS_HTML)) 111 | return self.idx2str(map(strip, batch.tolist()), self.idx2char_html) 112 | 113 | def decode_cell(self, batch): 114 | strings = [] 115 | for idx in batch.tolist(): 116 | idx = next(split_at(idx, lambda n: n == self.EOS_CELL)) 117 | idx = list(split_at(idx, lambda n: n == self.SEP_CELL)) 118 | strings.append(self.idx2str(idx, self.idx2char_cell, "".join)) 119 | return strings 120 | 121 | def encode_bbox(self, batch): 122 | pad = lambda bb, k: F.pad(torch.from_numpy(bb), (0, 0, 1, k - len(bb))) 123 | return torch.stack([pad(bb, 1 + max(map(len, batch))) for bb in batch]) 124 | 125 | def decode_bbox(self, batch, mask, img_metas): 126 | results = [] 127 | for bbox, mask, meta in zip(batch, mask, img_metas): 128 | bbox = bbox.cpu().numpy() 129 | mask = mask.cpu().numpy() 130 | scale = meta["img_scale"] 131 | shape = meta["pad_shape"] 132 | bbox[:, 0::2] *= shape[1] 133 | bbox[:, 1::2] *= shape[0] 134 | bbox[:, 0::2] /= scale[1] 135 | bbox[:, 1::2] /= scale[0] 136 | results.append(bbox[mask]) 137 | return results 138 | 139 | def item(self, html, cell, bbox, img_meta): 140 | results = dict(real=self.revisor(**img_meta) if "html" in img_meta else None) 141 | results.update(html=html, cell=cell, bbox=bbox, pred=self.revisor(html, cell)) 142 | return results 143 | 144 | def forward(self, img_metas): 145 | html = self.encode_html([m["html"] for m in img_metas]) 146 | cell = self.encode_cell([m["cell"] for m in img_metas]) 147 | bbox = self.encode_bbox([m["bbox"] for m in img_metas]) 148 | return dict(html=html, back=html.fliplr(), cell=cell, bbox=bbox) 149 | 150 | def reverse(self, html, cell, bbox, img_metas, **kwargs): 151 | mask = torch.isin(html, torch.tensor(self.SOC_HTML).to(html)) 152 | bbox = self.decode_bbox(bbox, mask=mask, img_metas=img_metas) 153 | html = self.decode_html(html) 154 | cell = 
self.decode_cell(cell) 155 | return tuple(map(self.item, html, cell, bbox, img_metas)) 156 | -------------------------------------------------------------------------------- /mutab/models/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from mutab.models.factory import LOSSES 5 | 6 | 7 | @LOSSES.register_module() 8 | class CELoss(nn.Module): 9 | def __init__(self, key: str, ignore_index: int): 10 | super().__init__() 11 | 12 | # keys 13 | self.key = key 14 | self.label = "loss_ce_{}".format(key) 15 | 16 | # loss 17 | self.loss = self.build_loss(ignore_index) 18 | 19 | def build_loss(self, ignore_index): 20 | return nn.CrossEntropyLoss(ignore_index=ignore_index) 21 | 22 | def format(self, outputs, targets): 23 | # outputs [N, C, L] 24 | # targets [N, L] 25 | logit = outputs[self.key].mT 26 | label = targets[self.key][:, 1:] 27 | return logit, label.to(logit.device) 28 | 29 | def forward(self, outputs, targets, img_metas=None): 30 | logit, label = self.format(outputs, targets) 31 | return {self.label: self.loss(logit, label)} 32 | 33 | 34 | @LOSSES.register_module() 35 | class KLLoss(nn.Module): 36 | def __init__(self, key: str, rev: str, ignore_index: int): 37 | super().__init__() 38 | 39 | # keys 40 | self.key = key 41 | self.rev = rev 42 | 43 | # labels 44 | self.loss_key = f"loss_kl_{key}" 45 | self.loss_rev = f"loss_kl_{rev}" 46 | 47 | # prob 48 | self.sm_p = nn.Softmax(dim=2) 49 | self.sm_q = nn.LogSoftmax(dim=2) 50 | 51 | # loss 52 | self.loss = self.build_loss("sum") 53 | 54 | # 55 | pad = torch.tensor(ignore_index) 56 | self.register_buffer("PAD", pad) 57 | 58 | def build_loss(self, reduction): 59 | return nn.KLDivLoss(reduction=reduction) 60 | 61 | def format(self, outputs, targets): 62 | # outputs [N, L, C] 63 | logit_f = outputs[self.key][:, :-1] 64 | logit_b = outputs[self.rev][:, :-1].fliplr() 65 | 66 | # detect 67 | text = targets[self.key][:, 
1:-1].unsqueeze(-1) 68 | mask = ~torch.isin(text.to(self.PAD), self.PAD) 69 | 70 | # P: target 71 | p_f = self.sm_p(logit_b.mul(mask)).detach() 72 | p_b = self.sm_p(logit_f.mul(mask)).detach() 73 | 74 | # Q: output 75 | q_f = self.sm_q(logit_f.mul(mask)) 76 | q_b = self.sm_q(logit_b.mul(mask)) 77 | 78 | return (q_f, p_f), (q_b, p_b), mask 79 | 80 | def forward(self, outputs, targets, img_metas=None): 81 | qp_f, qp_b, mask = self.format(outputs, targets) 82 | kl_f = self.loss(*qp_f).div(mask.sum().clamp(1)) 83 | kl_b = self.loss(*qp_b).div(mask.sum().clamp(1)) 84 | return {self.loss_key: kl_f, self.loss_rev: kl_b} 85 | 86 | 87 | @LOSSES.register_module() 88 | class BBLoss(nn.Module): 89 | def __init__(self, ignore_index: str): 90 | super().__init__() 91 | 92 | # loss 93 | self.loss = self.build_loss("sum") 94 | 95 | # 96 | pad = torch.tensor(ignore_index) 97 | self.register_buffer("PAD", pad) 98 | 99 | def build_loss(self, reduction): 100 | return nn.L1Loss(reduction=reduction) 101 | 102 | def format(self, outputs, targets): 103 | # outputs [N, L, 4] 104 | pred = outputs["bbox"] 105 | 106 | # targets [N, L, 4] 107 | bbox = targets["bbox"][:, 1:].to(pred.device) 108 | 109 | # structural tokens 110 | html = targets["html"][:, 1:].to(pred.device) 111 | 112 | # detect 113 | mask = ~torch.eq(html, self.PAD).unsqueeze(-1) 114 | 115 | # remove 116 | pred = pred.masked_select(mask) 117 | bbox = bbox.masked_select(mask) 118 | 119 | assert pred.dim() == 1 120 | assert bbox.dim() == 1 121 | 122 | # samples 123 | pair_h = pred[0::2], bbox[0::2] 124 | pair_v = pred[1::2], bbox[1::2] 125 | 126 | return pair_h, pair_v, mask 127 | 128 | def forward(self, outputs, targets, img_metas=None): 129 | pair_h, pair_v, mask = self.format(outputs, targets) 130 | loss_h = self.loss(*pair_h).div(mask.sum().clamp(1)) 131 | loss_v = self.loss(*pair_v).div(mask.sum().clamp(1)) 132 | return dict(loss_h=loss_h, loss_v=loss_v) 133 | 
-------------------------------------------------------------------------------- /mutab/models/revisor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict, List 3 | 4 | 5 | class Revisor: 6 | def __init__( 7 | self, 8 | SOC: List[str], 9 | EOC: List[str], 10 | template: str, 11 | patterns: Dict[str, Dict[str, str]], 12 | ): 13 | assert isinstance(SOC, list) 14 | assert isinstance(EOC, list) 15 | 16 | self.SOC = SOC 17 | self.EOC = EOC 18 | 19 | assert isinstance(template, str) 20 | assert isinstance(patterns, dict) 21 | 22 | self.template = template 23 | self.patterns = patterns 24 | 25 | def merge(self, html, cell): 26 | contents = iter(cell) 27 | internal = False 28 | restored = [] 29 | for idx, el in enumerate(html): 30 | if el in self.SOC: 31 | internal = True 32 | if internal and el in self.EOC: 33 | ch = "".join(next(contents, "")) 34 | el = el.replace(" 1: 79 | nn.init.xavier_uniform_(p) 80 | 81 | @auto_fp16(apply_to=["img"]) 82 | def forward(self, img, img_metas, return_loss=True, **kwargs): 83 | if return_loss: 84 | return self.forward_train(img, img_metas) 85 | elif isinstance(img_metas[0], list): 86 | return self.forward_test(img, img_metas[0]) 87 | else: 88 | return self.forward_test(img, img_metas) 89 | 90 | def train_step(self, data, optimizer): 91 | loss = self.parse_losses(self(**data)) 92 | loss.update(num_samples=len(data["img_metas"])) 93 | return loss 94 | 95 | def val_step(self, data, optimizer): 96 | loss = self.parse_losses(self(**data)) 97 | loss.update(num_samples=len(data["img_metas"])) 98 | return loss 99 | 100 | def parse_losses(self, losses): 101 | logs = dict({k: v.mean() for k, v in losses.items()}) 102 | loss = sum(v for k, v in logs.items() if "loss" in k) 103 | logs.update(loss=loss) 104 | for key, value in logs.items(): 105 | # reduce loss when distributed training 106 | if dist.is_available() and dist.is_initialized(): 107 | value = value.data.clone() 108 
| world = int(dist.get_world_size()) 109 | dist.all_reduce(value.div_(world)) 110 | logs[key] = value.item() 111 | return dict(loss=loss, log_vars=logs) 112 | 113 | def forward_train(self, image, img_metas): 114 | targets = self.handler.forward(img_metas) 115 | outputs = self.decoder(self.encoder(self.backbone(image)), **targets) 116 | return ChainMap(*[f(outputs, targets, img_metas) for f in self.loss]) 117 | 118 | def forward_test(self, images, img_metas): 119 | return self.simple_test(images, img_metas) 120 | 121 | def simple_test(self, image, img_metas): 122 | outputs = self.decoder.predict(self.encoder(self.backbone(image))) 123 | return self.handler.reverse(**outputs, img_metas=tuple(img_metas)) 124 | 125 | def predict(self, path: str): 126 | return dict(path=path, **model_inference(self, imread(path))) 127 | -------------------------------------------------------------------------------- /mutab/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | from mutab.optimizer.factory import OPTIMIZERS 2 | 3 | __all__ = ["OPTIMIZERS"] 4 | -------------------------------------------------------------------------------- /mutab/optimizer/factory.py: -------------------------------------------------------------------------------- 1 | from inspect import isclass 2 | 3 | from mmcv.runner.optimizer.builder import OPTIMIZERS 4 | from ranger.ranger2020 import Ranger 5 | from torch import optim 6 | from torch.optim import Optimizer 7 | 8 | 9 | def register_torch_optimizers(): 10 | for name in dir(optim): 11 | if name.startswith("__"): 12 | continue 13 | _optim = getattr(optim, name) 14 | if isclass(_optim) and issubclass(_optim, Optimizer): 15 | if name not in OPTIMIZERS.module_dict.keys(): 16 | OPTIMIZERS.register_module()(_optim) 17 | 18 | if isclass(Ranger) and issubclass(Ranger, Optimizer): 19 | OPTIMIZERS.register_module()(Ranger) 20 | 21 | 22 | register_torch_optimizers() 23 | 
-------------------------------------------------------------------------------- /mutab/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import collect_env, get_logger, pretty_env, visualize_bbox 2 | 3 | __all__ = ["collect_env", "get_logger", "pretty_env", "visualize_bbox"] 4 | -------------------------------------------------------------------------------- /mutab/utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import cv2 4 | import mmcv.utils as utils 5 | 6 | 7 | def get_logger(**kwargs): 8 | return utils.get_logger("mmdet", **kwargs) 9 | 10 | 11 | def collect_env(): 12 | return dict(**utils.collect_env(), commit=utils.get_git_hash()) 13 | 14 | 15 | def pretty_env(bar: str): 16 | contents = list(f"{k}: {v}" for k, v in collect_env().items()) 17 | return "\n".join(["", bar] + contents + [bar, ""]) 18 | 19 | 20 | def visualize_bbox(bbox, path, save, **kwargs): 21 | img = cv2.imread(path) 22 | for x, y, w, h in bbox: 23 | a = int(x - w / 2), int(y - h / 2) 24 | b = int(x + w / 2), int(y + h / 2) 25 | img = cv2.rectangle(img, a, b, (0, 0, 255), thickness=1) 26 | cv2.imwrite(os.path.join(save, os.path.basename(path)), img) 27 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "mutab" 3 | version = "0.1.0" 4 | dependencies = [ 5 | "apted", 6 | "distance", 7 | "lxml", 8 | "mmcv-full<2", 9 | "mmdet<3", 10 | "mmocr<1", 11 | "more-itertools", 12 | "numpy", 13 | "positional-encodings[pytorch]", 14 | "ranger@git+https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer", 15 | "rotary-embedding-torch", 16 | "torch>=2,<2.1", 17 | "tqdm", 18 | "urllib3<2", 19 | "yapf==0.40.1", 20 | ] 21 | 22 | [tool.setuptools.packages.find] 23 | include = ["mutab"] 24 | 
-------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import lzma 4 | import os 5 | import pickle 6 | import time 7 | from datetime import timedelta as td 8 | from glob import glob 9 | from pathlib import Path 10 | 11 | import numpy as np 12 | from more_itertools import divide 13 | from torch.multiprocessing import set_start_method 14 | from tqdm import tqdm 15 | 16 | from mutab.apis import evaluate 17 | from mutab.utils import visualize_bbox 18 | 19 | EASY = "simple" 20 | HARD = "complex" 21 | 22 | 23 | def main(): 24 | args = argparse.ArgumentParser() 25 | args.add_argument("--gpus", type=int, default=4) 26 | args.add_argument("--show", action="store_true") 27 | args.add_argument("--ckpt", type=str, default="latest.pth") 28 | args.add_argument("--save", type=str, default="results.xz") 29 | args.add_argument("--json", type=str, required=True) 30 | args.add_argument("--conf", type=str, required=True) 31 | args.add_argument("--path", type=str, required=True) 32 | args = args.parse_args() 33 | 34 | root = Path(args.ckpt).parent.expanduser() 35 | 36 | with open(args.json) as f: 37 | jsonl_ground_truth = json.load(f) 38 | 39 | set_start_method("spawn") 40 | count = time.perf_counter() 41 | paths = divide(args.gpus, glob(os.path.join(args.path, "*.png"))) 42 | items = evaluate(paths, args.conf, args.ckpt, jsonl_ground_truth) 43 | count = td(seconds=time.perf_counter() - count) / td(hours=1) 44 | 45 | easy = list(v for v in items.values() if v["type"] == EASY) 46 | hard = list(v for v in items.values() if v["type"] == HARD) 47 | 48 | summary = {} 49 | summary.update(html=np.mean([v["TEDS"]["html"] for v in items.values()])) 50 | summary.update(full=np.mean([v["TEDS"]["full"] for v in items.values()])) 51 | summary.update(easy=np.mean([v["TEDS"]["full"] for v in easy])) 52 | 
summary.update(hard=np.mean([v["TEDS"]["full"] for v in hard])) 53 | 54 | with open(root.joinpath("{}.log".format(args.save)), "w") as f: 55 | print(f"{len(items)} samples in {count:.2f} hours:", file=f) 56 | print(f"AVG TEDS html score: {summary['html']:.4f}", file=f) 57 | print(f"AVG TEDS full score: {summary['full']:.4f}", file=f) 58 | print(f"AVG TEDS easy score: {summary['easy']:.4f}", file=f) 59 | print(f"AVG TEDS hard score: {summary['hard']:.4f}", file=f) 60 | 61 | with lzma.open(root.joinpath(args.save), "wb") as f: 62 | pickle.dump(dict(results=items, summary=summary, **vars(args)), f) 63 | 64 | if args.show: 65 | for name, item in tqdm(list(items.items())): 66 | visualize_bbox(**item, save=root) 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import argparse 3 | 4 | from mmcv import Config 5 | from mmcv.runner import init_dist 6 | from torch.multiprocessing import set_start_method 7 | 8 | from mutab.apis import train 9 | 10 | 11 | def main(): 12 | args = argparse.ArgumentParser() 13 | args.add_argument("config") 14 | args.add_argument("--work-dir", required=True) 15 | args.add_argument("--launcher", required=False) 16 | args, _ = args.parse_known_args() 17 | 18 | cfg = Config.fromfile(args.config) 19 | cfg.update(**vars(args)) 20 | set_start_method("fork") 21 | 22 | if args.launcher is not None: 23 | init_dist(args.launcher, **cfg.dist_params) 24 | 25 | train(cfg, args.config) 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ $# -lt 3 ] 4 | then 5 | echo "Usage: bash $0 CONFIG WORK_DIR GPUS" 6 | exit 7 | fi 8 | 9 | 
BIN=${BIN:-python3} 10 | CONFIG=$1 11 | WORK_DIR=$2 12 | GPUS=$3 13 | 14 | PORT=${PORT:-29500} 15 | SCRIPT=$(dirname $0)/train.py 16 | 17 | if [ ${GPUS} == 1 ]; then 18 | $BIN $SCRIPT $CONFIG --work-dir=${WORK_DIR} 19 | else 20 | $BIN -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT $SCRIPT $CONFIG --work-dir=${WORK_DIR} --launcher pytorch 21 | fi 22 | --------------------------------------------------------------------------------