├── CascadeTab
│   ├── Config
│   │   └── cascade_mask_rcnn_hrnetv2p_w32_20e.py
│   ├── Functions
│   │   ├── blessFunc.py
│   │   ├── borderFunc.py
│   │   └── line_detection.py
│   ├── border.py
│   └── main.py
├── Data Preparation
│   ├── Dilation.py
│   ├── Images
│   │   └── 3img.png
│   └── Smudge.py
├── Document layout analysis
│   ├── Config file
│   │   └── cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py
│   ├── ICDAR_XML_to_COCO.py
│   ├── main.py
│   └── test_train_split.py
├── LICENSE.md
├── README.md
├── Table_detection&Structure_recognition.ipynb
└── literature-survey.md
/CascadeTab/Config/cascade_mask_rcnn_hrnetv2p_w32_20e.py:
--------------------------------------------------------------------------------
1 | # model settings
2 | model = dict(
3 | type='CascadeRCNN',
4 | num_stages=3,
5 | pretrained='open-mmlab://msra/hrnetv2_w32',
6 | backbone=dict(
7 | type='HRNet',
8 | extra=dict(
9 | stage1=dict(
10 | num_modules=1,
11 | num_branches=1,
12 | block='BOTTLENECK',
13 | num_blocks=(4, ),
14 | num_channels=(64, )),
15 | stage2=dict(
16 | num_modules=1,
17 | num_branches=2,
18 | block='BASIC',
19 | num_blocks=(4, 4),
20 | num_channels=(32, 64)),
21 | stage3=dict(
22 | num_modules=4,
23 | num_branches=3,
24 | block='BASIC',
25 | num_blocks=(4, 4, 4),
26 | num_channels=(32, 64, 128)),
27 | stage4=dict(
28 | num_modules=3,
29 | num_branches=4,
30 | block='BASIC',
31 | num_blocks=(4, 4, 4, 4),
32 | num_channels=(32, 64, 128, 256)))),
33 | neck=dict(type='HRFPN', in_channels=[32, 64, 128, 256], out_channels=256),
34 | rpn_head=dict(
35 | type='RPNHead',
36 | in_channels=256,
37 | feat_channels=256,
38 | anchor_scales=[8],
39 | anchor_ratios=[0.5, 1.0, 2.0],
40 | anchor_strides=[4, 8, 16, 32, 64],
41 | target_means=[.0, .0, .0, .0],
42 | target_stds=[1.0, 1.0, 1.0, 1.0],
43 | loss_cls=dict(
44 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
45 | loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
46 | bbox_roi_extractor=dict(
47 | type='SingleRoIExtractor',
48 | roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
49 | out_channels=256,
50 | featmap_strides=[4, 8, 16, 32]),
51 | bbox_head=[
52 | dict(
53 | type='SharedFCBBoxHead',
54 | num_fcs=2,
55 | in_channels=256,
56 | fc_out_channels=1024,
57 | roi_feat_size=7,
58 | num_classes=81,
59 | target_means=[0., 0., 0., 0.],
60 | target_stds=[0.1, 0.1, 0.2, 0.2],
61 | reg_class_agnostic=True,
62 | loss_cls=dict(
63 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
64 | loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
65 | dict(
66 | type='SharedFCBBoxHead',
67 | num_fcs=2,
68 | in_channels=256,
69 | fc_out_channels=1024,
70 | roi_feat_size=7,
71 | num_classes=81,
72 | target_means=[0., 0., 0., 0.],
73 | target_stds=[0.05, 0.05, 0.1, 0.1],
74 | reg_class_agnostic=True,
75 | loss_cls=dict(
76 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
77 | loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
78 | dict(
79 | type='SharedFCBBoxHead',
80 | num_fcs=2,
81 | in_channels=256,
82 | fc_out_channels=1024,
83 | roi_feat_size=7,
84 | num_classes=81,
85 | target_means=[0., 0., 0., 0.],
86 | target_stds=[0.033, 0.033, 0.067, 0.067],
87 | reg_class_agnostic=True,
88 | loss_cls=dict(
89 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
90 | loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
91 | ],
92 | mask_roi_extractor=dict(
93 | type='SingleRoIExtractor',
94 | roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2),
95 | out_channels=256,
96 | featmap_strides=[4, 8, 16, 32]),
97 | mask_head=dict(
98 | type='FCNMaskHead',
99 | num_convs=4,
100 | in_channels=256,
101 | conv_out_channels=256,
102 | num_classes=81,
103 | loss_mask=dict(
104 | type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)))
105 | # model training and testing settings
106 | train_cfg = dict(
107 | rpn=dict(
108 | assigner=dict(
109 | type='MaxIoUAssigner',
110 | pos_iou_thr=0.7,
111 | neg_iou_thr=0.3,
112 | min_pos_iou=0.3,
113 | ignore_iof_thr=-1),
114 | sampler=dict(
115 | type='RandomSampler',
116 | num=256,
117 | pos_fraction=0.5,
118 | neg_pos_ub=-1,
119 | add_gt_as_proposals=False),
120 | allowed_border=0,
121 | pos_weight=-1,
122 | debug=False),
123 | rpn_proposal=dict(
124 | nms_across_levels=False,
125 | nms_pre=2000,
126 | nms_post=2000,
127 | max_num=2000,
128 | nms_thr=0.7,
129 | min_bbox_size=0),
130 | rcnn=[
131 | dict(
132 | assigner=dict(
133 | type='MaxIoUAssigner',
134 | pos_iou_thr=0.5,
135 | neg_iou_thr=0.5,
136 | min_pos_iou=0.5,
137 | ignore_iof_thr=-1),
138 | sampler=dict(
139 | type='RandomSampler',
140 | num=512,
141 | pos_fraction=0.25,
142 | neg_pos_ub=-1,
143 | add_gt_as_proposals=True),
144 | mask_size=28,
145 | pos_weight=-1,
146 | debug=False),
147 | dict(
148 | assigner=dict(
149 | type='MaxIoUAssigner',
150 | pos_iou_thr=0.6,
151 | neg_iou_thr=0.6,
152 | min_pos_iou=0.6,
153 | ignore_iof_thr=-1),
154 | sampler=dict(
155 | type='RandomSampler',
156 | num=512,
157 | pos_fraction=0.25,
158 | neg_pos_ub=-1,
159 | add_gt_as_proposals=True),
160 | mask_size=28,
161 | pos_weight=-1,
162 | debug=False),
163 | dict(
164 | assigner=dict(
165 | type='MaxIoUAssigner',
166 | pos_iou_thr=0.7,
167 | neg_iou_thr=0.7,
168 | min_pos_iou=0.7,
169 | ignore_iof_thr=-1),
170 | sampler=dict(
171 | type='RandomSampler',
172 | num=512,
173 | pos_fraction=0.25,
174 | neg_pos_ub=-1,
175 | add_gt_as_proposals=True),
176 | mask_size=28,
177 | pos_weight=-1,
178 | debug=False)
179 | ],
180 | stage_loss_weights=[1, 0.5, 0.25])
181 | test_cfg = dict(
182 | rpn=dict(
183 | nms_across_levels=False,
184 | nms_pre=1000,
185 | nms_post=1000,
186 | max_num=1000,
187 | nms_thr=0.7,
188 | min_bbox_size=0),
189 | rcnn=dict(
190 | score_thr=0.05,
191 | nms=dict(type='nms', iou_thr=0.5),
192 | max_per_img=100,
193 | mask_thr_binary=0.5))
194 | # dataset settings
195 | dataset_type = 'CocoDataset'
196 | data_root = '/content/drive/My Drive/Mmdetection/'
197 | img_norm_cfg = dict(
198 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
199 | train_pipeline = [
200 | dict(type='LoadImageFromFile'),
201 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
202 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
203 | dict(type='RandomFlip', flip_ratio=0.5),
204 | dict(type='Normalize', **img_norm_cfg),
205 | dict(type='Pad', size_divisor=32),
206 | dict(type='DefaultFormatBundle'),
207 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
208 | ]
209 | test_pipeline = [
210 | dict(type='LoadImageFromFile'),
211 | dict(
212 | type='MultiScaleFlipAug',
213 | img_scale=(1333, 800),
214 | flip=False,
215 | transforms=[
216 | dict(type='Resize', keep_ratio=True),
217 | dict(type='RandomFlip'),
218 | dict(type='Normalize', **img_norm_cfg),
219 | dict(type='Pad', size_divisor=32),
220 | dict(type='ImageToTensor', keys=['img']),
221 | dict(type='Collect', keys=['img']),
222 | ])
223 | ]
224 | data = dict(
225 | imgs_per_gpu=2,
226 | workers_per_gpu=2,
227 | train=dict(
228 | type=dataset_type,
229 | ann_file='/content/drive/My Drive/chunk.json',
230 | img_prefix='/content/drive/My Drive/chunk_images/',
231 | pipeline=train_pipeline),
232 | val=dict(
233 | type=dataset_type,
234 | ann_file=data_root + 'VOC2007/test.json',
235 | img_prefix=data_root + 'VOC2007/Test/',
236 | pipeline=test_pipeline),
237 | test=dict(
238 | type=dataset_type,
239 | ann_file=data_root + 'VOC2007/test.json',
240 | img_prefix=data_root + 'VOC2007/Test/',
241 | pipeline=test_pipeline))
242 | # evaluation = dict(interval=1, metric=['bbox'])
243 | # optimizer
244 | optimizer = dict(type='SGD', lr=0.0012, momentum=0.9, weight_decay=0.0001)
245 | optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
246 | # learning policy
247 | lr_config = dict(
248 | policy='step',
249 | warmup='linear',
250 | warmup_iters=500,
251 | warmup_ratio=1.0 / 3,
252 | step=[16, 19])
253 | checkpoint_config = dict(interval=1, create_symlink=False)
254 | # yapf:disable
255 | log_config = dict(
256 | interval=50,
257 | hooks=[
258 | dict(type='TextLoggerHook'),
259 | # dict(type='TensorboardLoggerHook')
260 | ])
261 | # yapf:enable
262 | # runtime settings
263 | total_epochs = 36
264 | dist_params = dict(backend='nccl')
265 | log_level = 'INFO'
266 | work_dir = '/content/drive/My Drive/Mmdetection/new_chunk_cascade_mask_rcnn_hrnetv2p_w32_20e'
267 | load_from = None
268 | resume_from = '/content/drive/My Drive/Mmdetection/new_chunk_cascade_mask_rcnn_hrnetv2p_w32_20e/epoch_30.pth'
269 | workflow = [('train', 1)]
270 |
--------------------------------------------------------------------------------
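
How a config like this is consumed (a minimal sketch, assuming MMDetection v1.2 is installed and a trained checkpoint is at hand; the checkpoint and image paths below are placeholders, not the repository's):

```python
# Training uses MMDetection's standard entry point:
#   python tools/train.py CascadeTab/Config/cascade_mask_rcnn_hrnetv2p_w32_20e.py
from mmdet.apis import init_detector, inference_detector

config_file = 'CascadeTab/Config/cascade_mask_rcnn_hrnetv2p_w32_20e.py'
checkpoint_file = 'epoch_36.pth'        # placeholder trained checkpoint
model = init_detector(config_file, checkpoint_file, device='cuda:0')

bbox_results, mask_results = inference_detector(model, 'page.png')  # placeholder image
# bbox_results[k] is an (N, 5) array of [x1, y1, x2, y2, score] rows for class k;
# CascadeTab/main.py reads k=0 as bordered table, 1 as cell, 2 as borderless table.
```
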
/CascadeTab/Functions/blessFunc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import cv2
3 | from Functions.borderFunc import extract_table
4 | import lxml.etree as etree
5 | import os
6 |
7 | ## Input : roi of one cell
8 | ## Output : bounding box for the text in that cell
9 | def extractTextBless(img):
10 | return_arr = []
11 | h,w=img.shape[0:2]
12 | base_size=h+14,w+14,3
13 | img_np = np.zeros(base_size,dtype=np.uint8)
14 | cv2.rectangle(img_np,(0,0),(w+14,h+14),(255,255,255),30)
15 | img_np[7:h+7,7:w+7]=img
16 |
17 | gray = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
18 | # blur = cv2.GaussianBlur(gray,(5,5),0)
19 | ret, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
20 | rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 1))
21 | dilation = cv2.dilate(thresh1, rect_kernel, iterations = 2)
22 |
23 | contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_NONE)
24 | for cnt in (contours):
25 | if cv2.contourArea(cnt) < 20:
26 | continue
27 | x, y, w, h = cv2.boundingRect(cnt)
28 | if(h<6) or w<4 or h/img.shape[0]>0.95 or h>30:
29 | continue
30 | return_arr.append([x-7, y-7, w, h])
31 | return return_arr
32 |
33 | ## Input : Roi of Table, Original Image, Cells Detected
34 | ## Output : Returns an XML element which contains the bounding boxes of the text chunks
35 | def borderless(table, image, res_cells):
36 | cells = []
37 | x_lines = []
38 | y_lines = []
39 | # padding of the table
40 | print(table)
41 | table[0],table[1],table[2],table[3] = table[0]-15,table[1]-15,table[2]+15,table[3]+15
42 | print (table[0])
43 | print (table[1])
44 | print (table[2])
45 | print (table[3])
46 |
47 | # extracting the cells that might belong to that table
48 | for cell in res_cells:
49 | if cell[0]>table[0]-50 and cell[1]>table[1]-50 and cell[2]<table[2]+50 and cell[3]<table[3]+50:
50 | cells.append(cell)
68 | if (cell[1]>last-15) or (cell[3]<temp-15):
69 | if cell[3]>temp:
70 | temp = cell[3]
71 | else:
72 | last = cell[1]
73 | if last > temp:
74 | row.append((last+temp)//2)
75 | if prev is not None:
76 | if ((last+temp)//2) < prev + 10 or ((last+temp)//2) < prev - 10:
77 | row.pop()
78 | prev = (last+temp)//2
79 | temp = cell[3]
80 |
81 | row.append(table[3]+50)
82 | i=1
83 | rows = []
84 | for r in range(len(row)):
85 | rows.append([])
86 | # rows creates an empty matrix with the no. of entries equal to the no. of rows in the table
87 | final_rows = rows
88 | maxr = -111
89 | # print(len(row))
90 | #It stores all the cells to the specific rows
91 | for cell in cells:
92 | if cell[3]
110 | if (r[0]<=prevr[0]+5 and r[0]>=prevr[0]-5) or (r[2]<=prevr[2]+5 and r[2]>=prevr[2]-5):
111 | if r[4]
135 | if r[3]>lasty[n][1]:
136 | lasty[n][1] = r[3]
137 | # print("last y:",lasty)
138 |
139 | # taking the mid value of the prev y2 and the current y1 in a row so as to find the right y coordinate for the row line
140 | row = []
141 | row.append(table[1])
142 | prev = None
143 | pr = None
144 | for x in range(len(lasty)-1):
145 | if x==0 and prev==None:
146 | prev = lasty[x]
147 | else:
148 | if pr is not None:
149 | if abs(((lasty[x][0]+prev[1])//2)-pr)<=10:
150 | row.pop()
151 | row.append((lasty[x][0]+prev[1])//2)
152 | else:
153 | row.append((lasty[x][0]+prev[1])//2)
154 | else:
155 | row.append((lasty[x][0]+prev[1])//2)
156 | pr = (lasty[x][0]+prev[1])//2
157 | prev = lasty[x]
158 | row.append(table[3])
159 |
160 | #finding the max. no. of cells in all the rows which is equal to the number of columns
161 | maxr = 0
162 | for r2 in final_rows:
163 | print(r2)
164 | if len(r2)>maxr:
165 | maxr = len(r2)
166 |
167 |
168 | lastx = []
169 | # according to the x1 and x2 coordinates of each cell in a row, finding the x1 and x2 coordinates for each column
170 | for n in range(maxr):
171 | lastx.append([999999999,0])
172 |
173 | for r2 in final_rows:
174 | if len(r2)==maxr:
175 | for n,col in enumerate(r2):
176 | # print(col)
177 | if col[2]>lastx[n][1]:
178 | lastx[n][1] = col[2]
179 | if col[0]<lastx[n][0]:
180 | lastx[n][0] = col[0]
187 | r2[r][0]):
188 | r +=1
189 | if n != 0:
190 | if r2[r-1][0] > lastx[n-1][1]:
191 | if r2[r-1][0]
197 | if col[2]>lastx[n][1]:
198 | lastx[n][1] = col[2]
199 |
200 | # for each column, taking the mid value of the previous x2 and the current x1 so as to draw the column lines
201 | print(lastx)
202 | col = np.zeros(maxr+1)
203 | col[0] = table[0]
204 | prev = 0
205 | i = 1
206 | for x in range(len(lastx)):
207 | if x==0:
208 | prev = lastx[x]
209 | else:
210 | col[i] = (lastx[x][0]+prev[1])//2
211 | i+=1
212 | prev = lastx[x]
213 | col = col.astype(int)
214 | col[maxr] = table[2]
215 |
216 | _row_ = sorted(row, key=lambda x:x)
217 | _col_ = sorted(col, key=lambda x:x)
218 |
219 | #drawing lines acc. to the values(drawing the row and the column lines)
220 | for no,c in enumerate(_col_):
221 | x_lines.append([c,table[1],c,table[3]])
222 | cv2.line(im2,(c,table[1]),(c,table[3]),(255,0,0),1)
223 | for no,c in enumerate(_row_):
224 | y_lines.append([table[0],c,table[2],c])
225 | cv2.line(im2,(table[0],c),(table[2],c),(255,0,0),1)
226 |
227 | # cv2_imshow(im2)
228 | print("table:",table)
229 | # for r in row:
230 | # cv2.line(im2,(r,table[1]),(r,table[3]),(0,255,0),1)
231 | # for c in col:
232 | # cv2.line(im2,(c,table[1]),(c,table[3]),(0,255,0),1)
233 | final = extract_table(image[table[1]:table[3],table[0]:table[2]],0,(y_lines,x_lines))
234 |
235 | cellBoxes = []
236 | img4 = image.copy()
237 | for box in final:
238 | cellBox = extractTextBless(image[box[1]:box[3],box[0]:box[4]])
239 | for cell in cellBox:
240 | cellBoxes.append([box[0]+cell[0], box[1]+cell[1], cell[2], cell[3]])
241 | cv2.rectangle(img4, (box[0]+cell[0], box[1]+cell[1]), (box[0]+cell[0]+cell[2], box[1]+cell[1]+cell[3]), (255,0,0), 2)
242 |
243 | # cv2_imshow(img4)
244 |
245 | the_last_y = -1
246 | cellBoxes = sorted(cellBoxes,key=lambda x: x[1])
247 | cellBoxes2BeMerged = []
248 | cellBoxes2BeMerged.append([])
249 | rowCnt = 0
250 | for cell in cellBoxes:
251 | if(the_last_y == -1):
252 | the_last_y = cell[1]
253 | cellBoxes2BeMerged[rowCnt].append(cell)
254 | continue
255 | if(abs(cell[1]-the_last_y) < 8):
256 | cellBoxes2BeMerged[rowCnt].append(cell)
257 | else:
258 | the_last_y=cell[1]
259 | rowCnt+=1
260 | cellBoxes2BeMerged.append([])
261 | cellBoxes2BeMerged[rowCnt].append(cell)
262 |
263 | MergedBoxes = []
264 | for cellrow in cellBoxes2BeMerged:
265 | cellrow = sorted(cellrow,key=lambda x: x[0])
266 | cur_cell = -1
267 | for c,cell in enumerate(cellrow):
268 | if(cur_cell == -1):
269 | cur_cell = cell
270 | continue
271 | if(len(cellrow)==1):
272 | MergedBoxes.append(cell)
273 | break
274 | if(abs((cur_cell[0]+cur_cell[2])-cell[0]) < 10):
275 | cur_cell[2] = cur_cell[2] + cell[2] + (cell[0]- (cur_cell[0]+cur_cell[2]))
276 | if(cur_cell[3]<cell[3]):
277 | cur_cell[3] = cell[3]
304 | if(mbox[0] >= box[0] and mbox[1] >= box[1] and mbox[2] <= box[4] and mbox[3] <= box[3]):
305 | if(len(tcurcell) == 0):
306 | tcurcell = mbox
307 | else:
308 | if(mbox[0] < tcurcell[0]):
309 | tcurcell[0] = mbox[0]
310 | if(mbox[1] < tcurcell[1]):
311 | tcurcell[1] = mbox[1]
312 | if(mbox[2] > tcurcell[2]):
313 | tcurcell[2] = mbox[2]
314 | if(mbox[3] > tcurcell[3]):
315 | tcurcell[3] = mbox[3]
316 |
317 | for i,frow in enumerate(final_rows):
318 | for j,fbox in enumerate(frow):
319 | if(fbox[0] >= box[0] and fbox[0] <= box[4] and fbox[1] >= box[1] and fbox[1] <= box[3]):
320 | mcurcell = fbox
321 | final_rows[i].pop(j)
322 | break
323 |
324 | if(abs(ycnt-box[1])>10):
325 | rcnt+=1
326 | TextChunks.append([])
327 | ycnt = box[1]
328 |
329 | if(len(tcurcell)==0):
330 | if(len(mcurcell)==0):
331 | continue
332 | else:
333 | TextChunks[rcnt].append(mcurcell)
334 | else:
335 | if(len(mcurcell)==0):
336 | TextChunks[rcnt].append(tcurcell)
337 | else:
338 | if(abs(mcurcell[0] - tcurcell[0])<=20 and abs(mcurcell[1] - tcurcell[1])<=20 and abs(mcurcell[2] - tcurcell[2])<=20 and abs(mcurcell[3] - tcurcell[3])<=20):
339 | TextChunks[rcnt].append(tcurcell)
340 | elif((abs(mcurcell[0] - tcurcell[0])<=20 and abs(mcurcell[2] - tcurcell[2])<=20) or (abs(mcurcell[1] - tcurcell[1])<=20 or abs(mcurcell[3] - tcurcell[3])<=20)):
341 | TextChunks[rcnt].append(mcurcell)
342 | else:
343 | TextChunks[rcnt].append(tcurcell)
344 |
345 | colors = [(255,0,0),(0,255,0),(0,0,255),(125,125,0),(0,255,255)]
346 | for no,r in enumerate(TextChunks):
347 | for tbox in r:
348 | cv2.rectangle(im2, (tbox[0], tbox[1]), (tbox[2], tbox[3]), colors[no%len(colors)], 1)
349 | # print(tbox)
350 | # cv2_imshow("text chunks", im2)
351 | # cv2.waitKey(0)
352 |
353 | def rowstart(val):
354 | r = 0
355 | while(val > _row_[r]):
356 | r += 1
357 | if r-1 == -1:
358 | return r
359 | else:
360 | return r-1
361 |
362 | def rowend(val):
363 | r = 0
364 | while(val > _row_[r]):
365 | r += 1
366 | if r-1 == -1:
367 | return r
368 | else:
369 | return r-1
370 |
371 | def colstart(val):
372 | r = 0
373 | while(r < len(_col_) and val > _col_[r]):
374 | r += 1
375 | if r-1 == -1:
376 | return r
377 | else:
378 | return r-1
379 |
380 | def colend(val):
381 | r = 0
382 | while(r < len(_col_) and val > _col_[r]):
383 | r += 1
384 | if r-1 == -1:
385 | return r
386 | else:
387 | return r-1
388 |
389 | tableXML = etree.Element("table")
390 | Tcoords = etree.Element("Coords", points=str(table[0])+","+str(table[1])+" "+str(table[0])+","+str(table[3])+" "+str(table[2])+","+str(table[3])+" "+str(table[2])+","+str(table[1]))
391 | tableXML.append(Tcoords)
392 | for final in TextChunks:
393 | for box in final:
394 | cell = etree.Element("cell")
395 | end_col,end_row,start_col,start_row = colend(box[2]),rowend(box[3]),colstart(box[0]),rowstart(box[1])
396 | cell.set("end-col",str(end_col))
397 | cell.set("end-row",str(end_row))
398 | cell.set("start-col",str(start_col))
399 | cell.set("start-row",str(start_row))
400 |
401 | # print(cellBox)
402 | one = str(box[0])+","+str(box[1])
403 | two = str(box[0])+","+str(box[3])
404 | three = str(box[2])+","+str(box[3])
405 | four = str(box[2])+","+str(box[1])
406 | # print(one)
407 | coords = etree.Element("Coords", points=one+" "+two+" "+three+" "+four)
408 |
409 | cell.append(coords)
410 | tableXML.append(cell)
411 |
412 | return tableXML
413 |
--------------------------------------------------------------------------------
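
For reference, the element borderless() returns nests a table-level Coords plus one cell per text chunk; a hand-built example of that shape (coordinates invented purely for illustration):

```python
import lxml.etree as etree

table = etree.Element("table")
table.append(etree.Element("Coords", points="10,20 10,200 500,200 500,20"))
cell = etree.Element("cell")
cell.set("start-row", "0")
cell.set("start-col", "0")
cell.set("end-row", "0")
cell.set("end-col", "1")
cell.append(etree.Element("Coords", points="12,22 12,40 80,40 80,22"))
table.append(cell)
print(etree.tostring(table, pretty_print=True).decode())
```
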
/CascadeTab/Functions/borderFunc.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | from Functions.line_detection import line_detection
3 | import os
4 |
5 | ################## Functions required for Border table Recognition #################
6 |
7 | ## Return the intersection of lines only if intersection is present ##
8 | # Input : x1, y1, x2, y2, x3, y3, x4, y4 (1: vertical, 2: horizontal)
9 | # Output : (x,y) Intersection point
10 | def line_intersection(x1, y1, x2, y2, x3, y3, x4, y4):
11 | # print(x1, y1, x2, y2)
12 | # print(x3, y3, x4, y4)
13 |
14 | if((x1>= x3-5 or x1>= x3+5) and (x1 <= x4+5 or x1 <= x4-5) and (y3+8>=min(y1,y2) or y3-5>=min(y1,y2)) and y3<=max(y1,y2)+5):
15 | return x1,y3
16 |
17 |
18 | ## main extraction function ##
19 | # Input : Image, Decision parameter (1/0), lines for borderless (only if decision parameter is 0)
20 | # Output : Array of cells
21 | def extract_table(table_body,__line__,lines=None):
22 | # Deciding variable
23 | #print (table_body)
24 | print (__line__)
25 | if(__line__ == 1 ):
26 | # Check if table image is bordered or borderless
27 | temp_lines_hor, temp_lines_ver = line_detection(table_body)
28 | print ("temp_lines_hor",temp_lines_hor)
29 | print ("temp_lines_ver",temp_lines_ver)
30 | else:
31 | temp_lines_hor, temp_lines_ver = lines
32 |
33 | if (temp_lines_hor is None) or (temp_lines_ver is None):
34 | print("Either Horizontal Or Vertical Lines Not Detected")
35 | return None
36 |
37 | table = table_body.copy()
38 | x = 0
39 | y = 0
40 | k = 0
41 | points = []
42 | print("[Table status] : Processing table with lines")
43 | # Remove same lines detected closer
44 | for x1, y1, x2, y2 in temp_lines_ver:
45 | point = []
46 | for x3, y3, x4, y4 in temp_lines_hor:
47 | try:
48 | k += 1
49 | x, y = line_intersection(x1, y1, x2, y2, x3, y3, x4, y4)
50 | point.append([x, y])
51 | except:
52 | continue
53 | points.append(point)
54 |
55 | for point in points:
56 | for x,y in point:
57 | cv2.line(table,(x,y),(x,y),(0,0,255),8)
58 |
59 | '''directory = '/content/drive/My Drive/Optum/Dataset/images'
60 | os.chdir(directory)
61 |
62 | filename = "intersection.jpg"
63 | cv2.imwrite(filename, table)'''
64 | # cv2_imshow("intersection",table)
65 | # cv2.waitKey(0)
66 |
67 | # boxno = -1
68 | box = []
69 | flag = 1
70 | lastCache = []
71 | ## creating bounding boxes of cells from the points detected
72 | ## This is still under work and might fail on some images
73 | for i, row in enumerate(points):
74 | limitj = len(row)
75 | currentVala = []
76 | for j, col in enumerate(row):
77 |
78 | if (j == limitj-1):
79 | break
80 | if (i == 0):
81 | nextcol = row[j+1]
82 | lastCache.append([col[0], col[1], nextcol[0], nextcol[1],9999,9999,9999,9999])
83 | else:
84 | nextcol = row[j+1]
85 | currentVala.append([col[0], col[1], nextcol[0], nextcol[1], 9999, 9999, 9999, 9999])
86 | # Matching
87 | flag = 1
88 | index = []
89 | for k, last in enumerate(lastCache):
90 |
91 | if (col[1] == last[1]) and lastCache[k][4] == 9999:
92 | lastCache[k][4] = col[0]
93 | lastCache[k][5] = col[1]
94 | if lastCache[k][4] != 9999 and lastCache[k][6] != 9999:
95 | box.append(lastCache[k])
96 | index.append(k)
97 | flag = 1
98 |
99 | if (nextcol[1] == last[3]) and lastCache[k][6] == 9999:
100 | lastCache[k][6] = nextcol[0]
101 | lastCache[k][7] = nextcol[1]
102 | if lastCache[k][4] != 9999 and lastCache[k][6] != 9999:
103 | box.append(lastCache[k])
104 | index.append(k)
105 | flag = 1
106 |
107 | if len(lastCache) !=0:
108 | if lastCache[k][4] == 9999 or lastCache[k][6] == 9999:
109 | flag = 0
110 | # print(index)
111 | for k in index:
112 | lastCache.pop(k)
113 | # transferring
114 | if flag == 0:
115 | for last in lastCache:
116 | if last[4] == 9999 or last[6] == 9999:
117 | currentVala.append(last)
118 |
119 | if(i!=0):
120 | lastCache = currentVala
121 |
122 | ## Visualizing the cells ##
123 | count = 1
124 | for i in box:
125 | cv2.rectangle(table_body, (i[0], i[1]), (i[6], i[7]), (int(i[7]%255),0,int(i[0]%255)), 2)
126 | count+=1
127 |
128 | '''directory = '/content/drive/My Drive/Optum/Dataset/images'
129 | os.chdir(directory)
130 |
131 | filename = "cells.jpg"
132 | cv2.imwrite(filename, table_body)'''
133 | # cv2_imshow("cells",table_body)
134 | # cv2.waitKey(0)
135 | ############################
136 | return box
137 | # extract_table(cv2.imread("E:\\KSK\\KSK ML\\KSK PAPERS\\TabXNet\\For Git\\images\\table.PNG"),1,lines=None)
138 |
139 |
140 | def findX(X,x):
141 | return X.index(x)
142 | def findY(Y,y):
143 | return Y.index(y)
144 |
145 | def span(box,X,Y):
146 | start_col = findX(X,box[0]) ## x1
147 | end_col = findX(X,box[4])-1 ## x3
148 | start_row = findY(Y,box[1]) ## y1
149 | end_row = findY(Y,box[3])-1 ## y2
150 | # print(end_col,end_row,start_col,start_row)
151 | return end_col,end_row,start_col,start_row
152 |
153 |
154 |
155 | def extractText(img):
156 | gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
157 | _, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
158 | # cv2_imshow(thresh1)
159 | rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
160 | dilation = cv2.dilate(thresh1, rect_kernel, iterations = 2)
161 | contours, _ = cv2.findContours(dilation, cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_NONE)
162 | im2 = img.copy()
163 | mx,my,mw,mh = float('Inf'),float('Inf'),-1,-1
164 | for cnt in contours:
165 | x, y, w, h = cv2.boundingRect(cnt)
166 | # print(im2.shape)
167 | if x<2 or y<2 or (x+w>=im2.shape[1]-1 and y+h>=im2.shape[0]-1) or w>=im2.shape[1]-1 or h>=im2.shape[0]-1:
168 | continue
169 | if x<mx:
170 | mx = x
171 | if y<my:
172 | my = y
173 | if x+w>mw:
174 | mw = x+w
175 | if y+h>mh:
176 | mh = y+h
177 | # print(x, y, w, h)
178 |
179 | if mx !=float('Inf') and my !=float('Inf'):
180 | # Drawing a rectangle on copied image
181 | # rect = cv2.rectangle(im2, (mx+1, my), (mw-2, mh-2), (0, 255, 0), 1)
182 | # cv2_imshow(im2)
183 | return mx,my,mw,mh
184 | else :
185 | return None
--------------------------------------------------------------------------------
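
A minimal driver for the bordered-table path, matching the commented-out call at the bottom of the file ('table_crop.png' is a placeholder for a cropped table image):

```python
import cv2
from Functions.borderFunc import extract_table

# __line__=1 makes extract_table() run line_detection() itself instead of
# taking precomputed (horizontal, vertical) lines.
table_img = cv2.imread('table_crop.png')
boxes = extract_table(table_img, 1, lines=None)
if boxes:
    for b in boxes:
        # (b[0], b[1]) is a cell's top-left corner and (b[6], b[7]) its
        # bottom-right, as in the visualization loop above.
        print(b[0], b[1], b[6], b[7])
```
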
/CascadeTab/Functions/line_detection.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | import os
4 | import random
5 |
6 | # Input : Image
7 | # Output : hor,ver
8 | def line_detection(image):
9 |
10 | gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
11 | bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, 1)
12 | bw = cv2.bitwise_not(bw)
13 |
14 | '''directory = '/content/drive/My Drive/Optum/Dataset/images'
15 | os.chdir(directory)
16 | n = random.random()
17 | print ("Random number generated is",n)
18 |
19 | filename = str(n) + "after_threshold.jpg"
20 | cv2.imwrite(filename, bw)'''
21 | ## To visualize image after thresholding ##
22 | # cv2_imshow("bw",bw)
23 | # cv2.waitKey(0)
24 | ###########################################
25 | horizontal = bw.copy()
26 | vertical = bw.copy()
27 | img = image.copy()
28 | # [horizontal lines]
29 | # Create structure element for extracting horizontal lines through morphology operations
30 | horizontalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 1))
31 |
32 | # Apply morphology operations
33 | horizontal = cv2.erode(horizontal, horizontalStructure)
34 | horizontal = cv2.dilate(horizontal, horizontalStructure)
35 |
36 | horizontal = cv2.dilate(horizontal, (1,1), iterations=5)
37 | horizontal = cv2.erode(horizontal, (1,1), iterations=5)
38 |
39 | '''directory = '/content/drive/My Drive/Optum/Dataset/images'
40 | os.chdir(directory)
41 | n = random.random()
42 | print ("Random number generated is",n)
43 |
44 | filename = str(n) + "hor_lines_highlighted.jpg"
45 | cv2.imwrite(filename, horizontal)'''
46 |
47 | ## Uncomment to visualize highlighted Horizontal lines
48 | # cv2_imshow("horizontal",horizontal)
49 | # cv2.waitKey(0)
50 |
51 | # HoughlinesP function to detect horizontal lines
52 | hor_lines = cv2.HoughLinesP(horizontal,rho=1,theta=np.pi/180,threshold=100,minLineLength=30,maxLineGap=3)
53 | if hor_lines is None:
54 | return None,None
55 | temp_line = []
56 | for line in hor_lines:
57 | for x1,y1,x2,y2 in line:
58 | temp_line.append([x1,y1-5,x2,y2-5])
59 |
60 | # Sorting the list of detected lines by Y1
61 | hor_lines = sorted(temp_line,key=lambda x: x[1])
62 | print ("hor_lines",hor_lines)
63 |
64 |
65 | ## Uncomment this part to visualize the lines detected on the image ##
66 | print(len(hor_lines))
67 | for x1, y1, x2, y2 in hor_lines:
68 | cv2.line(image, (x1,y1), (x2,y2), (0, 255, 0), 1)
69 |
70 |
71 | '''directory = '/content/drive/My Drive/Optum/Dataset/images'
72 | os.chdir(directory)
73 | n = random.random()
74 | print ("Random number generated is",n)
75 |
76 | filename = str(n) + "hor_lines_detected.jpg"
77 | cv2.imwrite(filename, image)'''
78 | # print(image.shape)
79 | # cv2_imshow("image",image)
80 | # cv2.waitKey(0)
81 | ####################################################################
82 |
83 | ## Selection of best lines from all the horizontal lines detected ##
84 | lasty1 = -111111
85 | lines_x1 = []
86 | lines_x2 = []
87 | hor = []
88 | i=0
89 | for x1,y1,x2,y2 in hor_lines:
90 | if y1 >= lasty1 and y1 <= lasty1 + 10:
91 | lines_x1.append(x1)
92 | lines_x2.append(x2)
93 | else:
94 | if (i != 0 and len(lines_x1) != 0):
95 | hor.append([min(lines_x1),lasty1,max(lines_x2),lasty1])
96 | lasty1 = y1
97 | lines_x1 = []
98 | lines_x2 = []
99 | lines_x1.append(x1)
100 | lines_x2.append(x2)
101 | i+=1
102 | hor.append([min(lines_x1),lasty1,max(lines_x2),lasty1])
103 | #####################################################################
104 |
105 |
106 | # [vertical lines]
107 | # Create structure element for extracting vertical lines through morphology operations
108 | verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 15))
109 |
110 | # Apply morphology operations
111 | vertical = cv2.erode(vertical, verticalStructure)
112 | vertical = cv2.dilate(vertical, verticalStructure)
113 |
114 | vertical = cv2.dilate(vertical, (1,1), iterations=8)
115 | vertical = cv2.erode(vertical, (1,1), iterations=7)
116 |
117 | ######## Preprocessing Vertical Lines ###############
118 | # cv2_imshow("vertical",vertical)
119 | # cv2.waitKey(0)
120 | #####################################################
121 |
122 | # HoughlinesP function to detect vertical lines
123 | # ver_lines = cv2.HoughLinesP(vertical,rho=1,theta=np.pi/180,threshold=20,minLineLength=20,maxLineGap=2)
124 | ver_lines = cv2.HoughLinesP(vertical, 1, np.pi/180, 20, np.array([]), 20, 2)
125 | if ver_lines is None:
126 | return None,None
127 | temp_line = []
128 | for line in ver_lines:
129 | for x1,y1,x2,y2 in line:
130 | temp_line.append([x1,y1,x2,y2])
131 |
132 | # Sorting the list of detected lines by X1
133 | ver_lines = sorted(temp_line,key=lambda x: x[0])
134 | print ("ver_lines",ver_lines)
135 | ## Uncomment this part to visualize the lines detected on the image ##
136 | print(len(ver_lines))
137 | for x1, y1, x2, y2 in ver_lines:
138 | cv2.line(image, (x1,y1-5), (x2,y2-5), (0, 255, 0), 1)
139 |
140 |
141 | '''directory = '/content/drive/My Drive/Optum/Dataset/images'
142 | os.chdir(directory)
143 | n = random.random()
144 | print ("Random number generated is",n)
145 |
146 | filename = str(n) + "ver_lines_detected.jpg"
147 | cv2.imwrite(filename, image)'''
148 |
149 | # print(image.shape)
150 | # cv2_imshow("image",image)
151 | # cv2.waitKey(0)
152 | ####################################################################
153 |
154 | ## Selection of best lines from all the vertical lines detected ##
155 | lastx1 = -111111
156 | lines_y1 = []
157 | lines_y2 = []
158 | ver = []
159 | count = 0
160 | lasty1 = -11111
161 | lasty2 = -11111
162 | for x1,y1,x2,y2 in ver_lines:
163 | if x1 >= lastx1 and x1 <= lastx1 + 15 and not (((min(y1,y2)
--------------------------------------------------------------------------------
/CascadeTab/border.py:
--------------------------------------------------------------------------------
box[0]>table[0]-5 and box[1]>table[1]-5 and box[2]
--------------------------------------------------------------------------------
/CascadeTab/main.py:
--------------------------------------------------------------------------------
43 | ## for border
44 | for r in result[0][0]:
45 | print ("1.",r[4])
46 | if r[4]>.85:
47 | res_border.append(r[:4].astype(int))
48 |
49 | ## for cells
50 | for r in result[0][1]:
51 | print ("2.",r[4])
52 | if r[4]>.85:
53 | r[4] = r[4]*100
54 | res_cell.append(r.astype(int))
55 |
56 | ## for borderless
57 | for r in result[0][2]:
58 | print ("3.",r[4])
59 | if r[4]>.85:
60 | res_bless.append(r[:4].astype(int))
61 |
62 | print ("res_border",res_border)
63 | print ("res_cell",res_cell)
64 | print ("res_bless",res_bless)
65 |
66 | ## if border tables detected
67 | if len(res_border) != 0:
68 | ## call border script for each table in image
69 | for res in res_border:
70 | try:
71 | root.append(border(res,cv2.imread(i)))
72 | except:
73 | pass
74 | if len(res_bless) != 0:
75 | if len(res_cell) != 0:
76 | for no,res in enumerate(res_bless):
77 | root.append(borderless(res,cv2.imread(i),res_cell))
78 |
79 | myfile = open(xmlPath+i.split('/')[-1][:-3]+'xml', "w")
80 | myfile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
81 | myfile.write(etree.tostring(root, pretty_print=True,encoding="unicode"))
82 | myfile.close()
--------------------------------------------------------------------------------
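
The surviving half of CascadeTab/main.py starts inside its per-image loop; a hedged sketch of the setup that loop implies (model init, image glob, result unpacking), with placeholder paths and names:

```python
import glob
import cv2
import lxml.etree as etree
from mmdet.apis import init_detector, inference_detector
from border import border               # bordered-table routine from border.py
from Functions.blessFunc import borderless

config_fname = 'Config/cascade_mask_rcnn_hrnetv2p_w32_20e.py'  # placeholder
checkpoint = 'epoch_36.pth'                                    # placeholder
xmlPath = 'output/'                                            # placeholder
model = init_detector(config_fname, checkpoint)

for i in glob.glob('images/*.png'):                            # placeholder glob
    root = etree.Element("document")    # hypothetical root element name
    result = inference_detector(model, i)
    res_border, res_cell, res_bless = [], [], []
    # ...the surviving code filters result[0][0..2] into these lists, calls
    # border()/borderless() per table, and writes root out as XML.
```
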
/Data Preparation/Dilation.py:
--------------------------------------------------------------------------------
1 | # Note: Image name will be stored as "Dilation_OriginalName" to avoid conflict
2 |
3 | import cv2
4 | import glob
5 | import numpy as np
6 |
7 | # DEFINE THE PATH
8 | print ("Entered")
9 | PATH_TO_DEST = "/home/prakhar/try/Dilated Image/"
10 | PATH_TO_ORIGINAL_IMAGES = "/home/prakhar/PublayNet/val/publaynet/val/"
11 |
12 | # if the source directory has files other than images, use the image extension
13 | # to get the files (for example *.png)
14 | img_files = glob.glob(PATH_TO_ORIGINAL_IMAGES+"*.*")
15 | total = len(img_files)
16 |
17 | # 2x2 static kernel
18 | kernel = np.ones((2,2),np.uint8)
19 |
20 | for count,i in enumerate(img_files):
21 | print (count)
22 | print (i)
23 | image_name = i.split("/")[-1]
24 | print("Progress : ",count,"/",total)
25 | img = cv2.imread(i,0)
26 | _, mask = cv2.threshold(img,220,255,cv2.THRESH_BINARY_INV)
27 | dst = cv2.dilate(mask,kernel,iterations = 1)
28 | dst = cv2.bitwise_not(dst)
29 | cv2.imwrite(PATH_TO_DEST+"/Dilation_"+image_name,dst)
30 |
--------------------------------------------------------------------------------
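
A tiny synthetic check of the same threshold-dilate-invert chain: one dark "ink" pixel on a white background grows into a 2x2 block, i.e. strokes get thicker:

```python
import cv2
import numpy as np

img = np.full((5, 5), 255, np.uint8)
img[2, 2] = 0                                    # one dark pixel
_, mask = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY_INV)
thick = cv2.bitwise_not(cv2.dilate(mask, np.ones((2, 2), np.uint8)))
print(int((thick == 0).sum()))                   # 4 dark pixels after dilation
```
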
/Data Preparation/Images/3img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Prakhar-97/Table-detection-and-Document-layout-analysis/bfcd189ee9edb603b734cd07d965a7400b85f820/Data Preparation/Images/3img.png
--------------------------------------------------------------------------------
/Data Preparation/Smudge.py:
--------------------------------------------------------------------------------
1 | # Note: Image name will be stored as "Smudge_OriginalName" to avoid conflict
2 | import cv2
3 | import numpy as np
4 | import glob
5 |
6 | def basicTransform(img):
7 | _, mask = cv2.threshold(img,220,255,cv2.THRESH_BINARY_INV)
8 | img = cv2.bitwise_not(mask)
9 | return img
10 |
11 | PATH_TO_DEST = "/home/prakhar/try/Smudged Image/"
12 | PATH_TO_ORIGINAL_IMAGES = "/home/prakhar/PublayNet/val/publaynet/val/"
13 |
14 | img_files = glob.glob(PATH_TO_ORIGINAL_IMAGES+"*.*")
15 |
16 | total = len(img_files)
17 | for count,i in enumerate(img_files):
18 | image_name = i.split("/")[-1]
19 | print("Progress : ",count,"/",total)
20 | img = cv2.imread(i)
21 |
22 | # Split the 3 channels into Blue,Green and Red
23 | b,g,r = cv2.split(img)
24 |
25 | # Apply Basic Transformation
26 | b = basicTransform(b)
27 | r = basicTransform(r)
28 | g = basicTransform(g)
29 |
30 | # Perform the distance transform algorithm
31 | b = cv2.distanceTransform(b, cv2.DIST_L2, 5) # EUCLIDEAN
32 | g = cv2.distanceTransform(g, cv2.DIST_L1, 5) # MANHATTAN
33 | r = cv2.distanceTransform(r, cv2.DIST_C, 5) # CHEBYSHEV (MAX)
34 |
35 | # Normalize
36 | r = cv2.normalize(r, r, 0, 1.0, cv2.NORM_MINMAX)
37 | g = cv2.normalize(g, g, 0, 1.0, cv2.NORM_MINMAX)
38 | b = cv2.normalize(b, b, 0, 1.0, cv2.NORM_MINMAX)
39 |
40 | # Merge the channels
41 | dist = cv2.merge((b,g,r))
42 | dist = cv2.normalize(dist,dist, 0, 4.0, cv2.NORM_MINMAX)
43 | dist = cv2.cvtColor(dist, cv2.COLOR_BGR2GRAY)
44 |
45 | # In order to save as jpg or png, we need to handle the data
46 | # format of the image
47 | data = dist.astype(np.float64) / 4.0
48 | data = 1800 * data # Now scale by 1800
49 | dist = data.astype(np.uint16)
50 |
51 | # Save to destination
52 | cv2.imwrite(PATH_TO_DEST+"/Smudge_"+image_name,dist)
53 |
--------------------------------------------------------------------------------
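
What distanceTransform contributes to the smudge: each non-zero pixel is replaced by its distance to the nearest zero pixel, so solid strokes become soft ramps. A one-row illustration:

```python
import cv2
import numpy as np

band = np.zeros((1, 7), np.uint8)
band[0, 2:5] = 255                       # a 3-pixel-wide "stroke"
d = cv2.distanceTransform(band, cv2.DIST_L1, 3)
print(d)                                 # [[0. 0. 1. 2. 1. 0. 0.]]
```
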
/Document layout analysis/Config file/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py:
--------------------------------------------------------------------------------
1 | model = dict(
2 | type='CascadeRCNN',
3 | pretrained='open-mmlab://msra/hrnetv2_w32',
4 | backbone=dict(
5 | type='HRNet',
6 | extra=dict(
7 | stage1=dict(
8 | num_modules=1,
9 | num_branches=1,
10 | block='BOTTLENECK',
11 | num_blocks=(4, ),
12 | num_channels=(64, )),
13 | stage2=dict(
14 | num_modules=1,
15 | num_branches=2,
16 | block='BASIC',
17 | num_blocks=(4, 4),
18 | num_channels=(32, 64)),
19 | stage3=dict(
20 | num_modules=4,
21 | num_branches=3,
22 | block='BASIC',
23 | num_blocks=(4, 4, 4),
24 | num_channels=(32, 64, 128)),
25 | stage4=dict(
26 | num_modules=3,
27 | num_branches=4,
28 | block='BASIC',
29 | num_blocks=(4, 4, 4, 4),
30 | num_channels=(32, 64, 128, 256)))),
31 | neck=dict(type='HRFPN', in_channels=[32, 64, 128, 256], out_channels=256),
32 | rpn_head=dict(
33 | type='RPNHead',
34 | in_channels=256,
35 | feat_channels=256,
36 | anchor_generator=dict(
37 | type='AnchorGenerator',
38 | scales=[8],
39 | ratios=[0.5, 1.0, 2.0],
40 | strides=[4, 8, 16, 32, 64]),
41 | bbox_coder=dict(
42 | type='DeltaXYWHBBoxCoder',
43 | target_means=[0.0, 0.0, 0.0, 0.0],
44 | target_stds=[1.0, 1.0, 1.0, 1.0]),
45 | loss_cls=dict(
46 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
47 | loss_bbox=dict(
48 | type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=1.0)),
49 | roi_head=dict(
50 | type='CascadeRoIHead',
51 | num_stages=3,
52 | stage_loss_weights=[1, 0.5, 0.25],
53 | bbox_roi_extractor=dict(
54 | type='SingleRoIExtractor',
55 | roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0),
56 | out_channels=256,
57 | featmap_strides=[4, 8, 16, 32]),
58 | bbox_head=[
59 | dict(
60 | type='Shared2FCBBoxHead',
61 | in_channels=256,
62 | fc_out_channels=1024,
63 | roi_feat_size=7,
64 | num_classes=80,
65 | bbox_coder=dict(
66 | type='DeltaXYWHBBoxCoder',
67 | target_means=[0.0, 0.0, 0.0, 0.0],
68 | target_stds=[0.1, 0.1, 0.2, 0.2]),
69 | reg_class_agnostic=True,
70 | loss_cls=dict(
71 | type='CrossEntropyLoss',
72 | use_sigmoid=False,
73 | loss_weight=1.0),
74 | loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
75 | loss_weight=1.0)),
76 | dict(
77 | type='Shared2FCBBoxHead',
78 | in_channels=256,
79 | fc_out_channels=1024,
80 | roi_feat_size=7,
81 | num_classes=80,
82 | bbox_coder=dict(
83 | type='DeltaXYWHBBoxCoder',
84 | target_means=[0.0, 0.0, 0.0, 0.0],
85 | target_stds=[0.05, 0.05, 0.1, 0.1]),
86 | reg_class_agnostic=True,
87 | loss_cls=dict(
88 | type='CrossEntropyLoss',
89 | use_sigmoid=False,
90 | loss_weight=1.0),
91 | loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
92 | loss_weight=1.0)),
93 | dict(
94 | type='Shared2FCBBoxHead',
95 | in_channels=256,
96 | fc_out_channels=1024,
97 | roi_feat_size=7,
98 | num_classes=80,
99 | bbox_coder=dict(
100 | type='DeltaXYWHBBoxCoder',
101 | target_means=[0.0, 0.0, 0.0, 0.0],
102 | target_stds=[0.033, 0.033, 0.067, 0.067]),
103 | reg_class_agnostic=True,
104 | loss_cls=dict(
105 | type='CrossEntropyLoss',
106 | use_sigmoid=False,
107 | loss_weight=1.0),
108 | loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
109 | ],
110 | mask_roi_extractor=dict(
111 | type='SingleRoIExtractor',
112 | roi_layer=dict(type='RoIAlign', out_size=14, sample_num=0),
113 | out_channels=256,
114 | featmap_strides=[4, 8, 16, 32]),
115 | mask_head=dict(
116 | type='FCNMaskHead',
117 | num_convs=4,
118 | in_channels=256,
119 | conv_out_channels=256,
120 | num_classes=80,
121 | loss_mask=dict(
122 | type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))))
123 | train_cfg = dict(
124 | rpn=dict(
125 | assigner=dict(
126 | type='MaxIoUAssigner',
127 | pos_iou_thr=0.7,
128 | neg_iou_thr=0.3,
129 | min_pos_iou=0.3,
130 | match_low_quality=True,
131 | ignore_iof_thr=-1),
132 | sampler=dict(
133 | type='RandomSampler',
134 | num=256,
135 | pos_fraction=0.5,
136 | neg_pos_ub=-1,
137 | add_gt_as_proposals=False),
138 | allowed_border=0,
139 | pos_weight=-1,
140 | debug=False),
141 | rpn_proposal=dict(
142 | nms_across_levels=False,
143 | nms_pre=2000,
144 | nms_post=2000,
145 | max_num=2000,
146 | nms_thr=0.7,
147 | min_bbox_size=0),
148 | rcnn=[
149 | dict(
150 | assigner=dict(
151 | type='MaxIoUAssigner',
152 | pos_iou_thr=0.5,
153 | neg_iou_thr=0.5,
154 | min_pos_iou=0.5,
155 | match_low_quality=False,
156 | ignore_iof_thr=-1),
157 | sampler=dict(
158 | type='RandomSampler',
159 | num=512,
160 | pos_fraction=0.25,
161 | neg_pos_ub=-1,
162 | add_gt_as_proposals=True),
163 | mask_size=28,
164 | pos_weight=-1,
165 | debug=False),
166 | dict(
167 | assigner=dict(
168 | type='MaxIoUAssigner',
169 | pos_iou_thr=0.6,
170 | neg_iou_thr=0.6,
171 | min_pos_iou=0.6,
172 | match_low_quality=False,
173 | ignore_iof_thr=-1),
174 | sampler=dict(
175 | type='RandomSampler',
176 | num=512,
177 | pos_fraction=0.25,
178 | neg_pos_ub=-1,
179 | add_gt_as_proposals=True),
180 | mask_size=28,
181 | pos_weight=-1,
182 | debug=False),
183 | dict(
184 | assigner=dict(
185 | type='MaxIoUAssigner',
186 | pos_iou_thr=0.7,
187 | neg_iou_thr=0.7,
188 | min_pos_iou=0.7,
189 | match_low_quality=False,
190 | ignore_iof_thr=-1),
191 | sampler=dict(
192 | type='RandomSampler',
193 | num=512,
194 | pos_fraction=0.25,
195 | neg_pos_ub=-1,
196 | add_gt_as_proposals=True),
197 | mask_size=28,
198 | pos_weight=-1,
199 | debug=False)
200 | ])
201 | test_cfg = dict(
202 | rpn=dict(
203 | nms_across_levels=False,
204 | nms_pre=1000,
205 | nms_post=1000,
206 | max_num=1000,
207 | nms_thr=0.7,
208 | min_bbox_size=0),
209 | rcnn=dict(
210 | score_thr=0.05,
211 | nms=dict(type='nms', iou_thr=0.5),
212 | max_per_img=100,
213 | mask_thr_binary=0.5))
214 | dataset_type = 'CocoDataset'
215 | data_root = '/content/drive/My Drive/all_data/'
216 | classes = ('text', 'title', 'list', 'table', 'figure')
217 | img_norm_cfg = dict(
218 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
219 | train_pipeline = [
220 | dict(type='LoadImageFromFile'),
221 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
222 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
223 | dict(type='RandomFlip', flip_ratio=0.5),
224 | dict(
225 | type='Normalize',
226 | mean=[123.675, 116.28, 103.53],
227 | std=[58.395, 57.12, 57.375],
228 | to_rgb=True),
229 | dict(type='Pad', size_divisor=32),
230 | dict(type='DefaultFormatBundle'),
231 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
232 | ]
233 | test_pipeline = [
234 | dict(type='LoadImageFromFile'),
235 | dict(
236 | type='MultiScaleFlipAug',
237 | img_scale=(1333, 800),
238 | flip=False,
239 | transforms=[
240 | dict(type='Resize', keep_ratio=True),
241 | dict(type='RandomFlip'),
242 | dict(
243 | type='Normalize',
244 | mean=[123.675, 116.28, 103.53],
245 | std=[58.395, 57.12, 57.375],
246 | to_rgb=True),
247 | dict(type='Pad', size_divisor=32),
248 | dict(type='ImageToTensor', keys=['img']),
249 | dict(type='Collect', keys=['img'])
250 | ])
251 | ]
252 | data = dict(
253 | samples_per_gpu=2,
254 | workers_per_gpu=2,
255 | train=dict(
256 | type='CocoDataset',
257 | ann_file=
258 | '/content/drive/My Drive/all_data/annotations/train_publaynet.json',
259 | img_prefix='/content/drive/My Drive/all_data/train/',
260 | classes=('text', 'title', 'list', 'table', 'figure'),
261 | pipeline=[
262 | dict(type='LoadImageFromFile'),
263 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
264 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
265 | dict(type='RandomFlip', flip_ratio=0.5),
266 | dict(
267 | type='Normalize',
268 | mean=[123.675, 116.28, 103.53],
269 | std=[58.395, 57.12, 57.375],
270 | to_rgb=True),
271 | dict(type='Pad', size_divisor=32),
272 | dict(type='DefaultFormatBundle'),
273 | dict(
274 | type='Collect',
275 | keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
276 | ]),
277 | val=dict(
278 | type='CocoDataset',
279 | ann_file=
280 | '/content/drive/My Drive/all_data/annotations/val_publaynet.json',
281 | img_prefix='/content/drive/My Drive/all_data/validation/',
282 | classes=('text', 'title', 'list', 'table', 'figure'),
283 | pipeline=[
284 | dict(type='LoadImageFromFile'),
285 | dict(
286 | type='MultiScaleFlipAug',
287 | img_scale=(1333, 800),
288 | flip=False,
289 | transforms=[
290 | dict(type='Resize', keep_ratio=True),
291 | dict(type='RandomFlip'),
292 | dict(
293 | type='Normalize',
294 | mean=[123.675, 116.28, 103.53],
295 | std=[58.395, 57.12, 57.375],
296 | to_rgb=True),
297 | dict(type='Pad', size_divisor=32),
298 | dict(type='ImageToTensor', keys=['img']),
299 | dict(type='Collect', keys=['img'])
300 | ])
301 | ]),
302 | test=dict(
303 | type='CocoDataset',
304 | ann_file=
305 | '/content/drive/My Drive/all_data/annotations/val_publaynet.json',
306 | img_prefix='/content/drive/My Drive/all_data/validation/',
307 | classes=('text', 'title', 'list', 'table', 'figure'),
308 | pipeline=[
309 | dict(type='LoadImageFromFile'),
310 | dict(
311 | type='MultiScaleFlipAug',
312 | img_scale=(1333, 800),
313 | flip=False,
314 | transforms=[
315 | dict(type='Resize', keep_ratio=True),
316 | dict(type='RandomFlip'),
317 | dict(
318 | type='Normalize',
319 | mean=[123.675, 116.28, 103.53],
320 | std=[58.395, 57.12, 57.375],
321 | to_rgb=True),
322 | dict(type='Pad', size_divisor=32),
323 | dict(type='ImageToTensor', keys=['img']),
324 | dict(type='Collect', keys=['img'])
325 | ])
326 | ]))
327 | evaluation = dict(interval=1, metric=['bbox', 'segm'])
328 | optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0001)
329 | optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
330 | lr_config = dict(
331 | policy='step',
332 | warmup='linear',
333 | warmup_iters=500,
334 | warmup_ratio=0.3333333333333333,
335 | step=[16, 19])
336 | total_epochs = 20
337 | checkpoint_config = dict(interval=1, create_symlink=False)
338 | log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
339 | dist_params = dict(backend='nccl')
340 | log_level = 'INFO'
341 | load_from = None
342 | resume_from = '/content/drive/My Drive/mmdetection/tools/work_dirs/cascade_mask_rcnn_hrnetv2p_w32_20e_coco/epoch_9.pth'
343 | workflow = [('train', 1)]
344 | work_dir = './work_dirs/cascade_mask_rcnn_hrnetv2p_w32_20e_coco'
345 | gpu_ids = range(0, 1)
346 |
--------------------------------------------------------------------------------
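
This config can also be loaded and tweaked programmatically before training (a sketch, assuming the mmcv version that matches MMDetection v2 is installed):

```python
from mmcv import Config

cfg = Config.fromfile(
    'Document layout analysis/Config file/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py')
cfg.data.samples_per_gpu = 1            # e.g. shrink the batch for a small GPU
cfg.resume_from = None                  # start fresh instead of from epoch_9.pth
print(cfg.model.roi_head.num_stages)    # 3
```
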
/Document layout analysis/ICDAR_XML_to_COCO.py:
--------------------------------------------------------------------------------
1 | from pdf2image import convert_from_path
2 | from pdf2image.exceptions import (PDFInfoNotInstalledError, PDFPageCountError,PDFSyntaxError)
3 | from bs4 import BeautifulSoup as bs
4 | import glob
5 | import os
6 | import cv2
7 | import numpy as np
8 | import pprint
9 | import pickle
10 | import json
11 |
12 | pdf_path = '/home/prakhar/mmdetection/convert2pdf/all pdfs/*'
13 | xml_path = '/home/prakhar/mmdetection/convert2pdf/all xmls/*'
14 | pdfs = glob.glob(pdf_path)
15 | xmls = glob.glob(xml_path)
16 | a = 1
17 | b = 0
18 | img_list = []
19 | ann_list = []
20 | cate_list = []
21 | super_dict = {}
22 | categories = {}
23 |
24 | for i in pdfs:
25 |
26 | print (i)
27 | tail_pdf = os.path.split(i)
28 | name_pdf = os.path.splitext(tail_pdf[1])
29 | #print(tail_pdf[1])
30 | #print (name_pdf[0])
31 |
32 | for j in xmls :
33 |
34 | tail_xml = os.path.split(j)
35 | name_xml = os.path.splitext(tail_xml[1])
36 | #print(tail_xml[1])
37 | #print (name_xml[0])
38 |
39 | if (name_pdf[0]+"-reg" == name_xml[0]):
40 | pages = convert_from_path(i)
41 |
42 | for k, page in enumerate(pages):
43 |
44 | image = {}
45 |
46 | fname = name_pdf[0] + "_page_" + str(k+1) + ".png"
47 | print (j)
48 |
49 | content = []
50 | with open(j, 'r') as file:
51 | #import pdb ; pdb.set_trace()
52 |
53 | content = file.readlines()
54 | content ="".join(content)
55 | bs_content = bs(content, "lxml")
56 |
57 | table = bs_content.find_all("region")
58 | #print (table)
59 | coords = []
60 |
61 | for p in table:
62 |
63 | ann = {}
64 | masks = []
65 | num = p["page"]
66 | #print (num)
67 | #print (k+1)
68 | if num == str(k+1) :
69 |
70 | b = b+1
71 | length = len(p.contents)
72 | bbox = p.contents[length-2]
73 |
74 | x1 = int(bbox["x1"])
75 | #print (x1)
76 | y1 = int(bbox["y1"])
77 | w = int(bbox["x2"])-int(bbox["x1"])
78 | #print (w)
79 | h = int(bbox["y2"])-int(bbox["y1"])
80 | #print (h)
81 |
82 | coords = [x1, y1, w, h]
83 | mask = [x1, y1, x1, y1+h, x1+w, y1+h, x1+w, y1]
84 | masks.append(mask)
85 |
86 | ann["area"] = float(w*h)
87 | ann["bbox"] = coords
88 | ann["segmentation"] = masks
89 | ann["category_id"] = 1
90 | ann["image_id"] = a
91 | ann["id"] = b
92 | ann["iscrowd"] = 0
93 | ann["ignore"] = 0
94 | ann_list.append(ann)
95 |
96 | if len(coords) > 0 :
97 |
98 | page.save(fname, "PNG")
99 |
100 | img = cv2.imread(fname)
101 | dimensions = img.shape
102 | height = img.shape[0]
103 | width = img.shape[1]
104 | channels = img.shape[2]
105 |
106 | #print('Image Dimension : ',dimensions)
107 | #print('Image Height : ',height)
108 | #print('Image Width : ',width)
109 | #print('Number of Channels : ',channels)
110 |
111 | image["file_name"] = fname
112 | image["width"] = width
113 | image["height"] = height
114 | image["id"] = a
115 |
116 | img_list.append(image)
117 | a = a+1
118 |
119 | categories["id"] = 1
120 | categories["name"] = "table"
121 | cate_list.append(categories)
122 |
123 | super_dict["annotations"] = ann_list
124 | super_dict["categories"] = cate_list
125 | super_dict["images"] = img_list
126 | super_dict["type"] = "instances"
127 |
128 | filename = 'dataset'
129 | outfile = open(filename,'wb')
130 | pickle.dump(super_dict, outfile)
131 | outfile.close()
132 |
133 | with open("dataset.json", 'w') as outfile:
134 | json.dump(super_dict, outfile)
135 | #print (pickled_object)
136 | #unpickled_object = pickle.load(open(filename, 'rb'))
137 | #print (unpickled_object)
138 |
139 | #a = CustomDataset(pickle.loads(pickled_object))
140 | pp = pprint.PrettyPrinter(indent=4)
141 | pp.pprint (super_dict)
142 |
--------------------------------------------------------------------------------
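
The dataset.json this script writes follows the minimal COCO "instances" layout built above; a single-image example of the structure (values invented):

```python
import json

coco_like = {
    "type": "instances",
    "images": [
        {"file_name": "doc_page_1.png", "width": 1700, "height": 2200, "id": 1}
    ],
    "annotations": [{
        "area": 120000.0,
        "bbox": [100, 300, 600, 200],   # [x, y, w, h]
        "segmentation": [[100, 300, 100, 500, 700, 500, 700, 300]],
        "category_id": 1, "image_id": 1, "id": 1, "iscrowd": 0, "ignore": 0
    }],
    "categories": [{"id": 1, "name": "table"}]
}
print(json.dumps(coco_like, indent=2))
```
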
/Document layout analysis/main.py:
--------------------------------------------------------------------------------
1 | from mmdet.apis import inference_detector, show_result_pyplot, init_detector
2 | from mmdet.core import encode_mask_results, tensor2imgs
3 | import cv2
4 | import os
5 |
6 | ################################################### TO DO ###################################################
7 | image_pth = 'Give the image path'
8 |
9 | config_fname = "Give the config file path "
10 | checkpoint_path = 'Give the checkpoint file path'
11 | epoch = 'epoch_6.pth'
12 |
13 | #############################################################################################################
14 |
15 | model = init_detector(config_fname, checkpoint_path+epoch)
16 | img = cv2.imread(image_pth)
17 |
18 | result = inference_detector(model, img)
19 | #print ("The result is = ",result)
20 |
21 | results = []
22 | bbox_results, mask_results = result
23 |
24 | res_text= []
25 | res_title = []
26 | res_list = []
27 | res_table = []
28 | res_figure = []
29 | all_classes = []
30 |
31 | #for text
32 | for r in bbox_results[0]:
33 | if r[4]>.85:
34 | res_text.append(r[:4].astype(int))
35 |
36 | print ("No. of paragraphs on the page are == ",len(res_text))
37 | all_classes.append(res_text)
38 |
39 | #for title
40 | for r in bbox_results[1]:
41 | if r[4]>.85:
42 | res_title.append(r[:4].astype(int))
43 |
44 | print ("No. of headers on the page are == ",len(res_title))
45 | all_classes.append(res_title)
46 |
47 | #for list
48 | for r in bbox_results[2]:
49 | if r[4]>.85:
50 | res_list.append(r[:4].astype(int))
51 |
52 | print ("No. of lists on the page are == ",len(res_list))
53 | all_classes.append(res_list)
54 |
55 | #for table
56 | for r in bbox_results[3]:
57 | if r[4]>.85:
58 | res_table.append(r[:4].astype(int))
59 |
60 | print ("No. of the tables on the page are == ",len(res_table))
61 | all_classes.append(res_table)
62 |
63 | #for figure
64 | for r in bbox_results[4]:
65 | if r[4]>.85:
66 | res_figure.append(r[:4].astype(int))
67 |
68 | print ("No. of figures on the page are == ",len(res_figure))
69 | all_classes.append(res_figure)
70 |
71 | im2 = img.copy()
72 | for count, category in enumerate(all_classes):
73 | #print ("The no. of bbox in these classes are == ",len(category))
74 | im1 = img.copy()
75 | colors = [(55,255,20), (0,0,255), (132,240,255), (0,247,255), (2,2,105)]
76 | filename = ["paragraph_boxes.jpg", "header_boxes.jpg", "list_boxes.jpg", "table_boxes.jpg", "figure_boxes.jpg"]
77 |
78 | for box in category :
79 | #print (count)
80 | #print(colors[count])
81 | cv2.rectangle(im1, (box[0], box[1]), (box[2], box[3]), colors[count], 2)
82 | cv2.rectangle(im2, (box[0], box[1]), (box[2], box[3]), colors[count], 2)
83 |
84 | directory = '/content/drive/My Drive/results'
85 | os.chdir(directory)
86 | name = filename[count]
87 | #print (name)
88 | cv2.imwrite(name, im1)
89 |
90 | directory = '/content/drive/My Drive/results'
91 | os.chdir(directory)
92 | result_file = "all_annotations.jpg"
93 | cv2.imwrite(result_file, im2)
94 |
95 | encoded_mask_results = encode_mask_results(mask_results)
96 | print ("Encoded mask results are == ",encoded_mask_results)
97 | result = bbox_results, encoded_mask_results
98 |
99 | results.append(result)
--------------------------------------------------------------------------------
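
The five per-class blocks above repeat one filter; a compact equivalent (a sketch, with names local to this example):

```python
CLASS_NAMES = ['paragraphs', 'headers', 'lists', 'tables', 'figures']

def filter_boxes(bbox_results, thr=0.85):
    all_classes = []
    for name, dets in zip(CLASS_NAMES, bbox_results):
        kept = [d[:4].astype(int) for d in dets if d[4] > thr]
        print("No. of", name, "on the page ==", len(kept))
        all_classes.append(kept)
    return all_classes
```
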
/Document layout analysis/test_train_split.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sklearn
3 | import os
4 | import glob
5 | import pprint
6 | import shutil
7 |
8 | img_source_dir = '/home/prakhar/Publaynet/Original_data'
9 | train = '/home/prakhar/Publaynet/train'
10 | val = '/home/prakhar/Publaynet/validation'
11 | subdirs = []
12 | ratio = 0.7
13 | for subdir in os.listdir(img_source_dir):
14 |
15 | print (subdir)
16 | a = os.path.join(img_source_dir, subdir)
17 | subdirs.append(a)
18 |
19 | print (subdirs)
20 |
21 | elements = len(subdirs)
22 | middle = int(elements*ratio)
23 |
24 | train_list = subdirs[:middle]
25 | val_list = subdirs[middle:]
26 |
27 | for f in train_list:
28 | shutil.move(f, train)
29 |
30 | for f in val_list:
31 | shutil.move(f, val)
32 |
33 |
34 | train_path = '/home/prakhar/Publaynet/train/*'
35 | val_path = '/home/prakhar/Publaynet/validation/*'
36 |
37 | train_imgs = glob.glob(train_path)
38 | #print (train_imgs)
39 | val_imgs = glob.glob(val_path)
40 | #print (val_imgs)
41 |
42 | with open('/home/prakhar/Publaynet/Labels/val.json') as f:
43 | data = json.load(f)
44 |
45 | #pp = pprint.PrettyPrinter(indent=4)
46 | #pp.pprint (data)
47 |
48 | def create_dict(imgs, data):
49 |
50 | train_ann = []
51 | name = []
52 | super_dict = {}
53 | total = len(imgs)
54 |
55 | for count,i in enumerate(imgs):
56 |
57 | print("Progress : ",count,"/",total)
58 | image_name = os.path.split(i)
59 | name_list = data["images"]
60 |
61 | for j in name_list:
62 |
63 | if j["file_name"] == image_name[1]:
64 |
65 | num = j["id"]
66 | ann = data["annotations"]
67 | name.append(j)
68 |
69 | for k in ann:
70 |
71 | if k["image_id"] == num:
72 |
73 | train_ann.append(k)
74 |
75 | super_dict["annotations"] = train_ann
76 | super_dict["images"] = name
77 | super_dict["categories"] = data["categories"]
78 |
79 | return (super_dict)
80 |
81 | print ("For train")
82 | train_dict = create_dict(train_imgs, data)
83 | pp = pprint.PrettyPrinter(indent=4)
84 | pp.pprint (train_dict)
85 | with open("train_publaynet.json", 'w') as outfile:
86 | json.dump(train_dict, outfile)
87 |
88 | print ("For val")
89 | val_dict = create_dict(val_imgs, data)
90 | pp = pprint.PrettyPrinter(indent=4)
91 | pp.pprint (val_dict)
92 | with open("val_publaynet.json", 'w') as f:
93 | json.dump(val_dict, f)
94 |
95 |
96 |
--------------------------------------------------------------------------------
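
create_dict() rescans data["annotations"] for every image; indexing images by file name and annotations by image_id once makes the split roughly linear in the dataset size (a sketch with the same output shape):

```python
import os
from collections import defaultdict

def create_dict_fast(imgs, data):
    by_name = {img["file_name"]: img for img in data["images"]}
    anns_by_image = defaultdict(list)
    for ann in data["annotations"]:
        anns_by_image[ann["image_id"]].append(ann)

    images, anns = [], []
    for path in imgs:
        img = by_name.get(os.path.split(path)[1])
        if img is not None:
            images.append(img)
            anns.extend(anns_by_image[img["id"]])
    return {"annotations": anns, "images": images,
            "categories": data["categories"]}
```
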
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Prakhar-97
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Table-detection-and-Document-layout-analysis
2 | ## Introduction
3 | State-of-the-art techniques for table detection and document layout analysis. Table detection uses MMDetection version 1.2, while document layout analysis uses models developed in MMDetection version 2.0.
4 |
5 | ## Setup
6 | Models are developed in the PyTorch-based MMDetection framework (version 2.0)
7 |
8 |
9 |
10 | git clone https://github.com/open-mmlab/mmdetection.git
11 | cd mmdetection
12 | python setup.py install
13 | python setup.py develop
14 | pip install -r requirements.txt
15 |
16 |
17 | ## Image Augmentation
18 | We have followed Dilation and Smudge techniques for Data Augmentation
19 |
20 | ![Dilation and Smudge examples](Data%20Preparation/Images/3img.png)
21 |
22 |
23 | ## Model Zoo
24 | Config files for the models:
25 |
26 |
27 | 1. For table detection: Config_file
29 |
30 | 2. For Document Analysis: Config_file
32 |
33 | Note: Config paths only need to be changed during training
34 |
35 | Checkpoints of the Models that have been trained :
36 |
37 |
38 |
39 | | Model Name | Checkpoint File |
40 | | --- | --- |
42 | | Table structure recognition | Checkpoint |
45 | | Document layout analysis | Checkpoint |
46 |
47 |
48 |
49 | ## Datasets
50 | 1. Table detection and Structure Recognition:
51 | You can refer to Dataset to have a better understanding of the Dataset
52 |
53 | 2. Document layout Analysis:
54 | You can refer to Dataset to have a better understanding of the dataset.
55 |
56 | ## Training
57 |
58 | Refer to the two Colab notebooks that have been mentioned, as they will direct you through the steps that need to be followed. If using a custom dataset, do go through the MMdet Docs
59 |
60 |
61 |
--------------------------------------------------------------------------------
/literature-survey.md:
--------------------------------------------------------------------------------
1 | # Document layout analysis
2 |
3 | ## Datasets
4 | [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/)
5 | * 37 GB
6 | * image classification with 16 classes
7 |
8 | [DocBank](https://arxiv.org/pdf/2006.01038v1.pdf)
9 | * Yet to be released
10 | * author, footer, section, title, abstract, list, paragraph, reference, caption, equation, figure, table
11 |
12 | Other PubLayNet implementations
13 | * [with torch's maskrcnn](https://github.com/phamquiluan/publaynet)
14 | * [with detectron](https://github.com/hpanwar08/detectron2)
15 |
--------------------------------------------------------------------------------
|