├── CascadeTab
│   ├── Config
│   │   └── cascade_mask_rcnn_hrnetv2p_w32_20e.py
│   ├── Functions
│   │   ├── blessFunc.py
│   │   ├── borderFunc.py
│   │   └── line_detection.py
│   ├── border.py
│   └── main.py
├── Data Preparation
│   ├── Dilation.py
│   ├── Images
│   │   └── 3img.png
│   └── Smudge.py
├── Document layout analysis
│   ├── Config file
│   │   └── cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py
│   ├── ICDAR_XML_to_COCO.py
│   ├── main.py
│   └── test_train_split.py
├── LICENSE.md
├── README.md
├── Table_detection&Structure_recognition.ipynb
└── literature-survey.md

--------------------------------------------------------------------------------
/CascadeTab/Config/cascade_mask_rcnn_hrnetv2p_w32_20e.py:
--------------------------------------------------------------------------------
# model settings
model = dict(
    type='CascadeRCNN',
    num_stages=3,
    pretrained='open-mmlab://msra/hrnetv2_w32',
    backbone=dict(
        type='HRNet',
        extra=dict(
            stage1=dict(
                num_modules=1,
                num_branches=1,
                block='BOTTLENECK',
                num_blocks=(4, ),
                num_channels=(64, )),
            stage2=dict(
                num_modules=1,
                num_branches=2,
                block='BASIC',
                num_blocks=(4, 4),
                num_channels=(32, 64)),
            stage3=dict(
                num_modules=4,
                num_branches=3,
                block='BASIC',
                num_blocks=(4, 4, 4),
                num_channels=(32, 64, 128)),
            stage4=dict(
                num_modules=3,
                num_branches=4,
                block='BASIC',
                num_blocks=(4, 4, 4, 4),
                num_channels=(32, 64, 128, 256)))),
    neck=dict(type='HRFPN', in_channels=[32, 64, 128, 256], out_channels=256),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_scales=[8],
        anchor_ratios=[0.5, 1.0, 2.0],
        anchor_strides=[4, 8, 16, 32, 64],
        target_means=[.0, .0, .0, .0],
        target_stds=[1.0, 1.0, 1.0, 1.0],
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
    bbox_roi_extractor=dict(
        type='SingleRoIExtractor',
        roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
        out_channels=256,
        featmap_strides=[4, 8, 16, 32]),
    bbox_head=[
        dict(
            type='SharedFCBBoxHead',
            num_fcs=2,
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=81,
            target_means=[0., 0., 0., 0.],
            target_stds=[0.1, 0.1, 0.2, 0.2],
            reg_class_agnostic=True,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
        dict(
            type='SharedFCBBoxHead',
            num_fcs=2,
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=81,
            target_means=[0., 0., 0., 0.],
            target_stds=[0.05, 0.05, 0.1, 0.1],
            reg_class_agnostic=True,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
        dict(
            type='SharedFCBBoxHead',
            num_fcs=2,
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=81,
            target_means=[0., 0., 0., 0.],
            target_stds=[0.033, 0.033, 0.067, 0.067],
            reg_class_agnostic=True,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
    ],
    mask_roi_extractor=dict(
        type='SingleRoIExtractor',
        roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2),
        out_channels=256,
        featmap_strides=[4, 8, 16, 32]),
    mask_head=dict(
        type='FCNMaskHead',
        num_convs=4,
        in_channels=256,
        conv_out_channels=256,
        num_classes=81,
        loss_mask=dict(
            type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)))
# model training and testing settings
train_cfg = dict(
    rpn=dict(
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.7,
            neg_iou_thr=0.3,
            min_pos_iou=0.3,
            ignore_iof_thr=-1),
        sampler=dict(
            type='RandomSampler',
            num=256,
            pos_fraction=0.5,
            neg_pos_ub=-1,
            add_gt_as_proposals=False),
        allowed_border=0,
        pos_weight=-1,
        debug=False),
    rpn_proposal=dict(
        nms_across_levels=False,
        nms_pre=2000,
        nms_post=2000,
        max_num=2000,
        nms_thr=0.7,
        min_bbox_size=0),
    rcnn=[
        dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False),
        dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.6,
                neg_iou_thr=0.6,
                min_pos_iou=0.6,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False),
        dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.7,
                min_pos_iou=0.7,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False)
    ],
    stage_loss_weights=[1, 0.5, 0.25])
test_cfg = dict(
    rpn=dict(
        nms_across_levels=False,
        nms_pre=1000,
        nms_post=1000,
        max_num=1000,
        nms_thr=0.7,
        min_bbox_size=0),
    rcnn=dict(
        score_thr=0.05,
        nms=dict(type='nms', iou_thr=0.5),
        max_per_img=100,
        mask_thr_binary=0.5))
# dataset settings
dataset_type = 'CocoDataset'
data_root = '/content/drive/My Drive/Mmdetection/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    imgs_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        ann_file='/content/drive/My Drive/chunk.json',
        img_prefix='/content/drive/My Drive/chunk_images/',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'VOC2007/test.json',
        img_prefix=data_root + 'VOC2007/Test/',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + 'VOC2007/test.json',
        img_prefix=data_root + 'VOC2007/Test/',
        pipeline=test_pipeline))
# evaluation = dict(interval=1, metric=['bbox'])
# optimizer
optimizer = dict(type='SGD', lr=0.0012, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    step=[16, 19])
checkpoint_config = dict(interval=1, create_symlink=False)
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
# runtime settings
total_epochs = 36
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = '/content/drive/My Drive/Mmdetection/new_chunk_cascade_mask_rcnn_hrnetv2p_w32_20e'
load_from = None
resume_from = '/content/drive/My Drive/Mmdetection/new_chunk_cascade_mask_rcnn_hrnetv2p_w32_20e/epoch_30.pth'
workflow = [('train', 1)]
--------------------------------------------------------------------------------
/CascadeTab/Functions/blessFunc.py:
--------------------------------------------------------------------------------
import numpy as np
import cv2
from Functions.borderFunc import extract_table
import lxml.etree as etree
import os


## Input : ROI of one cell
## Output : bounding boxes for the text in that cell
def extractTextBless(img):
    return_arr = []
    h, w = img.shape[0:2]
    base_size = h + 14, w + 14, 3
    # pad the cell with a white border so text touching the edges survives dilation
    img_np = np.zeros(base_size, dtype=np.uint8)
    cv2.rectangle(img_np, (0, 0), (w + 14, h + 14), (255, 255, 255), 30)
    img_np[7:h + 7, 7:w + 7] = img

    gray = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
    # blur = cv2.GaussianBlur(gray,(5,5),0)
    ret, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 1))
    dilation = cv2.dilate(thresh1, rect_kernel, iterations=2)

    contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    for cnt in contours:
        if cv2.contourArea(cnt) < 20:
            continue
        x, y, w, h = cv2.boundingRect(cnt)
        if (h < 6) or w < 4 or h / img.shape[0] > 0.95 or h > 30:
            continue
        return_arr.append([x - 7, y - 7, w, h])
    return return_arr


## Input : ROI of the table, original image, detected cells
## Output : returns an XML element which contains the bounding boxes of the text chunks
def borderless(table, image, res_cells):
    cells = []
    x_lines = []
    y_lines = []
    # padding of the table
    print(table)
    table[0], table[1], table[2], table[3] = table[0] - 15, table[1] - 15, table[2] + 15, table[3] + 15
    print(table[0])
    print(table[1])
    print(table[2])
    print(table[3])

    # extracting the cells that might belong to that table
    for cell in res_cells:
        if cell[0]>table[0]-50 and cell[1]>table[1]-50 and cell[2]last-15) or (cell[3]temp-15):
            if cell[3] > temp:
                temp = cell[3]
        else:
            last = cell[1]
            if last > temp:
                row.append((last + temp) // 2)
                if prev is not None:
                    if ((last + temp) // 2) < prev + 10 or ((last + temp) // 2) < prev - 10:
                        row.pop()
                prev = (last + temp) // 2
            temp = cell[3]

    row.append(table[3] + 50)
    i = 1
    rows = []
    for r in range(len(row)):
        rows.append([])
    # rows creates an empty matrix with the no. of entries equal to the no. of rows in the table
    final_rows = rows
    maxr = -111
    # print(len(row))
    # It stores all the cells to the specific rows
    for cell in cells:
        if cell[3]=prevr[0]-5) or (r[2]<=prevr[2]+5 and r[2]>=prevr[2]-5):
        if r[4]lasty[n][1]:
            lasty[n][1] = r[3]
    # print("last y:", lasty)

    # taking the mid value of the prev y2 and the current y1 in a row so as to find the right y coordinate for the row line
    row = []
    row.append(table[1])
    prev = None
    pr = None
    for x in range(len(lasty) - 1):
        if x == 0 and prev == None:
            prev = lasty[x]
        else:
            if pr is not None:
                if abs(((lasty[x][0] + prev[1]) // 2) - pr) <= 10:
                    row.pop()
                    row.append((lasty[x][0] + prev[1]) // 2)
                else:
                    row.append((lasty[x][0] + prev[1]) // 2)
            else:
                row.append((lasty[x][0] + prev[1]) // 2)
            pr = (lasty[x][0] + prev[1]) // 2
            prev = lasty[x]
    row.append(table[3])

    # finding the max. no. of cells in all the rows, which is equal to the number of columns
    maxr = 0
    for r2 in final_rows:
        print(r2)
        if len(r2) > maxr:
            maxr = len(r2)


    lastx = []
    # acc. to the x1 and x2 coordinates of each cell in a row, finding the x1 and x2 coordinates for each column
    for n in range(maxr):
        lastx.append([999999999, 0])

    for r2 in final_rows:
        if len(r2) == maxr:
            for n, col in enumerate(r2):
                # print(col)
                if col[2] > lastx[n][1]:
                    lastx[n][1] = col[2]
                if col[0]r2[r][0]):
                    r += 1
                if n != 0:
                    if r2[r-1][0] > lastx[n-1][1]:
                        if r2[r-1][0]lastx[n][1]:
                            lastx[n][1] = col[2]

    # for each column taking the mid value of prev x2 and current x1 so as to draw x1 lines
    print(lastx)
    col = np.zeros(maxr + 1)
    col[0] = table[0]
    prev = 0
    i = 1
    for x in range(len(lastx)):
        if x == 0:
            prev = lastx[x]
        else:
            col[i] = (lastx[x][0] + prev[1]) // 2
            i += 1
            prev = lastx[x]
    col = col.astype(int)
    col[maxr] = table[2]

    _row_ = sorted(row, key=lambda x: x)
    _col_ = sorted(col, key=lambda x: x)

    # drawing lines acc. to the values (drawing the row and the column lines)
    for no, c in enumerate(_col_):
        x_lines.append([c, table[1], c, table[3]])
        cv2.line(im2, (c, table[1]), (c, table[3]), (255, 0, 0), 1)
    for no, c in enumerate(_row_):
        y_lines.append([table[0], c, table[2], c])
        cv2.line(im2, (table[0], c), (table[2], c), (255, 0, 0), 1)

    # cv2_imshow(im2)
    print("table:", table)
    # for r in row:
    #     cv2.line(im2,(r,table[1]),(r,table[3]),(0,255,0),1)
    # for c in col:
    #     cv2.line(im2,(c,table[1]),(c,table[3]),(0,255,0),1)
    final = extract_table(image[table[1]:table[3], table[0]:table[2]], 0, (y_lines, x_lines))

    cellBoxes = []
    img4 = image.copy()
    for box in final:
        cellBox = extractTextBless(image[box[1]:box[3], box[0]:box[4]])
        for cell in cellBox:
            cellBoxes.append([box[0] + cell[0], box[1] + cell[1], cell[2], cell[3]])
            cv2.rectangle(img4, (box[0] + cell[0], box[1] + cell[1]), (box[0] + cell[0] + cell[2], box[1] + cell[1] + cell[3]), (255, 0, 0), 2)

    # cv2_imshow(img4)

    the_last_y = -1
    cellBoxes = sorted(cellBoxes, key=lambda x: x[1])
    cellBoxes2BeMerged = []
    cellBoxes2BeMerged.append([])
    rowCnt = 0
    for cell in cellBoxes:
        if (the_last_y == -1):
            the_last_y = cell[1]
            cellBoxes2BeMerged[rowCnt].append(cell)
            continue
        if (abs(cell[1] - the_last_y) < 8):
            cellBoxes2BeMerged[rowCnt].append(cell)
        else:
            the_last_y = cell[1]
            rowCnt += 1
            cellBoxes2BeMerged.append([])
            cellBoxes2BeMerged[rowCnt].append(cell)

    MergedBoxes = []
    for cellrow in cellBoxes2BeMerged:
        cellrow = sorted(cellrow, key=lambda x: x[0])
        cur_cell = -1
        for c, cell in enumerate(cellrow):
            if (cur_cell == -1):
                cur_cell = cell
                continue
            if (len(cellrow) == 1):
                MergedBoxes.append(cell)
                break
            if (abs((cur_cell[0] + cur_cell[2]) - cell[0]) < 10):
                cur_cell[2] = cur_cell[2] + cell[2] + (cell[0] - (cur_cell[0] + cur_cell[2]))
                if(cur_cell[3]= box[0] and mbox[1] >= box[1] and mbox[2] <= box[4] and mbox[3] <= box[3]):
                if (len(tcurcell) == 0):
                    tcurcell = mbox
                else:
                    if (mbox[0] < tcurcell[0]):
                        tcurcell[0] = mbox[0]
                    if (mbox[1] < tcurcell[1]):
                        tcurcell[1] = mbox[1]
                    if (mbox[2] > tcurcell[2]):
                        tcurcell[2] = mbox[2]
                    if (mbox[3] > tcurcell[3]):
                        tcurcell[3] = mbox[3]

        for i, frow in enumerate(final_rows):
            for j, fbox in enumerate(frow):
                if (fbox[0] >= box[0] and fbox[0] <= box[4] and fbox[1] >= box[1] and fbox[1] <= box[3]):
                    mcurcell = fbox
                    final_rows[i].pop(j)
                    break

        if (abs(ycnt - box[1]) > 10):
            rcnt += 1
            TextChunks.append([])
            ycnt = box[1]

        if (len(tcurcell) == 0):
            if (len(mcurcell) == 0):
                continue
            else:
                TextChunks[rcnt].append(mcurcell)
        else:
            if (len(mcurcell) == 0):
                TextChunks[rcnt].append(tcurcell)
            else:
                if (abs(mcurcell[0] - tcurcell[0]) <= 20 and abs(mcurcell[1] - tcurcell[1]) <= 20 and abs(mcurcell[2] - tcurcell[2]) <= 20 and abs(mcurcell[3] - tcurcell[3]) <= 20):
                    TextChunks[rcnt].append(tcurcell)
                elif ((abs(mcurcell[0] - tcurcell[0]) <= 20 and abs(mcurcell[2] - tcurcell[2]) <= 20) or (abs(mcurcell[1] - tcurcell[1]) <= 20 or abs(mcurcell[3] - tcurcell[3]) <= 20)):
                    TextChunks[rcnt].append(mcurcell)
                else:
                    TextChunks[rcnt].append(tcurcell)

    colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (125, 125, 0), (0, 255, 255)]
    for no, r in enumerate(TextChunks):
        for tbox in r:
            cv2.rectangle(im2, (tbox[0], tbox[1]), (tbox[2], tbox[3]), colors[no % len(colors)], 1)
            # print(tbox)
    # cv2_imshow("text chunks", im2)
    # cv2.waitKey(0)

    def rowstart(val):
        r = 0
        # guard against running past the last grid line
        while (r < len(_row_) and val > _row_[r]):
            r += 1
        if r - 1 == -1:
            return r
        else:
            return r - 1

    def rowend(val):
        r = 0
        while (r < len(_row_) and val > _row_[r]):
            r += 1
        if r - 1 == -1:
            return r
        else:
            return r - 1

    def colstart(val):
        r = 0
        while (r < len(_col_) and val > _col_[r]):
            r += 1
        if r - 1 == -1:
            return r
        else:
            return r - 1

    def colend(val):
        r = 0
        while (r < len(_col_) and val > _col_[r]):
            r += 1
        if r - 1 == -1:
            return r
        else:
            return r - 1

    tableXML = etree.Element("table")
    Tcoords = etree.Element("Coords", points=str(table[0]) + "," + str(table[1]) + " " + str(table[0]) + "," + str(table[3]) + " " + str(table[2]) + "," + str(table[3]) + " " + str(table[2]) + "," + str(table[1]))
    tableXML.append(Tcoords)
    for final in TextChunks:
        for box in final:
            cell = etree.Element("cell")
            end_col, end_row, start_col, start_row = colend(box[2]), rowend(box[3]), colstart(box[0]), rowstart(box[1])
            cell.set("end-col", str(end_col))
            cell.set("end-row", str(end_row))
            cell.set("start-col", str(start_col))
            cell.set("start-row", str(start_row))

            # print(cellBox)
            one = str(box[0]) + "," + str(box[1])
            two = str(box[0]) + "," + str(box[3])
            three = str(box[2]) + "," + str(box[3])
            four = str(box[2]) + "," + str(box[1])
            # print(one)
            coords = etree.Element("Coords", points=one + " " + two + " " + three + " " + four)

            cell.append(coords)
            tableXML.append(cell)

    return tableXML
--------------------------------------------------------------------------------
/CascadeTab/Functions/borderFunc.py:
--------------------------------------------------------------------------------
import cv2
from Functions.line_detection import line_detection
import os

################## Functions required for Border table Recognition #################

## Return the intersection of lines only if an intersection is present ##
# Input : x1, y1, x2, y2, x3, y3, x4, y4 (line 1: vertical, line 2: horizontal)
# Output : (x, y) intersection point
def line_intersection(x1, y1, x2, y2, x3, y3, x4, y4):
    # print(x1, y1, x2, y2)
    # print(x3, y3, x4, y4)

    if ((x1 >= x3-5 or x1 >= x3+5) and (x1 <= x4+5 or x1 <= x4-5) and (y3+8 >= min(y1, y2) or y3-5 >= min(y1, y2)) and y3 <= max(y1, y2)+5):
        return x1, y3


## main extraction function ##
# Input : Image, decision parameter (1/0), lines for borderless (only if the decision parameter is 0)
# Output : Array of cells
def extract_table(table_body, __line__, lines=None):
    # Deciding variable
    # print(table_body)
    print(__line__)
    if (__line__ == 1):
        # Check whether the table image is bordered or borderless
        temp_lines_hor, temp_lines_ver = line_detection(table_body)
        print("temp_lines_hor", temp_lines_hor)
        print("temp_lines_ver", temp_lines_ver)
    else:
        temp_lines_hor, temp_lines_ver = lines

    if (temp_lines_hor is None) or (temp_lines_ver is None):
        print("Either Horizontal Or Vertical Lines Not Detected")
        return None

    table = table_body.copy()
    x = 0
    y = 0
    k = 0
    points = []
    print("[Table status] : Processing table with lines")
    # Remove same lines detected closer
    for x1, y1, x2, y2 in temp_lines_ver:
        point = []
        for x3, y3, x4, y4 in temp_lines_hor:
            try:
                k += 1
                x, y = line_intersection(x1, y1, x2, y2, x3, y3, x4, y4)
                point.append([x, y])
            except:
                continue
        points.append(point)

    for point in points:
        for x, y in point:
            cv2.line(table, (x, y), (x, y), (0, 0, 255), 8)

    '''directory = '/content/drive/My Drive/Optum/Dataset/images'
    os.chdir(directory)

    filename = "intersection.jpg"
    cv2.imwrite(filename, table)'''
    # cv2_imshow("intersection",table)
    # cv2.waitKey(0)

    # boxno = -1
    box = []
    flag = 1
    lastCache = []
    ## creating bounding boxes of cells from the points detected
    ## This is still under work and might fail on some images
    for i, row in enumerate(points):
        limitj = len(row)
        currentVala = []
        for j, col in enumerate(row):

            if (j == limitj - 1):
                break
            if (i == 0):
                nextcol = row[j + 1]
                lastCache.append([col[0], col[1], nextcol[0], nextcol[1], 9999, 9999, 9999, 9999])
            else:
                nextcol = row[j + 1]
                currentVala.append([col[0], col[1], nextcol[0], nextcol[1], 9999, 9999, 9999, 9999])
                # Matching
                flag = 1
                index = []
                for k, last in enumerate(lastCache):

                    if (col[1] == last[1]) and lastCache[k][4] == 9999:
                        lastCache[k][4] = col[0]
                        lastCache[k][5] = col[1]
                        if lastCache[k][4] != 9999 and lastCache[k][6] != 9999:
                            box.append(lastCache[k])
                            index.append(k)
                            flag = 1

                    if (nextcol[1] == last[3]) and lastCache[k][6] == 9999:
                        lastCache[k][6] = nextcol[0]
                        lastCache[k][7] = nextcol[1]
                        if lastCache[k][4] != 9999 and lastCache[k][6] != 9999:
                            box.append(lastCache[k])
                            index.append(k)
                            flag = 1

                if len(lastCache) != 0:
                    if lastCache[k][4] == 9999 or lastCache[k][6] == 9999:
                        flag = 0
                # print(index)
                for k in index:
                    lastCache.pop(k)
        # transferring
        if flag == 0:
            for last in lastCache:
                if last[4] == 9999 or last[6] == 9999:
                    currentVala.append(last)

        if (i != 0):
            lastCache = currentVala

    ## Visualizing the cells ##
    count = 1
    for i in box:
        cv2.rectangle(table_body, (i[0], i[1]), (i[6], i[7]), (int(i[7] % 255), 0, int(i[0] % 255)), 2)
        count += 1

    '''directory = '/content/drive/My Drive/Optum/Dataset/images'
    os.chdir(directory)

    filename = "cells.jpg"
    cv2.imwrite(filename, table_body)'''
    # cv2_imshow("cells",table_body)
    # cv2.waitKey(0)
    ############################
    return box
# extract_table(cv2.imread("E:\\KSK\\KSK ML\\KSK PAPERS\\TabXNet\\For Git\\images\\table.PNG"),1,lines=None)


def findX(X, x):
    return X.index(x)

def findY(Y, y):
    return Y.index(y)

def span(box, X, Y):
    start_col = findX(X, box[0])    ## x1
    end_col = findX(X, box[4]) - 1  ## x3
    start_row = findY(Y, box[1])    ## y1
    end_row = findY(Y, box[3]) - 1  ## y2
    # print(end_col,end_row,start_col,start_row)
    return end_col, end_row, start_col, start_row


def extractText(img):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
    # cv2_imshow(thresh1)
    rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
    dilation = cv2.dilate(thresh1, rect_kernel, iterations=2)
    contours, _ = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    im2 = img.copy()
    mx, my, mw, mh = float('Inf'), float('Inf'), -1, -1
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        # print(im2.shape)
        if x < 2 or y < 2 or (x + w >= im2.shape[1] - 1 and y + h >= im2.shape[0] - 1) or w >= im2.shape[1] - 1 or h >= im2.shape[0] - 1:
            continue
        # track the min/max extents of all text contours
        if x < mx:
            mx = x
        if y < my:
            my = y
        if x + w > mw:
            mw = x + w
        if y + h > mh:
            mh = y + h
        # print(x, y, w, h)

    if mx != float('Inf') and my != float('Inf'):
        # Drawing a rectangle on copied image
        # rect = cv2.rectangle(im2, (mx+1, my), (mw-2, mh-2), (0, 255, 0), 1)
        # cv2_imshow(im2)
        return mx, my, mw, mh
    else:
        return None
--------------------------------------------------------------------------------
/CascadeTab/Functions/line_detection.py:
--------------------------------------------------------------------------------
import cv2
import numpy as np
import os
import random

# Input : Image
# Output : hor, ver
def line_detection(image):

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, 1)
    bw = cv2.bitwise_not(bw)

    '''directory = '/content/drive/My Drive/Optum/Dataset/images'
    os.chdir(directory)
    n = random.random()
    print ("Random number generated is",n)

    filename = str(n) + "after_threshold.jpg"
    cv2.imwrite(filename, bw)'''
    ## To visualize the image after thresholding ##
    # cv2_imshow("bw",bw)
    # cv2.waitKey(0)
    ###########################################
    horizontal = bw.copy()
    vertical = bw.copy()
    img = image.copy()
    # [horizontal lines]
    # Create structure element for extracting horizontal lines through morphology operations
    horizontalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 1))

    # Apply morphology operations
    horizontal = cv2.erode(horizontal, horizontalStructure)
    horizontal = cv2.dilate(horizontal, horizontalStructure)

    horizontal = cv2.dilate(horizontal, (1, 1), iterations=5)
    horizontal = cv2.erode(horizontal, (1, 1), iterations=5)

    '''directory = '/content/drive/My Drive/Optum/Dataset/images'
    os.chdir(directory)
    n = random.random()
    print ("Random number generated is",n)

    filename = str(n) + "hor_lines_highlighted.jpg"
    cv2.imwrite(filename, horizontal)'''

    ## Uncomment to visualize highlighted horizontal lines
    # cv2_imshow("horizontal",horizontal)
    # cv2.waitKey(0)

    # HoughLinesP function to detect horizontal lines
    hor_lines = cv2.HoughLinesP(horizontal, rho=1, theta=np.pi/180, threshold=100, minLineLength=30, maxLineGap=3)
    if hor_lines is None:
        return None, None
    temp_line = []
    for line in hor_lines:
        for x1, y1, x2, y2 in line:
            temp_line.append([x1, y1 - 5, x2, y2 - 5])

    # Sorting the list of detected lines by y1
    hor_lines = sorted(temp_line, key=lambda x: x[1])
    print("hor_lines", hor_lines)


    ## Uncomment this part to visualize the lines detected on the image ##
    print(len(hor_lines))
    for x1, y1, x2, y2 in hor_lines:
        cv2.line(image, (x1, y1), (x2, y2), (0, 255, 0), 1)


    '''directory = '/content/drive/My Drive/Optum/Dataset/images'
    os.chdir(directory)
    n = random.random()
    print ("Random number generated is",n)

    filename = str(n) + "hor_lines_detected.jpg"
    cv2.imwrite(filename, image)'''
    # print(image.shape)
    # cv2_imshow("image",image)
    # cv2.waitKey(0)
    ####################################################################

    ## Selection of best lines from all the horizontal lines detected ##
    lasty1 = -111111
    lines_x1 = []
    lines_x2 = []
    hor = []
    i = 0
    for x1, y1, x2, y2 in hor_lines:
        if y1 >= lasty1 and y1 <= lasty1 + 10:
            lines_x1.append(x1)
            lines_x2.append(x2)
        else:
            if (i != 0 and len(lines_x1) != 0):
                hor.append([min(lines_x1), lasty1, max(lines_x2), lasty1])
            lasty1 = y1
            lines_x1 = []
            lines_x2 = []
            lines_x1.append(x1)
            lines_x2.append(x2)
        i += 1
    hor.append([min(lines_x1), lasty1, max(lines_x2), lasty1])
    #####################################################################


    # [vertical lines]
    # Create structure element for extracting vertical lines through morphology operations
    verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 15))

    # Apply morphology operations
    vertical = cv2.erode(vertical, verticalStructure)
    vertical = cv2.dilate(vertical, verticalStructure)

    vertical = cv2.dilate(vertical, (1, 1), iterations=8)
    vertical = cv2.erode(vertical, (1, 1), iterations=7)

    ######## Preprocessing Vertical Lines ###############
    # cv2_imshow("vertical",vertical)
    # cv2.waitKey(0)
    #####################################################

    # HoughLinesP function to detect vertical lines
    # ver_lines = cv2.HoughLinesP(vertical,rho=1,theta=np.pi/180,threshold=20,minLineLength=20,maxLineGap=2)
    ver_lines = cv2.HoughLinesP(vertical, 1, np.pi/180, 20, np.array([]), 20, 2)
    if ver_lines is None:
        return None, None
    temp_line = []
    for line in ver_lines:
        for x1, y1, x2, y2 in line:
            temp_line.append([x1, y1, x2, y2])

    # Sorting the list of detected lines by x1
    ver_lines = sorted(temp_line, key=lambda x: x[0])
    print("ver_lines", ver_lines)
    ## Uncomment this part to visualize the lines detected on the image ##
    print(len(ver_lines))
    for x1, y1, x2, y2 in ver_lines:
        cv2.line(image, (x1, y1 - 5), (x2, y2 - 5), (0, 255, 0), 1)


    '''directory = '/content/drive/My Drive/Optum/Dataset/images'
    os.chdir(directory)
    n = random.random()
    print ("Random number generated is",n)

    filename = str(n) + "ver_lines_detected.jpg"
    cv2.imwrite(filename, image)'''

    # print(image.shape)
    # cv2_imshow("image",image)
    # cv2.waitKey(0)
    ####################################################################

    ## Selection of best lines from all the vertical lines detected ##
    lastx1 = -111111
    lines_y1 = []
    lines_y2 = []
    ver = []
    count = 0
    lasty1 = -11111
    lasty2 = -11111
    for x1, y1, x2, y2 in ver_lines:
        if x1 >= lastx1 and x1 <= lastx1 + 15 and not (((min(y1,y2)table[0]-5 and box[1]>table[1]-5 and box[2].85:
        res_border.append(r[:4].astype(int))

    ## for cells
    for r in result[0][1]:
        print("2.", r[4])
        if r[4] > .85:
            r[4] = r[4] * 100
            res_cell.append(r.astype(int))

    ## for borderless
    for r in result[0][2]:
        print("3.", r[4])
        if r[4] > .85:
            res_bless.append(r[:4].astype(int))

    print("res_border", res_border)
    print("res_cell", res_cell)
    print("res_bless", res_bless)
    ## if border tables detected
    if len(res_border) != 0:
        ## call the border script for each table in the image
        for res in res_border:
            try:
                root.append(border(res, cv2.imread(i)))
            except:
                pass
    if len(res_bless) != 0:
        if len(res_cell) != 0:
            for no, res in enumerate(res_bless):
                root.append(borderless(res, cv2.imread(i), res_cell))

    myfile = open(xmlPath + i.split('/')[-1][:-3] + 'xml', "w")
    myfile.write('\n')
    myfile.write(etree.tostring(root, pretty_print=True, encoding="unicode"))
    myfile.close()
--------------------------------------------------------------------------------
/Data Preparation/Dilation.py:
--------------------------------------------------------------------------------
# Note: Image name will be stored as "Dilation_OriginalName" to avoid conflict

import cv2
import glob
import numpy as np

# DEFINE THE PATH
print("Entered")
PATH_TO_DEST = "/home/prakhar/try/Dilated Image/"
PATH_TO_ORIGINAL_IMAGES = "/home/prakhar/PublayNet/val/publaynet/val/"

# If the source directory has files other than images, use the image extension
# to get the files (for example *.png)
img_files = glob.glob(PATH_TO_ORIGINAL_IMAGES + "*.*")
total = len(img_files)

# 2x2 static kernel
kernel = np.ones((2, 2), np.uint8)

for count, i in enumerate(img_files):
    print(count)
    print(i)
    image_name = i.split("/")[-1]
    print("Progress : ", count, "/", total)
    img = cv2.imread(i, 0)
    _, mask = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY_INV)
    dst = cv2.dilate(mask, kernel, iterations=1)
    dst = cv2.bitwise_not(dst)
    cv2.imwrite(PATH_TO_DEST + "/Dilation_" + image_name, dst)
--------------------------------------------------------------------------------
/Data Preparation/Images/3img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Prakhar-97/Table-detection-and-Document-layout-analysis/bfcd189ee9edb603b734cd07d965a7400b85f820/Data Preparation/Images/3img.png
--------------------------------------------------------------------------------
/Data Preparation/Smudge.py:
--------------------------------------------------------------------------------
# Note: Image name will be stored as "Smudge_OriginalName" to avoid conflict
import cv2
import numpy as np
import glob

def basicTransform(img):
    _, mask = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY_INV)
    img = cv2.bitwise_not(mask)
    return img

PATH_TO_DEST = "/home/prakhar/try/Smudged Image/"
PATH_TO_ORIGINAL_IMAGES = "/home/prakhar/PublayNet/val/publaynet/val/"

img_files = glob.glob(PATH_TO_ORIGINAL_IMAGES + "*.*")

total = len(img_files)
for count, i in enumerate(img_files):
    image_name = i.split("/")[-1]
    print("Progress : ", count, "/", total)
    img = cv2.imread(i)

    # Split the 3 channels into Blue, Green and Red
    b, g, r = cv2.split(img)

    # Apply the basic transformation
    b = basicTransform(b)
    r = basicTransform(r)
    g = basicTransform(g)

    # Perform the distance transform algorithm
    b = cv2.distanceTransform(b, cv2.DIST_L2, 5)  # EUCLIDEAN
    g = cv2.distanceTransform(g, cv2.DIST_L1, 5)  # LINEAR
    r = cv2.distanceTransform(r, cv2.DIST_C, 5)   # MAX

    # Normalize
    r = cv2.normalize(r, r, 0, 1.0, cv2.NORM_MINMAX)
    g = cv2.normalize(g, g, 0, 1.0, cv2.NORM_MINMAX)
    b = cv2.normalize(b, b, 0, 1.0, cv2.NORM_MINMAX)

    # Merge the channels
    dist = cv2.merge((b, g, r))
    dist = cv2.normalize(dist, dist, 0, 4.0, cv2.NORM_MINMAX)
    dist = cv2.cvtColor(dist, cv2.COLOR_BGR2GRAY)

    # In order to save as jpg or png, we need to handle the data
    # format of the image
    data = dist.astype(np.float64) / 4.0
    data = 1800 * data  # Now scale by 1800
    dist = data.astype(np.uint16)

    # Save to destination
    cv2.imwrite(PATH_TO_DEST + "/Smudge_" + image_name, dist)
--------------------------------------------------------------------------------
/Document layout analysis/Config file/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py:
--------------------------------------------------------------------------------
model = dict(
    type='CascadeRCNN',
    pretrained='open-mmlab://msra/hrnetv2_w32',
    backbone=dict(
        type='HRNet',
        extra=dict(
            stage1=dict(
                num_modules=1,
                num_branches=1,
                block='BOTTLENECK',
                num_blocks=(4, ),
                num_channels=(64, )),
            stage2=dict(
                num_modules=1,
                num_branches=2,
                block='BASIC',
                num_blocks=(4, 4),
                num_channels=(32, 64)),
            stage3=dict(
                num_modules=4,
                num_branches=3,
                block='BASIC',
                num_blocks=(4, 4, 4),
                num_channels=(32, 64, 128)),
            stage4=dict(
                num_modules=3,
                num_branches=4,
                block='BASIC',
                num_blocks=(4, 4, 4, 4),
                num_channels=(32, 64, 128, 256)))),
    neck=dict(type='HRFPN', in_channels=[32, 64, 128, 256], out_channels=256),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[0.0, 0.0, 0.0, 0.0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(
            type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=1.0)),
    roi_head=dict(
        type='CascadeRoIHead',
        num_stages=3,
        stage_loss_weights=[1, 0.5, 0.25],
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        bbox_head=[
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0.0, 0.0, 0.0, 0.0],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0.0, 0.0, 0.0, 0.0],
                    target_stds=[0.05, 0.05, 0.1, 0.1]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
                               loss_weight=1.0)),
            dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=80,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0.0, 0.0, 0.0, 0.0],
                    target_stds=[0.033, 0.033, 0.067, 0.067]),
                reg_class_agnostic=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=1.0),
                loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
        ],
        mask_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', out_size=14, sample_num=0),
            out_channels=256,
            featmap_strides=[4, 8, 16, 32]),
        mask_head=dict(
            type='FCNMaskHead',
            num_convs=4,
            in_channels=256,
            conv_out_channels=256,
            num_classes=80,
            loss_mask=dict(
                type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))))
train_cfg = dict(
    rpn=dict(
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.7,
            neg_iou_thr=0.3,
            min_pos_iou=0.3,
            match_low_quality=True,
            ignore_iof_thr=-1),
        sampler=dict(
            type='RandomSampler',
            num=256,
            pos_fraction=0.5,
            neg_pos_ub=-1,
            add_gt_as_proposals=False),
        allowed_border=0,
        pos_weight=-1,
        debug=False),
    rpn_proposal=dict(
        nms_across_levels=False,
        nms_pre=2000,
        nms_post=2000,
        max_num=2000,
        nms_thr=0.7,
        min_bbox_size=0),
    rcnn=[
        dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False),
        dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.6,
                neg_iou_thr=0.6,
                min_pos_iou=0.6,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False),
        dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.7,
                min_pos_iou=0.7,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            mask_size=28,
            pos_weight=-1,
            debug=False)
    ])
test_cfg = dict(
    rpn=dict(
        nms_across_levels=False,
        nms_pre=1000,
        nms_post=1000,
        max_num=1000,
        nms_thr=0.7,
        min_bbox_size=0),
    rcnn=dict(
        score_thr=0.05,
        nms=dict(type='nms', iou_thr=0.5),
        max_per_img=100,
        mask_thr_binary=0.5))
dataset_type = 'CocoDataset'
data_root = '/content/drive/My Drive/all_data/'
classes = ('text', 'title', 'list', 'table', 'figure')
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='Normalize',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_rgb=True),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='CocoDataset',
        ann_file=
        '/content/drive/My Drive/all_data/annotations/train_publaynet.json',
        img_prefix='/content/drive/My Drive/all_data/train/',
        classes=('text', 'title', 'list', 'table', 'figure'),
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
            dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
            dict(type='RandomFlip', flip_ratio=0.5),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size_divisor=32),
            dict(type='DefaultFormatBundle'),
            dict(
                type='Collect',
                keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
        ]),
    val=dict(
        type='CocoDataset',
        ann_file=
        '/content/drive/My Drive/all_data/annotations/val_publaynet.json',
        img_prefix='/content/drive/My Drive/all_data/validation/',
        classes=('text', 'title', 'list', 'table', 'figure'),
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(1333, 800),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='Pad', size_divisor=32),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]),
    test=dict(
        type='CocoDataset',
        ann_file=
        '/content/drive/My Drive/all_data/annotations/val_publaynet.json',
        img_prefix='/content/drive/My Drive/all_data/validation/',
        classes=('text', 'title', 'list', 'table', 'figure'),
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(1333, 800),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='Pad', size_divisor=32),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]))
evaluation = dict(interval=1, metric=['bbox', 'segm'])
optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.3333333333333333,
    step=[16, 19])
total_epochs = 20
checkpoint_config = dict(interval=1, create_symlink=False)
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = '/content/drive/My Drive/mmdetection/tools/work_dirs/cascade_mask_rcnn_hrnetv2p_w32_20e_coco/epoch_9.pth'
workflow = [('train', 1)]
work_dir = './work_dirs/cascade_mask_rcnn_hrnetv2p_w32_20e_coco'
gpu_ids = range(0, 1)
--------------------------------------------------------------------------------
/Document layout analysis/ICDAR_XML_to_COCO.py:
--------------------------------------------------------------------------------
from pdf2image import convert_from_path
from pdf2image.exceptions import (PDFInfoNotInstalledError, PDFPageCountError, PDFSyntaxError)
from bs4 import BeautifulSoup as bs
import glob
import os
import cv2
import numpy as np
import pprint
import pickle
import json

pdf_path = '/home/prakhar/mmdetection/convert2pdf/all pdfs/*'
xml_path = '/home/prakhar/mmdetection/convert2pdf/all xmls/*'
pdfs = glob.glob(pdf_path)
xmls = glob.glob(xml_path)
a = 1
b = 0
img_list = []
ann_list = []
cate_list = []
super_dict = {}
categories = {}

for i in pdfs:

    print(i)
    tail_pdf = os.path.split(i)
    name_pdf = os.path.splitext(tail_pdf[1])
    # print(tail_pdf[1])
    # print(name_pdf[0])

    for j in xmls:

        tail_xml = os.path.split(j)
        name_xml = os.path.splitext(tail_xml[1])
        # print(tail_xml[1])
        # print(name_xml[0])

        if (name_pdf[0] + "-reg" == name_xml[0]):
            pages = convert_from_path(i)

            for k, page in enumerate(pages):

                image = {}

                fname = name_pdf[0] + "_page_" + str(k + 1) + ".png"
                print(j)

                content = []
                with open(j, 'r') as file:
                    # import pdb ; pdb.set_trace()

                    content = file.readlines()
                    content = "".join(content)
                    bs_content = bs(content, "lxml")

                    table = bs_content.find_all("region")
                    # print(table)
                    coords = []

                    for p in table:

                        ann = {}
                        masks = []
                        num = p["page"]
                        # print(num)
                        # print(k+1)
                        if num == str(k + 1):

                            b = b + 1
                            length = len(p.contents)
                            bbox = p.contents[length - 2]

                            x1 = int(bbox["x1"])
                            # print(x1)
                            y1 = int(bbox["y1"])
                            w = int(bbox["x2"]) - int(bbox["x1"])
                            # print(w)
                            h = int(bbox["y2"]) - int(bbox["y1"])
                            # print(h)

                            coords = [x1, y1, w, h]
                            mask = [x1, y1, x1, y1 + h, x1 + w, y1 + h, x1 + w, y1]
                            masks.append(mask)

                            ann["area"] = float(w * h)
                            ann["bbox"] = coords
                            ann["segmentation"] = masks
                            ann["category_id"] = 1
                            ann["image_id"] = a
                            ann["id"] = b
                            ann["iscrowd"] = 0
                            ann["ignore"] = 0
                            ann_list.append(ann)

                    if len(coords) > 0:

                        page.save(fname, "PNG")

                        img = cv2.imread(fname)
                        dimensions = img.shape
                        height = img.shape[0]
                        width = img.shape[1]
                        channels = img.shape[2]

                        # print('Image Dimension : ', dimensions)
                        # print('Image Height : ', height)
                        # print('Image Width : ', width)
                        # print('Number of Channels : ', channels)

                        image["file_name"] = fname
                        image["width"] = width
                        image["height"] = height
                        image["id"] = a

                        img_list.append(image)
                        a = a + 1

categories["id"] = 1
categories["name"] = "table"
cate_list.append(categories)

super_dict["annotations"] = ann_list
super_dict["categories"] = cate_list
super_dict["images"] = img_list
super_dict["type"] = "instances"

filename = 'dataset'
outfile = open(filename, 'wb')
pickle.dump(super_dict, outfile)
outfile.close()

with open("dataset.json", 'w') as outfile:
    json.dump(super_dict, outfile)
# print(pickled_object)
# unpickled_object = pickle.load(open(filename, 'rb'))
# print(unpickled_object)

# a = CustomDataset(pickle.loads(pickled_object))
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(super_dict)
--------------------------------------------------------------------------------
/Document layout analysis/main.py:
--------------------------------------------------------------------------------
from mmdet.apis import inference_detector, show_result_pyplot, init_detector
from mmdet.core import encode_mask_results, tensor2imgs
import cv2
import os

################################################### TO DO ###################################################
image_pth = 'Give the image path'

config_fname = "Give the config file path "
checkpoint_path = 'Give the checkpoint file path'
epoch = 'epoch_6.pth'

#############################################################################################################

model = init_detector(config_fname, checkpoint_path + epoch)
img = cv2.imread(image_pth)

result = inference_detector(model, img)
# print("The result is = ", result)

results = []
bbox_results, mask_results = result

res_text = []
res_title = []
res_list = []
res_table = []
res_figure = []
all_classes = []

# for text
for r in bbox_results[0]:
    if r[4] > .85:
        res_text.append(r[:4].astype(int))

print("No. of paragraphs on the page are == ", len(res_text))
all_classes.append(res_text)

# for title
for r in bbox_results[1]:
    if r[4] > .85:
        res_title.append(r[:4].astype(int))

print("No. of headers on the page are == ", len(res_title))
all_classes.append(res_title)

# for list
for r in bbox_results[2]:
    if r[4] > .85:
        res_list.append(r[:4].astype(int))

print("No. of lists on the page are == ", len(res_list))
all_classes.append(res_list)

# for table
for r in bbox_results[3]:
    if r[4] > .85:
        res_table.append(r[:4].astype(int))

print("No. of the tables on the page are == ", len(res_table))
all_classes.append(res_table)

# for figure
for r in bbox_results[4]:
    if r[4] > .85:
        res_figure.append(r[:4].astype(int))

print("No. of figures on the page are == ", len(res_figure))
all_classes.append(res_figure)

im2 = img.copy()
for count, category in enumerate(all_classes):
    # print("The no. of bbox in these classes are == ", len(category))
    im1 = img.copy()
    colors = [(55, 255, 20), (0, 0, 255), (132, 240, 255), (0, 247, 255), (2, 2, 105)]
    filename = ["paragraph_boxes.jpg", "header_boxes.jpg", "list_boxes.jpg", "table_boxes.jpg", "figure_boxes.jpg"]

    for box in category:
        # print(count)
        # print(colors[count])
        cv2.rectangle(im1, (box[0], box[1]), (box[2], box[3]), colors[count], 2)
        cv2.rectangle(im2, (box[0], box[1]), (box[2], box[3]), colors[count], 2)

    directory = '/content/drive/My Drive/results'
    os.chdir(directory)
    name = filename[count]
    # print(name)
    cv2.imwrite(name, im1)

directory = '/content/drive/My Drive/results'
os.chdir(directory)
result_file = "all_annotations.jpg"
cv2.imwrite(result_file, im2)

encoded_mask_results = encode_mask_results(mask_results)
print("Encoded mask results are == ", encoded_mask_results)
result = bbox_results, encoded_mask_results

results.append(result)
--------------------------------------------------------------------------------
/Document layout analysis/test_train_split.py:
--------------------------------------------------------------------------------
import json
import sklearn
import os
import glob
import pprint
import shutil

img_source_dir = '/home/prakhar/Publaynet/Original_data'
train = '/home/prakhar/Publaynet/train'
val = '/home/prakhar/Publaynet/validation'
subdirs = []
ratio = 0.7
for subdir in os.listdir(img_source_dir):

    print(subdir)
    a = os.path.join(img_source_dir, subdir)
    subdirs.append(a)

print(subdirs)

elements = len(subdirs)
middle = int(elements * ratio)

train_list = subdirs[:middle]
val_list = subdirs[middle:]

for f in train_list:
    shutil.move(f, train)

for f in val_list:
    shutil.move(f, val)


train_path = '/home/prakhar/Publaynet/train/*'
val_path = '/home/prakhar/Publaynet/validation/*'

train_imgs = glob.glob(train_path)
# print(train_imgs)
val_imgs = glob.glob(val_path)
# print(val_imgs)

with open('/home/prakhar/Publaynet/Labels/val.json') as f:
    data = json.load(f)

# pp = pprint.PrettyPrinter(indent=4)
# pp.pprint(data)

def create_dict(imgs, data):

    train_ann = []
    name = []
    super_dict = {}
    total = len(imgs)

    for count, i in enumerate(imgs):

        print("Progress : ", count, "/", total)
        image_name = os.path.split(i)
        name_list = data["images"]

        for j in name_list:

            if j["file_name"] == image_name[1]:

                num = j["id"]
                ann = data["annotations"]
                name.append(j)

                for k in ann:

                    if k["image_id"] == num:

                        train_ann.append(k)

    super_dict["annotations"] = train_ann
    super_dict["images"] = name
    super_dict["categories"] = data["categories"]

    return super_dict

print("For train")
train_dict = create_dict(train_imgs, data)
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(train_dict)
with open("train_publaynet.json", 'w') as outfile:
    json.dump(train_dict, outfile)

print("For val")
val_dict = create_dict(val_imgs, data)
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(val_dict)
with open("val_publaynet.json", 'w') as f:
    json.dump(val_dict, f)
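
# A quick sanity check on the split JSONs written above (a sketch, not part of
# the original script; it assumes the script was just run in this directory):
# every annotation in each split should point at an image kept in that split.
for split_file in ("train_publaynet.json", "val_publaynet.json"):
    with open(split_file) as check_f:
        split = json.load(check_f)
    kept_ids = {img["id"] for img in split["images"]}
    assert all(ann["image_id"] in kept_ids for ann in split["annotations"])
    print(split_file, "->", len(split["images"]), "images,",
          len(split["annotations"]), "annotations")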
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Prakhar-97

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Table-detection-and-Document-layout-analysis
## Introduction
Using state-of-the-art techniques for table detection and document layout analysis. For table detection we use MMDetection version 1.2, while for document layout analysis we use models that were developed in MMDetection version 2.0.

## Setup
Models are developed in the PyTorch-based MMDetection framework (version 2.0).
```
git clone https://github.com/open-mmlab/mmdetection.git
cd mmdetection
python setup.py install
python setup.py develop
pip install -r requirements.txt
```

## Image Augmentation
We have followed the Dilation and Smudge techniques for data augmentation.
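
Both transforms boil down to a few OpenCV calls; here is a minimal sketch of the dilation variant (the input path is a placeholder — see `Data Preparation/Dilation.py` and `Data Preparation/Smudge.py` for the full scripts):

```python
import cv2
import numpy as np

img = cv2.imread("page.png", 0)  # placeholder input page, read as grayscale

# Threshold the page, thicken the ink with a 2x2 kernel, then re-invert
_, mask = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY_INV)
dilated = cv2.bitwise_not(cv2.dilate(mask, np.ones((2, 2), np.uint8), iterations=1))
cv2.imwrite("Dilation_page.png", dilated)
```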
## Model Zoo
Config files for the models:

1. For table detection: Config_file
2. For document analysis: Config_file
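
These are plain MMDetection config files, so they can also be loaded and edited programmatically; a minimal sketch with `mmcv` (all paths below are placeholders):

```python
from mmcv import Config

cfg = Config.fromfile("cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py")
cfg.data.train.ann_file = "/path/to/train.json"       # placeholder
cfg.data.train.img_prefix = "/path/to/train_images/"  # placeholder
cfg.work_dir = "./work_dirs/my_experiment"            # placeholder
print(cfg.pretty_text)  # inspect the edited config before training
```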
Note: config paths only need to be changed during training.

Checkpoints of the models that have been trained:

| Model Name                   | Checkpoint File |
| ---------------------------- | --------------- |
| Table structure recognition  | Checkpoint      |
| Document layout analysis     | Checkpoint      |
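
With a config and its matching checkpoint downloaded, single-image inference follows the same pattern as the `main.py` scripts in this repository; a minimal sketch (both file paths are placeholders):

```python
from mmdet.apis import init_detector, inference_detector

model = init_detector("config_file.py", "checkpoint.pth", device="cuda:0")
result = inference_detector(model, "page.png")  # per-class boxes (and masks)
```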
## Datasets
1. Table detection and structure recognition:
   You can refer to Dataset to get a better understanding of the dataset.

2. Document layout analysis:
   You can refer to Dataset to get a better understanding of the dataset.

## Training

Refer to the two Colab notebooks that have been mentioned, as they will direct you through the steps that need to be followed. If using a custom dataset, do go through the MMDetection docs.
--------------------------------------------------------------------------------
/literature-survey.md:
--------------------------------------------------------------------------------
# Document layout analysis

## Datasets
[RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/)
* 37 GB
* image classification with 16 classes

[DocBank](https://arxiv.org/pdf/2006.01038v1.pdf)
* Yet to be released
* author, footer, section, title, abstract, list, paragraph, reference, caption, equation, figure, table

Other PubLayNet implementations
* [with torch's maskrcnn](https://github.com/phamquiluan/publaynet)
* [with detectron](https://github.com/hpanwar08/detectron2)
--------------------------------------------------------------------------------