├── CascadeTab
│   ├── Config
│   │   └── cascade_mask_rcnn_hrnetv2p_w32_20e.py
│   ├── Functions
│   │   ├── blessFunc.py
│   │   ├── borderFunc.py
│   │   └── line_detection.py
│   ├── border.py
│   └── main.py
├── Data Preparation
│   ├── Dilation.py
│   ├── Images
│   │   └── 3img.png
│   └── Smudge.py
├── Document layout analysis
│   ├── Config file
│   │   └── cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py
│   ├── ICDAR_XML_to_COCO.py
│   ├── main.py
│   └── test_train_split.py
├── LICENSE.md
├── README.md
├── Table_detection&Structure_recognition.ipynb
└── literature-survey.md
/CascadeTab/Config/cascade_mask_rcnn_hrnetv2p_w32_20e.py:
--------------------------------------------------------------------------------
1 | # model settings
2 | model = dict(
3 | type='CascadeRCNN',
4 | num_stages=3,
5 | pretrained='open-mmlab://msra/hrnetv2_w32',
6 | backbone=dict(
7 | type='HRNet',
8 | extra=dict(
9 | stage1=dict(
10 | num_modules=1,
11 | num_branches=1,
12 | block='BOTTLENECK',
13 | num_blocks=(4, ),
14 | num_channels=(64, )),
15 | stage2=dict(
16 | num_modules=1,
17 | num_branches=2,
18 | block='BASIC',
19 | num_blocks=(4, 4),
20 | num_channels=(32, 64)),
21 | stage3=dict(
22 | num_modules=4,
23 | num_branches=3,
24 | block='BASIC',
25 | num_blocks=(4, 4, 4),
26 | num_channels=(32, 64, 128)),
27 | stage4=dict(
28 | num_modules=3,
29 | num_branches=4,
30 | block='BASIC',
31 | num_blocks=(4, 4, 4, 4),
32 | num_channels=(32, 64, 128, 256)))),
33 | neck=dict(type='HRFPN', in_channels=[32, 64, 128, 256], out_channels=256),
34 | rpn_head=dict(
35 | type='RPNHead',
36 | in_channels=256,
37 | feat_channels=256,
38 | anchor_scales=[8],
39 | anchor_ratios=[0.5, 1.0, 2.0],
40 | anchor_strides=[4, 8, 16, 32, 64],
41 | target_means=[.0, .0, .0, .0],
42 | target_stds=[1.0, 1.0, 1.0, 1.0],
43 | loss_cls=dict(
44 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
45 | loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
46 | bbox_roi_extractor=dict(
47 | type='SingleRoIExtractor',
48 | roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
49 | out_channels=256,
50 | featmap_strides=[4, 8, 16, 32]),
51 | bbox_head=[
52 | dict(
53 | type='SharedFCBBoxHead',
54 | num_fcs=2,
55 | in_channels=256,
56 | fc_out_channels=1024,
57 | roi_feat_size=7,
58 | num_classes=81,
59 | target_means=[0., 0., 0., 0.],
60 | target_stds=[0.1, 0.1, 0.2, 0.2],
61 | reg_class_agnostic=True,
62 | loss_cls=dict(
63 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
64 | loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
65 | dict(
66 | type='SharedFCBBoxHead',
67 | num_fcs=2,
68 | in_channels=256,
69 | fc_out_channels=1024,
70 | roi_feat_size=7,
71 | num_classes=81,
72 | target_means=[0., 0., 0., 0.],
73 | target_stds=[0.05, 0.05, 0.1, 0.1],
74 | reg_class_agnostic=True,
75 | loss_cls=dict(
76 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
77 | loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
78 | dict(
79 | type='SharedFCBBoxHead',
80 | num_fcs=2,
81 | in_channels=256,
82 | fc_out_channels=1024,
83 | roi_feat_size=7,
84 | num_classes=81,
85 | target_means=[0., 0., 0., 0.],
86 | target_stds=[0.033, 0.033, 0.067, 0.067],
87 | reg_class_agnostic=True,
88 | loss_cls=dict(
89 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
90 | loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
91 | ],
92 | mask_roi_extractor=dict(
93 | type='SingleRoIExtractor',
94 | roi_layer=dict(type='RoIAlign', out_size=14, sample_num=2),
95 | out_channels=256,
96 | featmap_strides=[4, 8, 16, 32]),
97 | mask_head=dict(
98 | type='FCNMaskHead',
99 | num_convs=4,
100 | in_channels=256,
101 | conv_out_channels=256,
102 | num_classes=81,
103 | loss_mask=dict(
104 | type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)))
105 | # model training and testing settings
106 | train_cfg = dict(
107 | rpn=dict(
108 | assigner=dict(
109 | type='MaxIoUAssigner',
110 | pos_iou_thr=0.7,
111 | neg_iou_thr=0.3,
112 | min_pos_iou=0.3,
113 | ignore_iof_thr=-1),
114 | sampler=dict(
115 | type='RandomSampler',
116 | num=256,
117 | pos_fraction=0.5,
118 | neg_pos_ub=-1,
119 | add_gt_as_proposals=False),
120 | allowed_border=0,
121 | pos_weight=-1,
122 | debug=False),
123 | rpn_proposal=dict(
124 | nms_across_levels=False,
125 | nms_pre=2000,
126 | nms_post=2000,
127 | max_num=2000,
128 | nms_thr=0.7,
129 | min_bbox_size=0),
130 | rcnn=[
131 | dict(
132 | assigner=dict(
133 | type='MaxIoUAssigner',
134 | pos_iou_thr=0.5,
135 | neg_iou_thr=0.5,
136 | min_pos_iou=0.5,
137 | ignore_iof_thr=-1),
138 | sampler=dict(
139 | type='RandomSampler',
140 | num=512,
141 | pos_fraction=0.25,
142 | neg_pos_ub=-1,
143 | add_gt_as_proposals=True),
144 | mask_size=28,
145 | pos_weight=-1,
146 | debug=False),
147 | dict(
148 | assigner=dict(
149 | type='MaxIoUAssigner',
150 | pos_iou_thr=0.6,
151 | neg_iou_thr=0.6,
152 | min_pos_iou=0.6,
153 | ignore_iof_thr=-1),
154 | sampler=dict(
155 | type='RandomSampler',
156 | num=512,
157 | pos_fraction=0.25,
158 | neg_pos_ub=-1,
159 | add_gt_as_proposals=True),
160 | mask_size=28,
161 | pos_weight=-1,
162 | debug=False),
163 | dict(
164 | assigner=dict(
165 | type='MaxIoUAssigner',
166 | pos_iou_thr=0.7,
167 | neg_iou_thr=0.7,
168 | min_pos_iou=0.7,
169 | ignore_iof_thr=-1),
170 | sampler=dict(
171 | type='RandomSampler',
172 | num=512,
173 | pos_fraction=0.25,
174 | neg_pos_ub=-1,
175 | add_gt_as_proposals=True),
176 | mask_size=28,
177 | pos_weight=-1,
178 | debug=False)
179 | ],
180 | stage_loss_weights=[1, 0.5, 0.25])
181 | test_cfg = dict(
182 | rpn=dict(
183 | nms_across_levels=False,
184 | nms_pre=1000,
185 | nms_post=1000,
186 | max_num=1000,
187 | nms_thr=0.7,
188 | min_bbox_size=0),
189 | rcnn=dict(
190 | score_thr=0.05,
191 | nms=dict(type='nms', iou_thr=0.5),
192 | max_per_img=100,
193 | mask_thr_binary=0.5))
194 | # dataset settings
195 | dataset_type = 'CocoDataset'
196 | data_root = '/content/drive/My Drive/Mmdetection/'
197 | img_norm_cfg = dict(
198 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
199 | train_pipeline = [
200 | dict(type='LoadImageFromFile'),
201 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
202 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
203 | dict(type='RandomFlip', flip_ratio=0.5),
204 | dict(type='Normalize', **img_norm_cfg),
205 | dict(type='Pad', size_divisor=32),
206 | dict(type='DefaultFormatBundle'),
207 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
208 | ]
209 | test_pipeline = [
210 | dict(type='LoadImageFromFile'),
211 | dict(
212 | type='MultiScaleFlipAug',
213 | img_scale=(1333, 800),
214 | flip=False,
215 | transforms=[
216 | dict(type='Resize', keep_ratio=True),
217 | dict(type='RandomFlip'),
218 | dict(type='Normalize', **img_norm_cfg),
219 | dict(type='Pad', size_divisor=32),
220 | dict(type='ImageToTensor', keys=['img']),
221 | dict(type='Collect', keys=['img']),
222 | ])
223 | ]
224 | data = dict(
225 | imgs_per_gpu=2,
226 | workers_per_gpu=2,
227 | train=dict(
228 | type=dataset_type,
229 | ann_file='/content/drive/My Drive/chunk.json',
230 | img_prefix='/content/drive/My Drive/chunk_images/',
231 | pipeline=train_pipeline),
232 | val=dict(
233 | type=dataset_type,
234 | ann_file=data_root + 'VOC2007/test.json',
235 | img_prefix=data_root + 'VOC2007/Test/',
236 | pipeline=test_pipeline),
237 | test=dict(
238 | type=dataset_type,
239 | ann_file=data_root + 'VOC2007/test.json',
240 | img_prefix=data_root + 'VOC2007/Test/',
241 | pipeline=test_pipeline))
242 | # evaluation = dict(interval=1, metric=['bbox'])
243 | # optimizer
244 | optimizer = dict(type='SGD', lr=0.0012, momentum=0.9, weight_decay=0.0001)
245 | optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
246 | # learning policy
247 | lr_config = dict(
248 | policy='step',
249 | warmup='linear',
250 | warmup_iters=500,
251 | warmup_ratio=1.0 / 3,
252 | step=[16, 19])
253 | checkpoint_config = dict(interval=1, create_symlink=False)
254 | # yapf:disable
255 | log_config = dict(
256 | interval=50,
257 | hooks=[
258 | dict(type='TextLoggerHook'),
259 | # dict(type='TensorboardLoggerHook')
260 | ])
261 | # yapf:enable
262 | # runtime settings
263 | total_epochs = 36
264 | dist_params = dict(backend='nccl')
265 | log_level = 'INFO'
266 | work_dir = '/content/drive/My Drive/Mmdetection/new_chunk_cascade_mask_rcnn_hrnetv2p_w32_20e'
267 | load_from = None
268 | resume_from = '/content/drive/My Drive/Mmdetection/new_chunk_cascade_mask_rcnn_hrnetv2p_w32_20e/epoch_30.pth'
269 | workflow = [('train', 1)]
270 |
--------------------------------------------------------------------------------
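
How a config like this is consumed (a minimal sketch, assuming MMDetection v1.2 is installed and a trained checkpoint is at hand; the checkpoint and image paths below are placeholders, not the repository's):

```python
# Training uses MMDetection's standard entry point:
#   python tools/train.py CascadeTab/Config/cascade_mask_rcnn_hrnetv2p_w32_20e.py
from mmdet.apis import init_detector, inference_detector

config_file = 'CascadeTab/Config/cascade_mask_rcnn_hrnetv2p_w32_20e.py'
checkpoint_file = 'epoch_36.pth'        # placeholder trained checkpoint
model = init_detector(config_file, checkpoint_file, device='cuda:0')

bbox_results, mask_results = inference_detector(model, 'page.png')  # placeholder image
# bbox_results[k] is an (N, 5) array of [x1, y1, x2, y2, score] rows for class k;
# CascadeTab/main.py reads k=0 as bordered table, 1 as cell, 2 as borderless table.
```
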
/CascadeTab/Functions/blessFunc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import cv2
3 | from Functions.borderFunc import extract_table
4 | import lxml.etree as etree
5 | import os
6 |
7 | ## Input : roi of one cell
8 | ## Output : bounding box for the text in that cell
9 | def extractTextBless(img):
10 | return_arr = []
11 | h,w=img.shape[0:2]
12 | base_size=h+14,w+14,3
13 | img_np = np.zeros(base_size,dtype=np.uint8)
14 | cv2.rectangle(img_np,(0,0),(w+14,h+14),(255,255,255),30)
15 | img_np[7:h+7,7:w+7]=img
16 |
17 | gray = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
18 | # blur = cv2.GaussianBlur(gray,(5,5),0)
19 | ret, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
20 | rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 1))
21 | dilation = cv2.dilate(thresh1, rect_kernel, iterations = 2)
22 |
23 | contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_NONE)
24 | for cnt in (contours):
25 | if cv2.contourArea(cnt) < 20:
26 | continue
27 | x, y, w, h = cv2.boundingRect(cnt)
28 | if(h<6) or w<4 or h/img.shape[0]>0.95 or h>30:
29 | continue
30 | return_arr.append([x-7, y-7, w, h])
31 | return return_arr
32 |
33 | ## Input : Roi of Table, Original Image, Cells Detected
34 | ## Output : Returns an XML element which contains the bounding boxes of the text chunks
35 | def borderless(table, image, res_cells):
36 | cells = []
37 | x_lines = []
38 | y_lines = []
39 | # padding of the table
40 | print(table)
41 | table[0],table[1],table[2],table[3] = table[0]-15,table[1]-15,table[2]+15,table[3]+15
42 | print (table[0])
43 | print (table[1])
44 | print (table[2])
45 | print (table[3])
46 |
47 | # extracting the cells that might belong to that table
48 | for cell in res_cells:
49 | if cell[0]>table[0]-50 and cell[1]>table[1]-50 and cell[2]<table[2]+50 and cell[3]<table[3]+50:
50 | cells.append(cell)
68 | if (cell[1]>last-15) or (cell[3]<temp-15):
69 | if cell[3]>temp:
70 | temp = cell[3]
71 | else:
72 | last = cell[1]
73 | if last > temp:
74 | row.append((last+temp)//2)
75 | if prev is not None:
76 | if ((last+temp)//2) < prev + 10 or ((last+temp)//2) < prev - 10:
77 | row.pop()
78 | prev = (last+temp)//2
79 | temp = cell[3]
80 |
81 | row.append(table[3]+50)
82 | i=1
83 | rows = []
84 | for r in range(len(row)):
85 | rows.append([])
86 | # rows creates an empty matrix with the no. of entries equal to the no. of rows in the table
87 | final_rows = rows
88 | maxr = -111
89 | # print(len(row))
90 | #It stores all the cells to the specific rows
91 | for cell in cells:
92 | if cell[3]
110 | if (r[0]<=prevr[0]+5 and r[0]>=prevr[0]-5) or (r[2]<=prevr[2]+5 and r[2]>=prevr[2]-5):
111 | if r[4]
135 | if r[3]>lasty[n][1]:
136 | lasty[n][1] = r[3]
137 | # print("last y:",lasty)
138 |
139 | # taking the mid value of the prev y2 and the current y1 in a row so as to find the right y coordinate for the row line
140 | row = []
141 | row.append(table[1])
142 | prev = None
143 | pr = None
144 | for x in range(len(lasty)-1):
145 | if x==0 and prev==None:
146 | prev = lasty[x]
147 | else:
148 | if pr is not None:
149 | if abs(((lasty[x][0]+prev[1])//2)-pr)<=10:
150 | row.pop()
151 | row.append((lasty[x][0]+prev[1])//2)
152 | else:
153 | row.append((lasty[x][0]+prev[1])//2)
154 | else:
155 | row.append((lasty[x][0]+prev[1])//2)
156 | pr = (lasty[x][0]+prev[1])//2
157 | prev = lasty[x]
158 | row.append(table[3])
159 |
160 | #finding the max. no. of cells in all the rows which is equal to the number of columns
161 | maxr = 0
162 | for r2 in final_rows:
163 | print(r2)
164 | if len(r2)>maxr:
165 | maxr = len(r2)
166 |
167 |
168 | lastx = []
169 | # according to the x1 and x2 coordinates of each cell in a row, finding the x1 and x2 coordinates for each column
170 | for n in range(maxr):
171 | lastx.append([999999999,0])
172 |
173 | for r2 in final_rows:
174 | if len(r2)==maxr:
175 | for n,col in enumerate(r2):
176 | # print(col)
177 | if col[2]>lastx[n][1]:
178 | lastx[n][1] = col[2]
179 | if col[0]<lastx[n][0]:
180 | lastx[n][0] = col[0]
187 | r2[r][0]):
188 | r +=1
189 | if n != 0:
190 | if r2[r-1][0] > lastx[n-1][1]:
191 | if r2[r-1][0]
197 | if col[2]>lastx[n][1]:
198 | lastx[n][1] = col[2]
199 |
200 | # for each column, taking the mid value of the previous x2 and the current x1 so as to draw the column lines
201 | print(lastx)
202 | col = np.zeros(maxr+1)
203 | col[0] = table[0]
204 | prev = 0
205 | i = 1
206 | for x in range(len(lastx)):
207 | if x==0:
208 | prev = lastx[x]
209 | else:
210 | col[i] = (lastx[x][0]+prev[1])//2
211 | i+=1
212 | prev = lastx[x]
213 | col = col.astype(int)
214 | col[maxr] = table[2]
215 |
216 | _row_ = sorted(row, key=lambda x:x)
217 | _col_ = sorted(col, key=lambda x:x)
218 |
219 | #drawing lines acc. to the values(drawing the row and the column lines)
220 | for no,c in enumerate(_col_):
221 | x_lines.append([c,table[1],c,table[3]])
222 | cv2.line(im2,(c,table[1]),(c,table[3]),(255,0,0),1)
223 | for no,c in enumerate(_row_):
224 | y_lines.append([table[0],c,table[2],c])
225 | cv2.line(im2,(table[0],c),(table[2],c),(255,0,0),1)
226 |
227 | # cv2_imshow(im2)
228 | print("table:",table)
229 | # for r in row:
230 | # cv2.line(im2,(r,table[1]),(r,table[3]),(0,255,0),1)
231 | # for c in col:
232 | # cv2.line(im2,(c,table[1]),(c,table[3]),(0,255,0),1)
233 | final = extract_table(image[table[1]:table[3],table[0]:table[2]],0,(y_lines,x_lines))
234 |
235 | cellBoxes = []
236 | img4 = image.copy()
237 | for box in final:
238 | cellBox = extractTextBless(image[box[1]:box[3],box[0]:box[4]])
239 | for cell in cellBox:
240 | cellBoxes.append([box[0]+cell[0], box[1]+cell[1], cell[2], cell[3]])
241 | cv2.rectangle(img4, (box[0]+cell[0], box[1]+cell[1]), (box[0]+cell[0]+cell[2], box[1]+cell[1]+cell[3]), (255,0,0), 2)
242 |
243 | # cv2_imshow(img4)
244 |
245 | the_last_y = -1
246 | cellBoxes = sorted(cellBoxes,key=lambda x: x[1])
247 | cellBoxes2BeMerged = []
248 | cellBoxes2BeMerged.append([])
249 | rowCnt = 0
250 | for cell in cellBoxes:
251 | if(the_last_y == -1):
252 | the_last_y = cell[1]
253 | cellBoxes2BeMerged[rowCnt].append(cell)
254 | continue
255 | if(abs(cell[1]-the_last_y) < 8):
256 | cellBoxes2BeMerged[rowCnt].append(cell)
257 | else:
258 | the_last_y=cell[1]
259 | rowCnt+=1
260 | cellBoxes2BeMerged.append([])
261 | cellBoxes2BeMerged[rowCnt].append(cell)
262 |
263 | MergedBoxes = []
264 | for cellrow in cellBoxes2BeMerged:
265 | cellrow = sorted(cellrow,key=lambda x: x[0])
266 | cur_cell = -1
267 | for c,cell in enumerate(cellrow):
268 | if(cur_cell == -1):
269 | cur_cell = cell
270 | continue
271 | if(len(cellrow)==1):
272 | MergedBoxes.append(cell)
273 | break
274 | if(abs((cur_cell[0]+cur_cell[2])-cell[0]) < 10):
275 | cur_cell[2] = cur_cell[2] + cell[2] + (cell[0]- (cur_cell[0]+cur_cell[2]))
276 | if(cur_cell[3]<cell[3]):
277 | cur_cell[3] = cell[3]
304 | if(mbox[0] >= box[0] and mbox[1] >= box[1] and mbox[2] <= box[4] and mbox[3] <= box[3]):
305 | if(len(tcurcell) == 0):
306 | tcurcell = mbox
307 | else:
308 | if(mbox[0] < tcurcell[0]):
309 | tcurcell[0] = mbox[0]
310 | if(mbox[1] < tcurcell[1]):
311 | tcurcell[1] = mbox[1]
312 | if(mbox[2] > tcurcell[2]):
313 | tcurcell[2] = mbox[2]
314 | if(mbox[3] > tcurcell[3]):
315 | tcurcell[3] = mbox[3]
316 |
317 | for i,frow in enumerate(final_rows):
318 | for j,fbox in enumerate(frow):
319 | if(fbox[0] >= box[0] and fbox[0] <= box[4] and fbox[1] >= box[1] and fbox[1] <= box[3]):
320 | mcurcell = fbox
321 | final_rows[i].pop(j)
322 | break
323 |
324 | if(abs(ycnt-box[1])>10):
325 | rcnt+=1
326 | TextChunks.append([])
327 | ycnt = box[1]
328 |
329 | if(len(tcurcell)==0):
330 | if(len(mcurcell)==0):
331 | continue
332 | else:
333 | TextChunks[rcnt].append(mcurcell)
334 | else:
335 | if(len(mcurcell)==0):
336 | TextChunks[rcnt].append(tcurcell)
337 | else:
338 | if(abs(mcurcell[0] - tcurcell[0])<=20 and abs(mcurcell[1] - tcurcell[1])<=20 and abs(mcurcell[2] - tcurcell[2])<=20 and abs(mcurcell[3] - tcurcell[3])<=20):
339 | TextChunks[rcnt].append(tcurcell)
340 | elif((abs(mcurcell[0] - tcurcell[0])<=20 and abs(mcurcell[2] - tcurcell[2])<=20) or (abs(mcurcell[1] - tcurcell[1])<=20 or abs(mcurcell[3] - tcurcell[3])<=20)):
341 | TextChunks[rcnt].append(mcurcell)
342 | else:
343 | TextChunks[rcnt].append(tcurcell)
344 |
345 | colors = [(255,0,0),(0,255,0),(0,0,255),(125,125,0),(0,255,255)]
346 | for no,r in enumerate(TextChunks):
347 | for tbox in r:
348 | cv2.rectangle(im2, (tbox[0], tbox[1]), (tbox[2], tbox[3]), colors[no%len(colors)], 1)
349 | # print(tbox)
350 | # cv2_imshow("text chunks", im2)
351 | # cv2.waitKey(0)
352 |
353 | def rowstart(val):
354 | r = 0
355 | while(val > _row_[r]):
356 | r += 1
357 | if r-1 == -1:
358 | return r
359 | else:
360 | return r-1
361 |
362 | def rowend(val):
363 | r = 0
364 | while(val > _row_[r]):
365 | r += 1
366 | if r-1 == -1:
367 | return r
368 | else:
369 | return r-1
370 |
371 | def colstart(val):
372 | r = 0
373 | while(r < len(_col_) and val > _col_[r]):
374 | r += 1
375 | if r-1 == -1:
376 | return r
377 | else:
378 | return r-1
379 |
380 | def colend(val):
381 | r = 0
382 | while(r < len(_col_) and val > _col_[r]):
383 | r += 1
384 | if r-1 == -1:
385 | return r
386 | else:
387 | return r-1
388 |
389 | tableXML = etree.Element("table")
390 | Tcoords = etree.Element("Coords", points=str(table[0])+","+str(table[1])+" "+str(table[0])+","+str(table[3])+" "+str(table[2])+","+str(table[3])+" "+str(table[2])+","+str(table[1]))
391 | tableXML.append(Tcoords)
392 | for final in TextChunks:
393 | for box in final:
394 | cell = etree.Element("cell")
395 | end_col,end_row,start_col,start_row = colend(box[2]),rowend(box[3]),colstart(box[0]),rowstart(box[1])
396 | cell.set("end-col",str(end_col))
397 | cell.set("end-row",str(end_row))
398 | cell.set("start-col",str(start_col))
399 | cell.set("start-row",str(start_row))
400 |
401 | # print(cellBox)
402 | one = str(box[0])+","+str(box[1])
403 | two = str(box[0])+","+str(box[3])
404 | three = str(box[2])+","+str(box[3])
405 | four = str(box[2])+","+str(box[1])
406 | # print(one)
407 | coords = etree.Element("Coords", points=one+" "+two+" "+three+" "+four)
408 |
409 | cell.append(coords)
410 | tableXML.append(cell)
411 |
412 | return tableXML
413 |
--------------------------------------------------------------------------------
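
For reference, the element borderless() returns nests a table-level Coords plus one cell per text chunk; a hand-built example of that shape (coordinates invented purely for illustration):

```python
import lxml.etree as etree

table = etree.Element("table")
table.append(etree.Element("Coords", points="10,20 10,200 500,200 500,20"))
cell = etree.Element("cell")
cell.set("start-row", "0")
cell.set("start-col", "0")
cell.set("end-row", "0")
cell.set("end-col", "1")
cell.append(etree.Element("Coords", points="12,22 12,40 80,40 80,22"))
table.append(cell)
print(etree.tostring(table, pretty_print=True).decode())
```
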
/CascadeTab/Functions/borderFunc.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | from Functions.line_detection import line_detection
3 | import os
4 |
5 | ################## Functions required for Border table Recognition #################
6 |
7 | ## Return the intersection of lines only if intersection is present ##
8 | # Input : x1, y1, x2, y2, x3, y3, x4, y4 (1: vertical, 2: horizontal)
9 | # Output : (x,y) Intersection point
10 | def line_intersection(x1, y1, x2, y2, x3, y3, x4, y4):
11 | # print(x1, y1, x2, y2)
12 | # print(x3, y3, x4, y4)
13 |
14 | if((x1>= x3-5 or x1>= x3+5) and (x1 <= x4+5 or x1 <= x4-5) and (y3+8>=min(y1,y2) or y3-5>=min(y1,y2)) and y3<=max(y1,y2)+5):
15 | return x1,y3
16 |
17 |
18 | ## main extraction function ##
19 | # Input : Image, Decision parameter (1/0), lines for borderless (only if decision parameter is 0)
20 | # Output : Array of cells
21 | def extract_table(table_body,__line__,lines=None):
22 | # Deciding variable
23 | #print (table_body)
24 | print (__line__)
25 | if(__line__ == 1 ):
26 | # Check if table image is bordered or borderless
27 | temp_lines_hor, temp_lines_ver = line_detection(table_body)
28 | print ("temp_lines_hor",temp_lines_hor)
29 | print ("temp_lines_ver",temp_lines_ver)
30 | else:
31 | temp_lines_hor, temp_lines_ver = lines
32 |
33 | if (temp_lines_hor is None) or (temp_lines_ver is None):
34 | print("Either Horizontal Or Vertical Lines Not Detected")
35 | return None
36 |
37 | table = table_body.copy()
38 | x = 0
39 | y = 0
40 | k = 0
41 | points = []
42 | print("[Table status] : Processing table with lines")
43 | # Remove same lines detected closer
44 | for x1, y1, x2, y2 in temp_lines_ver:
45 | point = []
46 | for x3, y3, x4, y4 in temp_lines_hor:
47 | try:
48 | k += 1
49 | x, y = line_intersection(x1, y1, x2, y2, x3, y3, x4, y4)
50 | point.append([x, y])
51 | except:
52 | continue
53 | points.append(point)
54 |
55 | for point in points:
56 | for x,y in point:
57 | cv2.line(table,(x,y),(x,y),(0,0,255),8)
58 |
59 | '''directory = '/content/drive/My Drive/Optum/Dataset/images'
60 | os.chdir(directory)
61 |
62 | filename = "intersection.jpg"
63 | cv2.imwrite(filename, table)'''
64 | # cv2_imshow("intersection",table)
65 | # cv2.waitKey(0)
66 |
67 | # boxno = -1
68 | box = []
69 | flag = 1
70 | lastCache = []
71 | ## creating bounding boxes of cells from the points detected
72 | ## This is still under work and might fail on some images
73 | for i, row in enumerate(points):
74 | limitj = len(row)
75 | currentVala = []
76 | for j, col in enumerate(row):
77 |
78 | if (j == limitj-1):
79 | break
80 | if (i == 0):
81 | nextcol = row[j+1]
82 | lastCache.append([col[0], col[1], nextcol[0], nextcol[1],9999,9999,9999,9999])
83 | else:
84 | nextcol = row[j+1]
85 | currentVala.append([col[0], col[1], nextcol[0], nextcol[1], 9999, 9999, 9999, 9999])
86 | # Matching
87 | flag = 1
88 | index = []
89 | for k, last in enumerate(lastCache):
90 |
91 | if (col[1] == last[1]) and lastCache[k][4] == 9999:
92 | lastCache[k][4] = col[0]
93 | lastCache[k][5] = col[1]
94 | if lastCache[k][4] != 9999 and lastCache[k][6] != 9999:
95 | box.append(lastCache[k])
96 | index.append(k)
97 | flag = 1
98 |
99 | if (nextcol[1] == last[3]) and lastCache[k][6] == 9999:
100 | lastCache[k][6] = nextcol[0]
101 | lastCache[k][7] = nextcol[1]
102 | if lastCache[k][4] != 9999 and lastCache[k][6] != 9999:
103 | box.append(lastCache[k])
104 | index.append(k)
105 | flag = 1
106 |
107 | if len(lastCache) !=0:
108 | if lastCache[k][4] == 9999 or lastCache[k][6] == 9999:
109 | flag = 0
110 | # print(index)
111 | for k in index:
112 | lastCache.pop(k)
113 | # transferring
114 | if flag == 0:
115 | for last in lastCache:
116 | if last[4] == 9999 or last[6] == 9999:
117 | currentVala.append(last)
118 |
119 | if(i!=0):
120 | lastCache = currentVala
121 |
122 | ## Visualizing the cells ##
123 | count = 1
124 | for i in box:
125 | cv2.rectangle(table_body, (i[0], i[1]), (i[6], i[7]), (int(i[7]%255),0,int(i[0]%255)), 2)
126 | count+=1
127 |
128 | '''directory = '/content/drive/My Drive/Optum/Dataset/images'
129 | os.chdir(directory)
130 |
131 | filename = "cells.jpg"
132 | cv2.imwrite(filename, table_body)'''
133 | # cv2_imshow("cells",table_body)
134 | # cv2.waitKey(0)
135 | ############################
136 | return box
137 | # extract_table(cv2.imread("E:\\KSK\\KSK ML\\KSK PAPERS\\TabXNet\\For Git\\images\\table.PNG"),1,lines=None)
138 |
139 |
140 | def findX(X,x):
141 | return X.index(x)
142 | def findY(Y,y):
143 | return Y.index(y)
144 |
145 | def span(box,X,Y):
146 | start_col = findX(X,box[0]) ## x1
147 | end_col = findX(X,box[4])-1 ## x3
148 | start_row = findY(Y,box[1]) ## y1
149 | end_row = findY(Y,box[3])-1 ## y2
150 | # print(end_col,end_row,start_col,start_row)
151 | return end_col,end_row,start_col,start_row
152 |
153 |
154 |
155 | def extractText(img):
156 | gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
157 | _, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
158 | # cv2_imshow(thresh1)
159 | rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
160 | dilation = cv2.dilate(thresh1, rect_kernel, iterations = 2)
161 | contours, _ = cv2.findContours(dilation, cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_NONE)
162 | im2 = img.copy()
163 | mx,my,mw,mh = float('Inf'),float('Inf'),-1,-1
164 | for cnt in contours:
165 | x, y, w, h = cv2.boundingRect(cnt)
166 | # print(im2.shape)
167 | if x<2 or y<2 or (x+w>=im2.shape[1]-1 and y+h>=im2.shape[0]-1) or w>=im2.shape[1]-1 or h>=im2.shape[0]-1:
168 | continue
169 | if x<mx:
170 | mx = x
171 | if y<my:
172 | my = y
173 | if x+w>mw:
174 | mw = x+w
175 | if y+h>mh:
176 | mh = y+h
177 | # print(x, y, w, h)
178 |
179 | if mx !=float('Inf') and my !=float('Inf'):
180 | # Drawing a rectangle on copied image
181 | # rect = cv2.rectangle(im2, (mx+1, my), (mw-2, mh-2), (0, 255, 0), 1)
182 | # cv2_imshow(im2)
183 | return mx,my,mw,mh
184 | else :
185 | return None
--------------------------------------------------------------------------------
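
A minimal driver for the bordered-table path, matching the commented-out call at the bottom of the file ('table_crop.png' is a placeholder for a cropped table image):

```python
import cv2
from Functions.borderFunc import extract_table

# __line__=1 makes extract_table() run line_detection() itself instead of
# taking precomputed (horizontal, vertical) lines.
table_img = cv2.imread('table_crop.png')
boxes = extract_table(table_img, 1, lines=None)
if boxes:
    for b in boxes:
        # (b[0], b[1]) is a cell's top-left corner and (b[6], b[7]) its
        # bottom-right, as in the visualization loop above.
        print(b[0], b[1], b[6], b[7])
```
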
/CascadeTab/Functions/line_detection.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | import os
4 | import random
5 |
6 | # Input : Image
7 | # Output : hor,ver
8 | def line_detection(image):
9 |
10 | gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
11 | bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, 1)
12 | bw = cv2.bitwise_not(bw)
13 |
14 | '''directory = '/content/drive/My Drive/Optum/Dataset/images'
15 | os.chdir(directory)
16 | n = random.random()
17 | print ("Random number generated is",n)
18 |
19 | filename = str(n) + "after_threshold.jpg"
20 | cv2.imwrite(filename, bw)'''
21 | ## To visualize image after thresholding ##
22 | # cv2_imshow("bw",bw)
23 | # cv2.waitKey(0)
24 | ###########################################
25 | horizontal = bw.copy()
26 | vertical = bw.copy()
27 | img = image.copy()
28 | # [horizontal lines]
29 | # Create structure element for extracting horizontal lines through morphology operations
30 | horizontalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 1))
31 |
32 | # Apply morphology operations
33 | horizontal = cv2.erode(horizontal, horizontalStructure)
34 | horizontal = cv2.dilate(horizontal, horizontalStructure)
35 |
36 | horizontal = cv2.dilate(horizontal, (1,1), iterations=5)
37 | horizontal = cv2.erode(horizontal, (1,1), iterations=5)
38 |
39 | '''directory = '/content/drive/My Drive/Optum/Dataset/images'
40 | os.chdir(directory)
41 | n = random.random()
42 | print ("Random number generated is",n)
43 |
44 | filename = str(n) + "hor_lines_highlighted.jpg"
45 | cv2.imwrite(filename, horizontal)'''
46 |
47 | ## Uncomment to visualize highlighted Horizontal lines
48 | # cv2_imshow("horizontal",horizontal)
49 | # cv2.waitKey(0)
50 |
51 | # HoughlinesP function to detect horizontal lines
52 | hor_lines = cv2.HoughLinesP(horizontal,rho=1,theta=np.pi/180,threshold=100,minLineLength=30,maxLineGap=3)
53 | if hor_lines is None:
54 | return None,None
55 | temp_line = []
56 | for line in hor_lines:
57 | for x1,y1,x2,y2 in line:
58 | temp_line.append([x1,y1-5,x2,y2-5])
59 |
60 | # Sorting the list of detected lines by Y1
61 | hor_lines = sorted(temp_line,key=lambda x: x[1])
62 | print ("hor_lines",hor_lines)
63 |
64 |
65 | ## Uncomment this part to visualize the lines detected on the image ##
66 | print(len(hor_lines))
67 | for x1, y1, x2, y2 in hor_lines:
68 | cv2.line(image, (x1,y1), (x2,y2), (0, 255, 0), 1)
69 |
70 |
71 | '''directory = '/content/drive/My Drive/Optum/Dataset/images'
72 | os.chdir(directory)
73 | n = random.random()
74 | print ("Random number generated is",n)
75 |
76 | filename = str(n) + "hor_lines_detected.jpg"
77 | cv2.imwrite(filename, image)'''
78 | # print(image.shape)
79 | # cv2_imshow("image",image)
80 | # cv2.waitKey(0)
81 | ####################################################################
82 |
83 | ## Selection of best lines from all the horizontal lines detected ##
84 | lasty1 = -111111
85 | lines_x1 = []
86 | lines_x2 = []
87 | hor = []
88 | i=0
89 | for x1,y1,x2,y2 in hor_lines:
90 | if y1 >= lasty1 and y1 <= lasty1 + 10:
91 | lines_x1.append(x1)
92 | lines_x2.append(x2)
93 | else:
94 | if (i != 0 and len(lines_x1) != 0):
95 | hor.append([min(lines_x1),lasty1,max(lines_x2),lasty1])
96 | lasty1 = y1
97 | lines_x1 = []
98 | lines_x2 = []
99 | lines_x1.append(x1)
100 | lines_x2.append(x2)
101 | i+=1
102 | hor.append([min(lines_x1),lasty1,max(lines_x2),lasty1])
103 | #####################################################################
104 |
105 |
106 | # [vertical lines]
107 | # Create structure element for extracting vertical lines through morphology operations
108 | verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 15))
109 |
110 | # Apply morphology operations
111 | vertical = cv2.erode(vertical, verticalStructure)
112 | vertical = cv2.dilate(vertical, verticalStructure)
113 |
114 | vertical = cv2.dilate(vertical, (1,1), iterations=8)
115 | vertical = cv2.erode(vertical, (1,1), iterations=7)
116 |
117 | ######## Preprocessing Vertical Lines ###############
118 | # cv2_imshow("vertical",vertical)
119 | # cv2.waitKey(0)
120 | #####################################################
121 |
122 | # HoughlinesP function to detect vertical lines
123 | # ver_lines = cv2.HoughLinesP(vertical,rho=1,theta=np.pi/180,threshold=20,minLineLength=20,maxLineGap=2)
124 | ver_lines = cv2.HoughLinesP(vertical, 1, np.pi/180, 20, np.array([]), 20, 2)
125 | if ver_lines is None:
126 | return None,None
127 | temp_line = []
128 | for line in ver_lines:
129 | for x1,y1,x2,y2 in line:
130 | temp_line.append([x1,y1,x2,y2])
131 |
132 | # Sorting the list of detected lines by X1
133 | ver_lines = sorted(temp_line,key=lambda x: x[0])
134 | print ("ver_lines",ver_lines)
135 | ## Uncomment this part to visualize the lines detected on the image ##
136 | print(len(ver_lines))
137 | for x1, y1, x2, y2 in ver_lines:
138 | cv2.line(image, (x1,y1-5), (x2,y2-5), (0, 255, 0), 1)
139 |
140 |
141 | '''directory = '/content/drive/My Drive/Optum/Dataset/images'
142 | os.chdir(directory)
143 | n = random.random()
144 | print ("Random number generated is",n)
145 |
146 | filename = str(n) + "ver_lines_detected.jpg"
147 | cv2.imwrite(filename, image)'''
148 |
149 | # print(image.shape)
150 | # cv2_imshow("image",image)
151 | # cv2.waitKey(0)
152 | ####################################################################
153 |
154 | ## Selection of best lines from all the vertical lines detected ##
155 | lastx1 = -111111
156 | lines_y1 = []
157 | lines_y2 = []
158 | ver = []
159 | count = 0
160 | lasty1 = -11111
161 | lasty2 = -11111
162 | for x1,y1,x2,y2 in ver_lines:
163 | if x1 >= lastx1 and x1 <= lastx1 + 15 and not (((min(y1,y2)
--------------------------------------------------------------------------------
/CascadeTab/border.py:
--------------------------------------------------------------------------------
box[0]>table[0]-5 and box[1]>table[1]-5 and box[2]
--------------------------------------------------------------------------------
/CascadeTab/main.py:
--------------------------------------------------------------------------------
43 | ## for border
44 | for r in result[0][0]:
45 | print ("1.",r[4])
46 | if r[4]>.85:
47 | res_border.append(r[:4].astype(int))
48 |
49 | ## for cells
50 | for r in result[0][1]:
51 | print ("2.",r[4])
52 | if r[4]>.85:
53 | r[4] = r[4]*100
54 | res_cell.append(r.astype(int))
55 |
56 | ## for borderless
57 | for r in result[0][2]:
58 | print ("3.",r[4])
59 | if r[4]>.85:
60 | res_bless.append(r[:4].astype(int))
61 |
62 | print ("res_border",res_border)
63 | print ("res_cell",res_cell)
64 | print ("res_bless",res_bless)
65 |
66 | ## if border tables detected
67 | if len(res_border) != 0:
68 | ## call border script for each table in image
69 | for res in res_border:
70 | try:
71 | root.append(border(res,cv2.imread(i)))
72 | except:
73 | pass
74 | if len(res_bless) != 0:
75 | if len(res_cell) != 0:
76 | for no,res in enumerate(res_bless):
77 | root.append(borderless(res,cv2.imread(i),res_cell))
78 |
79 | myfile = open(xmlPath+i.split('/')[-1][:-3]+'xml', "w")
80 | myfile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
81 | myfile.write(etree.tostring(root, pretty_print=True,encoding="unicode"))
82 | myfile.close()
--------------------------------------------------------------------------------
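
The surviving half of CascadeTab/main.py starts inside its per-image loop; a hedged sketch of the setup that loop implies (model init, image glob, result unpacking), with placeholder paths and names:

```python
import glob
import cv2
import lxml.etree as etree
from mmdet.apis import init_detector, inference_detector
from border import border               # bordered-table routine from border.py
from Functions.blessFunc import borderless

config_fname = 'Config/cascade_mask_rcnn_hrnetv2p_w32_20e.py'  # placeholder
checkpoint = 'epoch_36.pth'                                    # placeholder
xmlPath = 'output/'                                            # placeholder
model = init_detector(config_fname, checkpoint)

for i in glob.glob('images/*.png'):                            # placeholder glob
    root = etree.Element("document")    # hypothetical root element name
    result = inference_detector(model, i)
    res_border, res_cell, res_bless = [], [], []
    # ...the surviving code filters result[0][0..2] into these lists, calls
    # border()/borderless() per table, and writes root out as XML.
```
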
/Data Preparation/Dilation.py:
--------------------------------------------------------------------------------
1 | # Note: Image name will be stored as "Dilation_OriginalName" to avoid conflict
2 |
3 | import cv2
4 | import glob
5 | import numpy as np
6 |
7 | # DEFINE THE PATH
8 | print ("Entered")
9 | PATH_TO_DEST = "/home/prakhar/try/Dilated Image/"
10 | PATH_TO_ORIGINAL_IMAGES = "/home/prakhar/PublayNet/val/publaynet/val/"
11 |
12 | # if the source directory has files other than images, use the image extension
13 | # to get the files (for example *.png)
14 | img_files = glob.glob(PATH_TO_ORIGINAL_IMAGES+"*.*")
15 | total = len(img_files)
16 |
17 | # 2x2 static kernel
18 | kernel = np.ones((2,2),np.uint8)
19 |
20 | for count,i in enumerate(img_files):
21 | print (count)
22 | print (i)
23 | image_name = i.split("/")[-1]
24 | print("Progress : ",count,"/",total)
25 | img = cv2.imread(i,0)
26 | _, mask = cv2.threshold(img,220,255,cv2.THRESH_BINARY_INV)
27 | dst = cv2.dilate(mask,kernel,iterations = 1)
28 | dst = cv2.bitwise_not(dst)
29 | cv2.imwrite(PATH_TO_DEST+"/Dilation_"+image_name,dst)
30 |
--------------------------------------------------------------------------------
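
A tiny synthetic check of the same threshold-dilate-invert chain: one dark "ink" pixel on a white background grows into a 2x2 block, i.e. strokes get thicker:

```python
import cv2
import numpy as np

img = np.full((5, 5), 255, np.uint8)
img[2, 2] = 0                                    # one dark pixel
_, mask = cv2.threshold(img, 220, 255, cv2.THRESH_BINARY_INV)
thick = cv2.bitwise_not(cv2.dilate(mask, np.ones((2, 2), np.uint8)))
print(int((thick == 0).sum()))                   # 4 dark pixels after dilation
```
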
/Data Preparation/Images/3img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Prakhar-97/Table-detection-and-Document-layout-analysis/bfcd189ee9edb603b734cd07d965a7400b85f820/Data Preparation/Images/3img.png
--------------------------------------------------------------------------------
/Data Preparation/Smudge.py:
--------------------------------------------------------------------------------
1 | # Note: Image name will be stored as "Smudge_OriginalName" to avoid conflict
2 | import cv2
3 | import numpy as np
4 | import glob
5 |
6 | def basicTransform(img):
7 | _, mask = cv2.threshold(img,220,255,cv2.THRESH_BINARY_INV)
8 | img = cv2.bitwise_not(mask)
9 | return img
10 |
11 | PATH_TO_DEST = "/home/prakhar/try/Smudged Image/"
12 | PATH_TO_ORIGINAL_IMAGES = "/home/prakhar/PublayNet/val/publaynet/val/"
13 |
14 | img_files = glob.glob(PATH_TO_ORIGINAL_IMAGES+"*.*")
15 |
16 | total = len(img_files)
17 | for count,i in enumerate(img_files):
18 | image_name = i.split("/")[-1]
19 | print("Progress : ",count,"/",total)
20 | img = cv2.imread(i)
21 |
22 | # Split the 3 channels into Blue,Green and Red
23 | b,g,r = cv2.split(img)
24 |
25 | # Apply Basic Transformation
26 | b = basicTransform(b)
27 | r = basicTransform(r)
28 | g = basicTransform(g)
29 |
30 | # Perform the distance transform algorithm
31 | b = cv2.distanceTransform(b, cv2.DIST_L2, 5) # EUCLIDEAN
32 | g = cv2.distanceTransform(g, cv2.DIST_L1, 5) # MANHATTAN
33 | r = cv2.distanceTransform(r, cv2.DIST_C, 5) # CHEBYSHEV (MAX)
34 |
35 | # Normalize
36 | r = cv2.normalize(r, r, 0, 1.0, cv2.NORM_MINMAX)
37 | g = cv2.normalize(g, g, 0, 1.0, cv2.NORM_MINMAX)
38 | b = cv2.normalize(b, b, 0, 1.0, cv2.NORM_MINMAX)
39 |
40 | # Merge the channels
41 | dist = cv2.merge((b,g,r))
42 | dist = cv2.normalize(dist,dist, 0, 4.0, cv2.NORM_MINMAX)
43 | dist = cv2.cvtColor(dist, cv2.COLOR_BGR2GRAY)
44 |
45 | # In order to save as jpg or png, we need to handle the data
46 | # format of the image
47 | data = dist.astype(np.float64) / 4.0
48 | data = 1800 * data # Now scale by 1800
49 | dist = data.astype(np.uint16)
50 |
51 | # Save to destination
52 | cv2.imwrite(PATH_TO_DEST+"/Smudge_"+image_name,dist)
53 |
--------------------------------------------------------------------------------
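
What distanceTransform contributes to the smudge: each non-zero pixel is replaced by its distance to the nearest zero pixel, so solid strokes become soft ramps. A one-row illustration:

```python
import cv2
import numpy as np

band = np.zeros((1, 7), np.uint8)
band[0, 2:5] = 255                       # a 3-pixel-wide "stroke"
d = cv2.distanceTransform(band, cv2.DIST_L1, 3)
print(d)                                 # [[0. 0. 1. 2. 1. 0. 0.]]
```
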
/Document layout analysis/Config file/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py:
--------------------------------------------------------------------------------
1 | model = dict(
2 | type='CascadeRCNN',
3 | pretrained='open-mmlab://msra/hrnetv2_w32',
4 | backbone=dict(
5 | type='HRNet',
6 | extra=dict(
7 | stage1=dict(
8 | num_modules=1,
9 | num_branches=1,
10 | block='BOTTLENECK',
11 | num_blocks=(4, ),
12 | num_channels=(64, )),
13 | stage2=dict(
14 | num_modules=1,
15 | num_branches=2,
16 | block='BASIC',
17 | num_blocks=(4, 4),
18 | num_channels=(32, 64)),
19 | stage3=dict(
20 | num_modules=4,
21 | num_branches=3,
22 | block='BASIC',
23 | num_blocks=(4, 4, 4),
24 | num_channels=(32, 64, 128)),
25 | stage4=dict(
26 | num_modules=3,
27 | num_branches=4,
28 | block='BASIC',
29 | num_blocks=(4, 4, 4, 4),
30 | num_channels=(32, 64, 128, 256)))),
31 | neck=dict(type='HRFPN', in_channels=[32, 64, 128, 256], out_channels=256),
32 | rpn_head=dict(
33 | type='RPNHead',
34 | in_channels=256,
35 | feat_channels=256,
36 | anchor_generator=dict(
37 | type='AnchorGenerator',
38 | scales=[8],
39 | ratios=[0.5, 1.0, 2.0],
40 | strides=[4, 8, 16, 32, 64]),
41 | bbox_coder=dict(
42 | type='DeltaXYWHBBoxCoder',
43 | target_means=[0.0, 0.0, 0.0, 0.0],
44 | target_stds=[1.0, 1.0, 1.0, 1.0]),
45 | loss_cls=dict(
46 | type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
47 | loss_bbox=dict(
48 | type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=1.0)),
49 | roi_head=dict(
50 | type='CascadeRoIHead',
51 | num_stages=3,
52 | stage_loss_weights=[1, 0.5, 0.25],
53 | bbox_roi_extractor=dict(
54 | type='SingleRoIExtractor',
55 | roi_layer=dict(type='RoIAlign', out_size=7, sample_num=0),
56 | out_channels=256,
57 | featmap_strides=[4, 8, 16, 32]),
58 | bbox_head=[
59 | dict(
60 | type='Shared2FCBBoxHead',
61 | in_channels=256,
62 | fc_out_channels=1024,
63 | roi_feat_size=7,
64 | num_classes=80,
65 | bbox_coder=dict(
66 | type='DeltaXYWHBBoxCoder',
67 | target_means=[0.0, 0.0, 0.0, 0.0],
68 | target_stds=[0.1, 0.1, 0.2, 0.2]),
69 | reg_class_agnostic=True,
70 | loss_cls=dict(
71 | type='CrossEntropyLoss',
72 | use_sigmoid=False,
73 | loss_weight=1.0),
74 | loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
75 | loss_weight=1.0)),
76 | dict(
77 | type='Shared2FCBBoxHead',
78 | in_channels=256,
79 | fc_out_channels=1024,
80 | roi_feat_size=7,
81 | num_classes=80,
82 | bbox_coder=dict(
83 | type='DeltaXYWHBBoxCoder',
84 | target_means=[0.0, 0.0, 0.0, 0.0],
85 | target_stds=[0.05, 0.05, 0.1, 0.1]),
86 | reg_class_agnostic=True,
87 | loss_cls=dict(
88 | type='CrossEntropyLoss',
89 | use_sigmoid=False,
90 | loss_weight=1.0),
91 | loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
92 | loss_weight=1.0)),
93 | dict(
94 | type='Shared2FCBBoxHead',
95 | in_channels=256,
96 | fc_out_channels=1024,
97 | roi_feat_size=7,
98 | num_classes=80,
99 | bbox_coder=dict(
100 | type='DeltaXYWHBBoxCoder',
101 | target_means=[0.0, 0.0, 0.0, 0.0],
102 | target_stds=[0.033, 0.033, 0.067, 0.067]),
103 | reg_class_agnostic=True,
104 | loss_cls=dict(
105 | type='CrossEntropyLoss',
106 | use_sigmoid=False,
107 | loss_weight=1.0),
108 | loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
109 | ],
110 | mask_roi_extractor=dict(
111 | type='SingleRoIExtractor',
112 | roi_layer=dict(type='RoIAlign', out_size=14, sample_num=0),
113 | out_channels=256,
114 | featmap_strides=[4, 8, 16, 32]),
115 | mask_head=dict(
116 | type='FCNMaskHead',
117 | num_convs=4,
118 | in_channels=256,
119 | conv_out_channels=256,
120 | num_classes=80,
121 | loss_mask=dict(
122 | type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))))
123 | train_cfg = dict(
124 | rpn=dict(
125 | assigner=dict(
126 | type='MaxIoUAssigner',
127 | pos_iou_thr=0.7,
128 | neg_iou_thr=0.3,
129 | min_pos_iou=0.3,
130 | match_low_quality=True,
131 | ignore_iof_thr=-1),
132 | sampler=dict(
133 | type='RandomSampler',
134 | num=256,
135 | pos_fraction=0.5,
136 | neg_pos_ub=-1,
137 | add_gt_as_proposals=False),
138 | allowed_border=0,
139 | pos_weight=-1,
140 | debug=False),
141 | rpn_proposal=dict(
142 | nms_across_levels=False,
143 | nms_pre=2000,
144 | nms_post=2000,
145 | max_num=2000,
146 | nms_thr=0.7,
147 | min_bbox_size=0),
148 | rcnn=[
149 | dict(
150 | assigner=dict(
151 | type='MaxIoUAssigner',
152 | pos_iou_thr=0.5,
153 | neg_iou_thr=0.5,
154 | min_pos_iou=0.5,
155 | match_low_quality=False,
156 | ignore_iof_thr=-1),
157 | sampler=dict(
158 | type='RandomSampler',
159 | num=512,
160 | pos_fraction=0.25,
161 | neg_pos_ub=-1,
162 | add_gt_as_proposals=True),
163 | mask_size=28,
164 | pos_weight=-1,
165 | debug=False),
166 | dict(
167 | assigner=dict(
168 | type='MaxIoUAssigner',
169 | pos_iou_thr=0.6,
170 | neg_iou_thr=0.6,
171 | min_pos_iou=0.6,
172 | match_low_quality=False,
173 | ignore_iof_thr=-1),
174 | sampler=dict(
175 | type='RandomSampler',
176 | num=512,
177 | pos_fraction=0.25,
178 | neg_pos_ub=-1,
179 | add_gt_as_proposals=True),
180 | mask_size=28,
181 | pos_weight=-1,
182 | debug=False),
183 | dict(
184 | assigner=dict(
185 | type='MaxIoUAssigner',
186 | pos_iou_thr=0.7,
187 | neg_iou_thr=0.7,
188 | min_pos_iou=0.7,
189 | match_low_quality=False,
190 | ignore_iof_thr=-1),
191 | sampler=dict(
192 | type='RandomSampler',
193 | num=512,
194 | pos_fraction=0.25,
195 | neg_pos_ub=-1,
196 | add_gt_as_proposals=True),
197 | mask_size=28,
198 | pos_weight=-1,
199 | debug=False)
200 | ])
201 | test_cfg = dict(
202 | rpn=dict(
203 | nms_across_levels=False,
204 | nms_pre=1000,
205 | nms_post=1000,
206 | max_num=1000,
207 | nms_thr=0.7,
208 | min_bbox_size=0),
209 | rcnn=dict(
210 | score_thr=0.05,
211 | nms=dict(type='nms', iou_thr=0.5),
212 | max_per_img=100,
213 | mask_thr_binary=0.5))
214 | dataset_type = 'CocoDataset'
215 | data_root = '/content/drive/My Drive/all_data/'
216 | classes = ('text', 'title', 'list', 'table', 'figure')
217 | img_norm_cfg = dict(
218 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
219 | train_pipeline = [
220 | dict(type='LoadImageFromFile'),
221 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
222 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
223 | dict(type='RandomFlip', flip_ratio=0.5),
224 | dict(
225 | type='Normalize',
226 | mean=[123.675, 116.28, 103.53],
227 | std=[58.395, 57.12, 57.375],
228 | to_rgb=True),
229 | dict(type='Pad', size_divisor=32),
230 | dict(type='DefaultFormatBundle'),
231 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
232 | ]
233 | test_pipeline = [
234 | dict(type='LoadImageFromFile'),
235 | dict(
236 | type='MultiScaleFlipAug',
237 | img_scale=(1333, 800),
238 | flip=False,
239 | transforms=[
240 | dict(type='Resize', keep_ratio=True),
241 | dict(type='RandomFlip'),
242 | dict(
243 | type='Normalize',
244 | mean=[123.675, 116.28, 103.53],
245 | std=[58.395, 57.12, 57.375],
246 | to_rgb=True),
247 | dict(type='Pad', size_divisor=32),
248 | dict(type='ImageToTensor', keys=['img']),
249 | dict(type='Collect', keys=['img'])
250 | ])
251 | ]
252 | data = dict(
253 | samples_per_gpu=2,
254 | workers_per_gpu=2,
255 | train=dict(
256 | type='CocoDataset',
257 | ann_file=
258 | '/content/drive/My Drive/all_data/annotations/train_publaynet.json',
259 | img_prefix='/content/drive/My Drive/all_data/train/',
260 | classes=('text', 'title', 'list', 'table', 'figure'),
261 | pipeline=[
262 | dict(type='LoadImageFromFile'),
263 | dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
264 | dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
265 | dict(type='RandomFlip', flip_ratio=0.5),
266 | dict(
267 | type='Normalize',
268 | mean=[123.675, 116.28, 103.53],
269 | std=[58.395, 57.12, 57.375],
270 | to_rgb=True),
271 | dict(type='Pad', size_divisor=32),
272 | dict(type='DefaultFormatBundle'),
273 | dict(
274 | type='Collect',
275 | keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
276 | ]),
277 | val=dict(
278 | type='CocoDataset',
279 | ann_file=
280 | '/content/drive/My Drive/all_data/annotations/val_publaynet.json',
281 | img_prefix='/content/drive/My Drive/all_data/validation/',
282 | classes=('text', 'title', 'list', 'table', 'figure'),
283 | pipeline=[
284 | dict(type='LoadImageFromFile'),
285 | dict(
286 | type='MultiScaleFlipAug',
287 | img_scale=(1333, 800),
288 | flip=False,
289 | transforms=[
290 | dict(type='Resize', keep_ratio=True),
291 | dict(type='RandomFlip'),
292 | dict(
293 | type='Normalize',
294 | mean=[123.675, 116.28, 103.53],
295 | std=[58.395, 57.12, 57.375],
296 | to_rgb=True),
297 | dict(type='Pad', size_divisor=32),
298 | dict(type='ImageToTensor', keys=['img']),
299 | dict(type='Collect', keys=['img'])
300 | ])
301 | ]),
302 | test=dict(
303 | type='CocoDataset',
304 | ann_file=
305 | '/content/drive/My Drive/all_data/annotations/val_publaynet.json',
306 | img_prefix='/content/drive/My Drive/all_data/validation/',
307 | classes=('text', 'title', 'list', 'table', 'figure'),
308 | pipeline=[
309 | dict(type='LoadImageFromFile'),
310 | dict(
311 | type='MultiScaleFlipAug',
312 | img_scale=(1333, 800),
313 | flip=False,
314 | transforms=[
315 | dict(type='Resize', keep_ratio=True),
316 | dict(type='RandomFlip'),
317 | dict(
318 | type='Normalize',
319 | mean=[123.675, 116.28, 103.53],
320 | std=[58.395, 57.12, 57.375],
321 | to_rgb=True),
322 | dict(type='Pad', size_divisor=32),
323 | dict(type='ImageToTensor', keys=['img']),
324 | dict(type='Collect', keys=['img'])
325 | ])
326 | ]))
327 | evaluation = dict(interval=1, metric=['bbox', 'segm'])
328 | optimizer = dict(type='SGD', lr=0.001, momentum=0.9, weight_decay=0.0001)
329 | optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
330 | lr_config = dict(
331 | policy='step',
332 | warmup='linear',
333 | warmup_iters=500,
334 | warmup_ratio=0.3333333333333333,
335 | step=[16, 19])
336 | total_epochs = 20
337 | checkpoint_config = dict(interval=1, create_symlink=False)
338 | log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
339 | dist_params = dict(backend='nccl')
340 | log_level = 'INFO'
341 | load_from = None
342 | resume_from = '/content/drive/My Drive/mmdetection/tools/work_dirs/cascade_mask_rcnn_hrnetv2p_w32_20e_coco/epoch_9.pth'
343 | workflow = [('train', 1)]
344 | work_dir = './work_dirs/cascade_mask_rcnn_hrnetv2p_w32_20e_coco'
345 | gpu_ids = range(0, 1)
346 |
--------------------------------------------------------------------------------
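
This config can also be loaded and tweaked programmatically before training (a sketch, assuming the mmcv version that matches MMDetection v2 is installed):

```python
from mmcv import Config

cfg = Config.fromfile(
    'Document layout analysis/Config file/cascade_mask_rcnn_hrnetv2p_w32_20e_coco.py')
cfg.data.samples_per_gpu = 1            # e.g. shrink the batch for a small GPU
cfg.resume_from = None                  # start fresh instead of from epoch_9.pth
print(cfg.model.roi_head.num_stages)    # 3
```
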
/Document layout analysis/ICDAR_XML_to_COCO.py:
--------------------------------------------------------------------------------
1 | from pdf2image import convert_from_path
2 | from pdf2image.exceptions import (PDFInfoNotInstalledError, PDFPageCountError,PDFSyntaxError)
3 | from bs4 import BeautifulSoup as bs
4 | import glob
5 | import os
6 | import cv2
7 | import numpy as np
8 | import pprint
9 | import pickle
10 | import json
11 |
12 | pdf_path = '/home/prakhar/mmdetection/convert2pdf/all pdfs/*'
13 | xml_path = '/home/prakhar/mmdetection/convert2pdf/all xmls/*'
14 | pdfs = glob.glob(pdf_path)
15 | xmls = glob.glob(xml_path)
16 | a = 1
17 | b = 0
18 | img_list = []
19 | ann_list = []
20 | cate_list = []
21 | super_dict = {}
22 | categories = {}
23 |
24 | for i in pdfs:
25 |
26 | print (i)
27 | tail_pdf = os.path.split(i)
28 | name_pdf = os.path.splitext(tail_pdf[1])
29 | #print(tail_pdf[1])
30 | #print (name_pdf[0])
31 |
32 | for j in xmls :
33 |
34 | tail_xml = os.path.split(j)
35 | name_xml = os.path.splitext(tail_xml[1])
36 | #print(tail_xml[1])
37 | #print (name_xml[0])
38 |
39 | if (name_pdf[0]+"-reg" == name_xml[0]):
40 | pages = convert_from_path(i)
41 |
42 | for k, page in enumerate(pages):
43 |
44 | image = {}
45 |
46 | fname = name_pdf[0] + "_page_" + str(k+1) + ".png"
47 | print (j)
48 |
49 | content = []
50 | with open(j, 'r') as file:
51 | #import pdb ; pdb.set_trace()
52 |
53 | content = file.readlines()
54 | content ="".join(content)
55 | bs_content = bs(content, "lxml")
56 |
57 | table = bs_content.find_all("region")
58 | #print (table)
59 | coords = []
60 |
61 | for p in table:
62 |
63 | ann = {}
64 | masks = []
65 | num = p["page"]
66 | #print (num)
67 | #print (k+1)
68 | if num == str(k+1) :
69 |
70 | b = b+1
71 | length = len(p.contents)
72 | bbox = p.contents[length-2]
73 |
74 | x1 = int(bbox["x1"])
75 | #print (x1)
76 | y1 = int(bbox["y1"])
77 | w = int(bbox["x2"])-int(bbox["x1"])
78 | #print (w)
79 | h = int(bbox["y2"])-int(bbox["y1"])
80 | #print (h)
81 |
82 | coords = [x1, y1, w, h]
83 | mask = [x1, y1, x1, y1+h, x1+w, y1+h, x1+w, y1]
84 | masks.append(mask)
85 |
86 | ann["area"] = float(w*h)
87 | ann["bbox"] = coords
88 | ann["segmentation"] = masks
89 | ann["category_id"] = 1
90 | ann["image_id"] = a
91 | ann["id"] = b
92 | ann["iscrowd"] = 0
93 | ann["ignore"] = 0
94 | ann_list.append(ann)
95 |
96 | if len(coords) > 0 :
97 |
98 | page.save(fname, "PNG")
99 |
100 | img = cv2.imread(fname)
101 | dimensions = img.shape
102 | height = img.shape[0]
103 | width = img.shape[1]
104 | channels = img.shape[2]
105 |
106 | #print('Image Dimension : ',dimensions)
107 | #print('Image Height : ',height)
108 | #print('Image Width : ',width)
109 | #print('Number of Channels : ',channels)
110 |
111 | image["file_name"] = fname
112 | image["width"] = width
113 | image["height"] = height
114 | image["id"] = a
115 |
116 | img_list.append(image)
117 | a = a+1
118 |
119 | categories["id"] = 1
120 | categories["name"] = "table"
121 | cate_list.append(categories)
122 |
123 | super_dict["annotations"] = ann_list
124 | super_dict["categories"] = cate_list
125 | super_dict["images"] = img_list
126 | super_dict["type"] = "instances"
127 |
128 | filename = 'dataset'
129 | outfile = open(filename,'wb')
130 | pickle.dump(super_dict, outfile)
131 | outfile.close()
132 |
133 | with open("dataset.json", 'w') as outfile:
134 | json.dump(super_dict, outfile)
135 | #print (pickled_object)
136 | #unpickled_object = pickle.load(open(filename, 'rb'))
137 | #print (unpickled_object)
138 |
139 | #a = CustomDataset(pickle.loads(pickled_object))
140 | pp = pprint.PrettyPrinter(indent=4)
141 | pp.pprint (super_dict)
142 |
--------------------------------------------------------------------------------
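
The dataset.json this script writes follows the minimal COCO "instances" layout built above; a single-image example of the structure (values invented):

```python
import json

coco_like = {
    "type": "instances",
    "images": [
        {"file_name": "doc_page_1.png", "width": 1700, "height": 2200, "id": 1}
    ],
    "annotations": [{
        "area": 120000.0,
        "bbox": [100, 300, 600, 200],   # [x, y, w, h]
        "segmentation": [[100, 300, 100, 500, 700, 500, 700, 300]],
        "category_id": 1, "image_id": 1, "id": 1, "iscrowd": 0, "ignore": 0
    }],
    "categories": [{"id": 1, "name": "table"}]
}
print(json.dumps(coco_like, indent=2))
```
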
/Document layout analysis/main.py:
--------------------------------------------------------------------------------
1 | from mmdet.apis import inference_detector, show_result_pyplot, init_detector
2 | from mmdet.core import encode_mask_results, tensor2imgs
3 | import cv2
4 | import os
5 |
6 | ################################################### TO DO ###################################################
7 | image_pth = 'Give the image path'
8 |
9 | config_fname = "Give the config file path "
10 | checkpoint_path = 'Give the checkpoint file path'
11 | epoch = 'epoch_6.pth'
12 |
13 | #############################################################################################################
14 |
15 | model = init_detector(config_fname, checkpoint_path+epoch)
16 | img = cv2.imread(image_pth)
17 |
18 | result = inference_detector(model, img)
19 | #print ("The result is = ",result)
20 |
21 | results = []
22 | bbox_results, mask_results = result
23 |
24 | res_text= []
25 | res_title = []
26 | res_list = []
27 | res_table = []
28 | res_figure = []
29 | all_classes = []
30 |
31 | #for text
32 | for r in bbox_results[0]:
33 | if r[4]>.85:
34 | res_text.append(r[:4].astype(int))
35 |
36 | print ("No. of paragraphs on the page are == ",len(res_text))
37 | all_classes.append(res_text)
38 |
39 | #for title
40 | for r in bbox_results[1]:
41 | if r[4]>.85:
42 | res_title.append(r[:4].astype(int))
43 |
44 | print ("No. of headers on the page are == ",len(res_title))
45 | all_classes.append(res_title)
46 |
47 | #for list
48 | for r in bbox_results[2]:
49 | if r[4]>.85:
50 | res_list.append(r[:4].astype(int))
51 |
52 | print ("No. of lists on the page are == ",len(res_list))
53 | all_classes.append(res_list)
54 |
55 | #for table
56 | for r in bbox_results[3]:
57 | if r[4]>.85:
58 | res_table.append(r[:4].astype(int))
59 |
60 | print ("No. of the tables on the page are == ",len(res_table))
61 | all_classes.append(res_table)
62 |
63 | #for figure
64 | for r in bbox_results[4]:
65 | if r[4]>.85:
66 | res_figure.append(r[:4].astype(int))
67 |
68 | print ("No. of figures on the page are == ",len(res_figure))
69 | all_classes.append(res_figure)
70 |
71 | im2 = img.copy()
72 | for count, category in enumerate(all_classes):
73 | #print ("The no. of bbox in these classes are == ",len(category))
74 | im1 = img.copy()
75 | colors = [(55,255,20), (0,0,255), (132,240,255), (0,247,255), (2,2,105)]
76 | filename = ["paragraph_boxes.jpg", "header_boxes.jpg", "list_boxes.jpg", "table_boxes.jpg", "figure_boxes.jpg"]
77 |
78 | for box in category :
79 | #print (count)
80 | #print(colors[count])
81 | cv2.rectangle(im1, (box[0], box[1]), (box[2], box[3]), colors[count], 2)
82 | cv2.rectangle(im2, (box[0], box[1]), (box[2], box[3]), colors[count], 2)
83 |
84 | directory = '/content/drive/My Drive/results'
85 | os.chdir(directory)
86 | name = filename[count]
87 | #print (name)
88 | cv2.imwrite(name, im1)
89 |
90 | directory = '/content/drive/My Drive/results'
91 | os.chdir(directory)
92 | result_file = "all_annotations.jpg"
93 | cv2.imwrite(result_file, im2)
94 |
95 | encoded_mask_results = encode_mask_results(mask_results)
96 | print ("Encoded mask results are == ",encoded_mask_results)
97 | result = bbox_results, encoded_mask_results
98 |
99 | results.append(result)
--------------------------------------------------------------------------------
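
The five per-class blocks above repeat one filter; a compact equivalent (a sketch, with names local to this example):

```python
CLASS_NAMES = ['paragraphs', 'headers', 'lists', 'tables', 'figures']

def filter_boxes(bbox_results, thr=0.85):
    all_classes = []
    for name, dets in zip(CLASS_NAMES, bbox_results):
        kept = [d[:4].astype(int) for d in dets if d[4] > thr]
        print("No. of", name, "on the page ==", len(kept))
        all_classes.append(kept)
    return all_classes
```
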
/Document layout analysis/test_train_split.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sklearn
3 | import os
4 | import glob
5 | import pprint
6 | import shutil
7 |
8 | img_source_dir = '/home/prakhar/Publaynet/Original_data'
9 | train = '/home/prakhar/Publaynet/train'
10 | val = '/home/prakhar/Publaynet/validation'
11 | subdirs = []
12 | ratio = 0.7
13 | for subdir in os.listdir(img_source_dir):
14 |
15 | print (subdir)
16 | a = os.path.join(img_source_dir, subdir)
17 | subdirs.append(a)
18 |
19 | print (subdirs)
20 |
21 | elements = len(subdirs)
22 | middle = int(elements*ratio)
23 |
24 | train_list = subdirs[:middle]
25 | val_list = subdirs[middle:]
26 |
27 | for f in train_list:
28 | shutil.move(f, train)
29 |
30 | for f in val_list:
31 | shutil.move(f, val)
32 |
33 |
34 | train_path = '/home/prakhar/Publaynet/train/*'
35 | val_path = '/home/prakhar/Publaynet/validation/*'
36 |
37 | train_imgs = glob.glob(train_path)
38 | #print (train_imgs)
39 | val_imgs = glob.glob(val_path)
40 | #print (val_imgs)
41 |
42 | with open('/home/prakhar/Publaynet/Labels/val.json') as f:
43 | data = json.load(f)
44 |
45 | #pp = pprint.PrettyPrinter(indent=4)
46 | #pp.pprint (data)
47 |
48 | def create_dict(imgs, data):
49 |
50 | train_ann = []
51 | name = []
52 | super_dict = {}
53 | total = len(imgs)
54 |
55 | for count,i in enumerate(imgs):
56 |
57 | print("Progress : ",count,"/",total)
58 | image_name = os.path.split(i)
59 | name_list = data["images"]
60 |
61 | for j in name_list:
62 |
63 | if j["file_name"] == image_name[1]:
64 |
65 | num = j["id"]
66 | ann = data["annotations"]
67 | name.append(j)
68 |
69 | for k in ann:
70 |
71 | if k["image_id"] == num:
72 |
73 | train_ann.append(k)
74 |
75 | super_dict["annotations"] = train_ann
76 | super_dict["images"] = name
77 | super_dict["categories"] = data["categories"]
78 |
79 | return (super_dict)
80 |
81 | print ("For train")
82 | train_dict = create_dict(train_imgs, data)
83 | pp = pprint.PrettyPrinter(indent=4)
84 | pp.pprint (train_dict)
85 | with open("train_publaynet.json", 'w') as outfile:
86 | json.dump(train_dict, outfile)
87 |
88 | print ("For val")
89 | val_dict = create_dict(val_imgs, data)
90 | pp = pprint.PrettyPrinter(indent=4)
91 | pp.pprint (val_dict)
92 | with open("val_publaynet.json", 'w') as f:
93 | json.dump(val_dict, f)
94 |
95 |
96 |
--------------------------------------------------------------------------------
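
create_dict() rescans data["annotations"] for every image; indexing images by file name and annotations by image_id once makes the split roughly linear in the dataset size (a sketch with the same output shape):

```python
import os
from collections import defaultdict

def create_dict_fast(imgs, data):
    by_name = {img["file_name"]: img for img in data["images"]}
    anns_by_image = defaultdict(list)
    for ann in data["annotations"]:
        anns_by_image[ann["image_id"]].append(ann)

    images, anns = [], []
    for path in imgs:
        img = by_name.get(os.path.split(path)[1])
        if img is not None:
            images.append(img)
            anns.extend(anns_by_image[img["id"]])
    return {"annotations": anns, "images": images,
            "categories": data["categories"]}
```
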
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Prakhar-97
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Table-detection-and-Document-layout-analysis
2 | ## Introduction
3 | State-of-the-art techniques for table detection and document layout analysis. Table detection uses MMDetection version 1.2, while document layout analysis uses models developed in MMDetection version 2.0.
4 |
5 | ## Setup
6 | Models are developed in the PyTorch-based MMDetection framework (version 2.0)
7 |
8 |
9 |
10 | git clone https://github.com/open-mmlab/mmdetection.git
11 | cd mmdetection
12 | python setup.py install
13 | python setup.py develop
14 | pip install -r requirements.txt
15 |
16 |
17 | ## Image Augmentation
18 | We have followed Dilation and Smudge techniques for Data Augmentation
19 |
20 | ![Dilation and Smudge examples](Data%20Preparation/Images/3img.png)
21 |
22 |
23 | ## Model Zoo
24 | Config files for the models:
25 |
26 |
27 | 1. For table detection: Config_file
29 |
30 | 2. For Document Analysis: Config_file
32 |
33 | Note: Config paths only need to be changed during training
34 |
35 | Checkpoints of the Models that have been trained :
36 |
37 |
38 |
39 | | Model Name | Checkpoint File |
40 | | --- | --- |
42 | | Table structure recognition | Checkpoint |
45 | | Document layout analysis | Checkpoint |
46 |
47 |
48 |
49 | ## Datasets
50 | 1. Table detection and Structure Recognition:
51 | You can refer to Dataset to have a better understanding of the Dataset
52 |
53 | 2. Document layout Analysis:
54 | You can refer to Dataset to have a better understanding of the dataset.
55 |
56 | ## Training
57 |
58 | Refer to the two Colab notebooks that have been mentioned, as they will direct you through the steps that need to be followed. If using a custom dataset, do go through the MMdet Docs
59 |
60 |
61 |
--------------------------------------------------------------------------------
/literature-survey.md:
--------------------------------------------------------------------------------
1 | # Document layout analysis
2 |
3 | ## Datasets
4 | [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/)
5 | * 37 GB
6 | * image classification with 16 classes
7 |
8 | [DocBank](https://arxiv.org/pdf/2006.01038v1.pdf)
9 | * Yet to be released
10 | * author, footer, section, title, abstract, list, paragraph, reference, caption, equation, figure, table
11 |
12 | Other PubLayNet implementations
13 | * [with torch's maskrcnn](https://github.com/phamquiluan/publaynet)
14 | * [with detectron](https://github.com/hpanwar08/detectron2)
15 |
--------------------------------------------------------------------------------
|