├── README.md ├── correctPerspective.py ├── example output ├── graph.jpg ├── graph.py ├── invoice_images ├── Sample1 │ ├── 1.jpg │ ├── 2.jpg │ └── 3.jpg ├── Sample10 │ └── 1.jpg ├── Sample11 │ └── 1.jpg ├── Sample12 │ └── 1.jpg ├── Sample13 │ └── 1.jpg ├── Sample14 │ └── 1.jpg ├── Sample15 │ └── 1.jpg ├── Sample16 │ └── 1.jpg ├── Sample17 │ └── 1.jpg ├── Sample18 │ └── 1.jpg ├── Sample19 │ └── 1.jpg ├── Sample2 │ └── 1.jpg ├── Sample20 │ └── 1.jpg ├── Sample21 │ └── 1.jpg ├── Sample22 │ └── 1.jpg ├── Sample23 │ └── 1.jpg ├── Sample24 │ └── 1.jpg ├── Sample25 │ └── 1.jpg ├── Sample3 │ └── 1.jpg ├── Sample4 │ ├── 1.jpg │ ├── 2.jpg │ └── 3.jpg ├── Sample5 │ └── 1.jpg ├── Sample6 │ └── 1.jpg ├── Sample7 │ └── 1.jpg ├── Sample8 │ └── 1.jpg ├── Sample9 │ └── 1.jpg ├── X_lines.jpg ├── Y_lines.jpg ├── dilated.jpg ├── graph.jpg ├── lines_removed.jpg ├── output.csv ├── output.png └── sample_output_original.png ├── label_synonyms.csv ├── labels.csv ├── merge_boxes.py ├── pdf_to_images.py ├── rem_lines.py └── text_detector.py /README.md: -------------------------------------------------------------------------------- 1 | # invoice-extractor 2 | A python implementation to extract data in structured form from an image of an invoice 3 | 4 | # Flow: 5 | ## original invoice 6 | ![alt text](https://github.com/piyushmathur17/invoice-extractor/blob/master/invoice_images/sample_output_original.png) 7 | 8 | ## preprocessing 9 | 10 | ### removing lines 11 | this is being done to accurately detect text contours 12 | 13 | ### mask obtained for vertical and horizontal lines 14 | ![alt text](https://github.com/piyushmathur17/invoice-extractor/blob/master/invoice_images/X_lines.jpg) 15 | 16 | ![alt text](https://github.com/piyushmathur17/invoice-extractor/blob/master/invoice_images/Y_lines.jpg) 17 | 18 | ### after applying mask 19 | ![alt text](https://github.com/piyushmathur17/invoice-extractor/blob/master/invoice_images/lines_removed.jpg) 20 | 21 | ## Obtained graph 22 | ![alt text](https://github.com/piyushmathur17/invoice-extractor/blob/master/invoice_images/graph.jpg) 23 | after getting contours and merging them on the basis of their size and nearness 24 | *the red boxes are the identified keyfields 25 | the keyfields can be changes according to keywords given in labels.csv and label_synonnyms.csv 26 | *green boxes are the values 27 | *relation between the keyfields and it's possible values is shown by using straight lines 28 | 29 | # Output csv 30 | ![alt text](https://github.com/piyushmathur17/invoice-extractor/blob/master/invoice_images/output.png) 31 | 32 | -------------------------------------------------------------------------------- /correctPerspective.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import math 4 | 5 | 6 | def rotate_image(image, angle): 7 | # Grab the dimensions of the image and then determine the center 8 | (h, w) = image.shape[:2] 9 | (cX, cY) = (w / 2, h / 2) 10 | 11 | # grab the rotation matrix (applying the negative of the 12 | # angle to rotate clockwise), then grab the sine and cosine 13 | # (i.e., the rotation components of the matrix) 14 | M = cv2.getRotationMatrix2D((cX, cY), -angle, 1.0) 15 | cos = np.abs(M[0, 0]) 16 | sin = np.abs(M[0, 1]) 17 | 18 | # Compute the new bounding dimensions of the image 19 | nW = int((h * sin) + (w * cos)) 20 | nH = int((h * cos) + (w * sin)) 21 | 22 | # Adjust the rotation matrix to take into account translation 23 | M[0, 2] += (nW / 2) - cX 24 | M[1, 2] += (nH / 2) - cY 25 | 26 | # Perform the actual rotation and return the image 27 | return cv2.warpAffine(image, M, (nW, nH)) 28 | 29 | 30 | def getAngle(img): 31 | gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) 32 | ret, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV) 33 | ret = min(255, int(1.5*ret)) 34 | x = bw.copy() 35 | y = bw.copy() 36 | 37 | # Appplying dilation on vertical lines 38 | rows = x.shape[0] 39 | kernel = np.ones((3, 3), np.uint8) 40 | vertical_size = rows / 100 41 | vertical_size = int(vertical_size) 42 | 43 | # Create structure element for extracting vertical lines through morphology operations 44 | print("defining vertical lines...", flush=True) 45 | verticalStructure = cv2.getStructuringElement( 46 | cv2.MORPH_RECT, (1, vertical_size)) 47 | 48 | # Apply morphology operations 49 | x = cv2.erode(x, verticalStructure) 50 | x = cv2.dilate(x, verticalStructure) 51 | print("writing ...") 52 | cv2.imwrite('X.png', x) 53 | minLineLength = 200 54 | maxLineGap = 20 55 | print("extracting vertical lines...", flush=True) 56 | #ines=[] 57 | lines = cv2.HoughLinesP(x, 1, np.pi/180, 60, minLineLength, maxLineGap) 58 | print(lines) 59 | angle = 0.0 60 | val = 0 61 | #if lines!=[]: 62 | for line in lines: 63 | for x1, y1, x2, y2 in line: 64 | if(y1= 80: 10 | pos_edge = 1 11 | if (int(rect[y][x][0])+int(rect[y][x][1])+int(rect[y][x][2]) - int(rect[y-2][x][0])-int(rect[y-2][x][1])-int(rect[y-2][x][2])) / 2 <= -80: 12 | neg_edge = 1 13 | if(pos_edge and neg_edge): 14 | print("line detected between ", y1, " ", y2) 15 | return True 16 | return False 17 | 18 | def levenshtein_ratio_and_distance(s, t, ratio_calc=True): 19 | """ levenshtein_ratio_and_distance: 20 | Calculates levenshtein distance between two strings. 21 | If ratio_calc = True, the function computes the 22 | levenshtein distance ratio of similarity between two strings 23 | For all i and j, distance[i,j] will contain the Levenshtein 24 | distance between the first i characters of s and the 25 | first j characters of t 26 | """ 27 | # Initialize matrix of zeros 28 | rows = len(s)+1 29 | cols = len(t)+1 30 | distance = np.zeros((rows, cols), dtype=int) 31 | 32 | # Populate matrix of zeros with the indeces of each character of both strings 33 | for i in range(1, rows): 34 | for k in range(1, cols): 35 | distance[i][0] = i 36 | distance[0][k] = k 37 | 38 | # Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions 39 | for col in range(1, cols): 40 | for row in range(1, rows): 41 | if s[row-1] == t[col-1]: 42 | # If the characters are the same in the two strings in a given position [i,j] then the cost is 0 43 | cost = 0 44 | else: 45 | # In order to align the results with those of the Python Levenshtein package, if we choose to calculate the ratio 46 | # the cost of a substitution is 2. If we calculate just distance, then the cost of a substitution is 1. 47 | if ratio_calc == True: 48 | cost = 2 49 | else: 50 | cost = 1 51 | distance[row][col] = min(distance[row-1][col] + 1, # Cost of deletions 52 | # Cost of insertions 53 | distance[row][col-1] + 1, 54 | distance[row-1][col-1] + cost) # Cost of substitutions 55 | if ratio_calc == True: 56 | # Computation of the Levenshtein Distance Ratio 57 | Ratio = ((len(s)+len(t)) - distance[row][col]) / (len(s)+len(t)) 58 | return Ratio 59 | else: 60 | # print(distance) 61 | # insertions and/or substitutions 62 | # This is the minimum number of edits needed to convert string a to string b 63 | return distance[row][col] 64 | 65 | 66 | class node: 67 | def __init__(self, i,x1,x2,y1,y2, text_val): 68 | self.i = i 69 | #x1,x2,y1,y2 70 | self.x1 = x1 71 | self.x2 = x2 72 | self.y1 = y1 73 | self.y2 = y2 74 | #edges store the nodes this node is connected to 75 | #they are integers denoting node number 76 | self.edges = [] 77 | #parent of current node 78 | self.parent = -1 79 | #is node a key 80 | self.is_key = False 81 | #stores column number it is a part of 82 | self.column = -1 83 | #stores node present below it 84 | self.down = -1 85 | #stores node present to it's right 86 | self.right = -1 87 | self.left = -1 88 | #used in unused function 89 | self.parents = [] #stores all parent in path to current node 90 | self.table_number = -1 91 | self.table_col = -1 92 | self.table_row = -1 93 | self.text_value = text_val 94 | self.match_percent = 0 95 | 96 | #check if two nodes are overlapping 97 | def overlapping(x1,x2,y1,y2,a1,a2,b1,b2): 98 | if(x2<=a1 or x1>=a2): return False 99 | elif(y2<=b1 or y1>=b2):return False 100 | else : return True 101 | #check if two nodes are verically above and below (if one node's 80% lies within other node's width) 102 | #return value 0 : not overlapping 103 | #return value 1 : overlapping but not 70% 104 | #return value 2 : overlapping and >=70% 105 | def x_overlapping(x1,x2,y1,y2,a1,a2,b1,b2): 106 | if(x2<=a1 or x1>=a2): return 0 107 | if(abs(x1-a1)>200):return 0 108 | else: 109 | l = max(x1,a1) 110 | r = min(x2,a2) 111 | if((r-l)/(x2-x1) >= 0.7):return 2 112 | if((r-l)/(a2-a1) >= 0.7):return 2 113 | return 1 114 | 115 | 116 | #unused function 117 | def make_edges(img,key,pos,contours,graph,node_map,key_fields,Nodes): 118 | print("making edges") 119 | parent = contours[key][pos] 120 | i=pos+1 121 | #creating edges between parent and nodes to the right 122 | while not(i>=len(contours[key]) or ( contours[key][i] in key_fields) ): 123 | graph[parent][contours[key][i]] = True 124 | graph[contours[key][i]][parent] = True 125 | Nodes[parent].edges.append(contours[key][i]) 126 | Nodes[contours[key][i]].parents.append(parent) 127 | x1= node_map[parent][0] 128 | x2= node_map[contours[key][i]][0] 129 | y1= node_map[parent][1] 130 | y2= node_map[contours[key][i]][1] 131 | cv2.line(img,(x1,y1),(x2,y2),(204,100,0),2) 132 | i+=1 133 | #make edges between parent and nodes down below 134 | return img 135 | 136 | #unused function 137 | def make_columns(img,contours,node_map,graph,key_fields,Nodes,node_to_key): 138 | for field in key_fields: 139 | x1 = Nodes[field].x1 140 | x2 = Nodes[field].x2 141 | Nodes[field].is_key = True 142 | for row in contours: 143 | if row <= node_to_key[field]: continue 144 | for row_node in contours[row]: 145 | a = node_map[row_node][0] 146 | b = node_map[row_node][0] + a 147 | if ( not( b<=x1 or a>=x2 ) ): 148 | #if abs(x1-a)<200 and Nodes[field].column == Nodes[row_node].column: 149 | #find node in Nodes corresponding to current element row 150 | graph[field][row_node] = True 151 | graph[row_node][field] = True 152 | Nodes[field].edges.append(row_node) 153 | Nodes[row_node].parents.append(field) 154 | a1= node_map[field][0] 155 | a2= node_map[row_node][0] 156 | b1= node_map[field][1] 157 | b2= node_map[row_node][1] 158 | cv2.line(img,(a1,b1),(a2,b2),(167,88,162),2) 159 | 160 | #look at this function carefully 161 | def dfs(img,row,column,contours,Nodes,go_down,go_right,parent,root): 162 | #contours is a dict with key as row numbers (0,1,2,...,total rows-1) 163 | #eg. { 0:[0,1,2], 1:[3,4,5,6], 2:[7,8,9]} when image has 3 rows and 10 nodes 164 | #row is the key/row number 165 | #column is index of current node in list contours[row] 166 | #Nodes is a list of all node objects 167 | #go_down specifies whether to look for node below 168 | #go_right species whther to look for node to the right 169 | #parent is immediate parent node of current node (parent would be at the left or at top of current node) 170 | #root is the first node of the path of current node(should be a keyfield node) 171 | if(row>= len(contours) or column>= len(contours[row])):return 172 | current_node = contours[row][column] 173 | x1=Nodes[current_node].x1 174 | x2=Nodes[current_node].x2 175 | y1=Nodes[current_node].y1 176 | y2=Nodes[current_node].y2 177 | 178 | #look right if node is not last in row and next node is not a key 179 | if(go_right and column= maxi): 285 | maxi = count 286 | key_row = row 287 | 288 | #we get the table start here, this row is the header 289 | print(key_row) 290 | y1 = Nodes[contours_as_nodes[key_row][0]].y1 291 | for j in range(0,len(contours_as_nodes[key_row])): 292 | Nodes[contours_as_nodes[key_row][j]].is_key=True 293 | Nodes[contours_as_nodes[key_row][j]].table_number = table_count 294 | Nodes[contours_as_nodes[key_row][j]].table_col = j 295 | Nodes[contours_as_nodes[key_row][j]].table_row = 0 296 | Nodes[contours_as_nodes[key_row][j]].text_value = Nodes[contours_as_nodes[key_row][j]].text_value.upper() 297 | 298 | num_fields = 0 299 | start_val = key_row + 1 300 | 301 | #now we'll check next row if there is any key (i.e. there could be subcategories) 302 | text_blobs = 0 303 | key_blobs = 0 304 | for j in range(0,len(contours_as_nodes[key_row+1])): 305 | if(Nodes[contours_as_nodes[key_row+1][j]].is_key): 306 | key_blobs+=1 307 | else : text_blobs+=1 308 | Nodes[contours_as_nodes[key_row][j]].table_number = table_count 309 | 310 | if(key_blobs>=text_blobs): 311 | for j in range(0,len(contours_as_nodes[key_row+1])): 312 | Nodes[contours_as_nodes[key_row+1][j]].is_key=True 313 | start_val += 1 314 | y2 = Nodes[contours_as_nodes[start_val][0]].y1 315 | while(not detect_line(img, Nodes[contours_as_nodes[start_val][1]].x1, y1, y2)): 316 | y1 = y2 317 | start_val += 1 318 | y2 = Nodes[contours_as_nodes[start_val][0]].y1 319 | 320 | num_fields = len(contours_as_nodes[start_val]) 321 | 322 | table_val_stop = start_val 323 | table_end = table_val_stop 324 | table_row_count = 0 325 | #now we gonna make sure no value inside the table is a key (we'll spare first column tho) 326 | for i in range(start_val, len(contours_as_nodes)): 327 | table_row_count+=1 328 | if(abs(len(contours_as_nodes[i])-num_fields)>1): 329 | table_val_stop = i-1 330 | table_count += 1 331 | break 332 | Nodes[contours_as_nodes[i][0]].table_col = 0 333 | Nodes[contours_as_nodes[i][0]].table_row = table_row_count 334 | Nodes[contours_as_nodes[i][0]].table_number = table_count 335 | 336 | for j in range(1,len(contours_as_nodes[i])): 337 | Nodes[contours_as_nodes[i][j]].is_key=False 338 | Nodes[contours_as_nodes[i][j]].table_col = j 339 | Nodes[contours_as_nodes[i][j]].table_row = table_row_count 340 | Nodes[contours_as_nodes[i][j]].table_number = table_count 341 | 342 | #now we gotta search if there is a total field that can be used 343 | #we need keys value for this thing 344 | #matching for column has to be done using coordinates 345 | 346 | flag = True 347 | cnt = 0 348 | for i in range(table_val_stop + 1, min(table_val_stop+5,len(contours_as_nodes))): 349 | col = 0 350 | cnt += 1 351 | for j in range(0, len(contours_as_nodes[i])): 352 | if('total' in Nodes[contours_as_nodes[i][j]].text_value.lower() and Nodes[contours_as_nodes[i][j]].is_key): 353 | # map values based on x coordinates 354 | table_end = i 355 | col = j 356 | Nodes[contours_as_nodes[i][j]].table_number = table_count-1 357 | # Nodes[contours_as_nodes[i][j]].table_col = col 358 | # Nodes[contours_as_nodes[i][j]].table_row = Nodes[contours_as_nodes[table_val_stop][k]].table_row + cnt 359 | 360 | break 361 | if(table_end == i): 362 | for j in range(col, len(contours_as_nodes[i])): 363 | b1 = Nodes[contours_as_nodes[i][j]] 364 | for k in range(0, len(contours_as_nodes[table_val_stop])): 365 | b2 = Nodes[contours_as_nodes[table_val_stop][k]] 366 | if(x_overlapping(b1.x1, b1.x2, b1.y1, b1.y2, b2.x1, b2.x2, b2.y1, b2.y2) > 0): 367 | Nodes[contours_as_nodes[i][j]].table_col = Nodes[contours_as_nodes[table_val_stop][k]].table_col 368 | Nodes[contours_as_nodes[i][j]].table_row = Nodes[contours_as_nodes[table_val_stop][k]].table_row + cnt 369 | break 370 | 371 | for j in range(table_end-1,table_val_stop,-1): 372 | cnt-=1 373 | for k in range(0, len(contours_as_nodes[j])): 374 | Nodes[contours_as_nodes[j][k]].table_row = Nodes[contours_as_nodes[table_val_stop][k]].table_row + cnt 375 | Nodes[contours_as_nodes[j][k]].table_number = Nodes[contours_as_nodes[table_val_stop][k]].table_number 376 | b1 = Nodes[contours_as_nodes[j][k]] 377 | for l in range(0,len(contours_as_nodes[table_val_stop])): 378 | b2 = Nodes[contours_as_nodes[table_val_stop][l]] 379 | if(x_overlapping(b1.x1, b1.x2, b1.y1, b1.y2, b2.x1, b2.x2, b2.y1, b2.y2) > 0): 380 | Nodes[contours_as_nodes[j][k]].table_col = Nodes[contours_as_nodes[table_val_stop][l]].table_col 381 | break 382 | break 383 | 384 | print("Print Vals") 385 | print(key_row, table_val_stop, start_val) 386 | table_extract = [ [""]*(num_fields+5) ] 387 | for i in range(0, table_end - key_row +3): 388 | table_extract.append([""]*num_fields) 389 | for i in range(key_row, table_end+1): 390 | for j in range(0, len(contours_as_nodes[i])): 391 | # print(Nodes[contours_as_nodes[i][j]].text_value, Nodes[contours_as_nodes[i][j]].table_row, Nodes[contours_as_nodes[i][j]].table_col) 392 | table_extract[Nodes[contours_as_nodes[i][j]].table_row][Nodes[contours_as_nodes[i][j]].table_col] = Nodes[contours_as_nodes[i][j]].text_value 393 | if(Nodes[contours_as_nodes[i][j]].is_key): 394 | table_extract[Nodes[contours_as_nodes[i][j]].table_row][Nodes[contours_as_nodes[i][j] 395 | ].table_col] = table_extract[Nodes[contours_as_nodes[i][j]].table_row][Nodes[contours_as_nodes[i][j]].table_col].upper() 396 | print(table_extract) 397 | import csv 398 | 399 | for keyfield in key_fields: 400 | row = node_to_row[keyfield] 401 | for i in range(0,len(contours_as_nodes[row]) ): 402 | if contours_as_nodes[row][i] == keyfield: 403 | column = i 404 | break 405 | root = contours_as_nodes[row][column] 406 | parent = contours_as_nodes[row][column] 407 | dfs(img,row,column,contours_as_nodes,Nodes,True,True,parent,root) 408 | #img = make_edges(img,key,i,contours_as_nodes,graph,node_map,key_fields,Nodes) 409 | #assign_columns(img,column_contours,Nodes) 410 | #for i in Nodes: 411 | #print(i.column) 412 | #make_columns(img,contours_as_nodes,node_map,graph,key_fields,Nodes,node_to_row) 413 | 414 | for i in Nodes: 415 | if i.table_number != -1: 416 | if(Nodes[i.down].left != -1 and Nodes[Nodes[i.down].left].is_key): 417 | i.down = -1 418 | 419 | key_match = find_label(synonyms, key_fields, Nodes) 420 | print(key_match) 421 | for i in key_match: 422 | print(labels[i], Nodes[int(key_match[i])].text_value) 423 | data = extract(labels, key_match, Nodes) 424 | with open('output.csv', 'w', newline="") as csv_file: 425 | writer = csv.writer(csv_file) 426 | for key, value in data.items(): 427 | writer.writerow([key.upper(), value]) 428 | 429 | with open('output.csv', 'a+', newline="") as csv_file: 430 | writer = csv.writer(csv_file) 431 | for i in range(0,2): 432 | writer.writerow('') 433 | 434 | 435 | with open("output.csv", "a+", newline="") as f: 436 | writer = csv.writer(f) 437 | writer.writerows(table_extract) 438 | 439 | 440 | # print(ppp) 441 | cv2.imwrite("graph.jpg",img ) 442 | 443 | 444 | def find_label(synonyms, detected_fields, Nodes): 445 | #key_match is a dict with key= label no. and value = node number that matches that key 446 | key_match = {} 447 | prepositions = ['the', 'of', 'a', 'in', 'an', 'is', 'on'] 448 | for i in detected_fields: 449 | # separate key from value if in same node 450 | words = Nodes[i].text_value.lower() 451 | words = words.split(' ') 452 | node_words = [] 453 | 454 | for j in words: 455 | if j not in prepositions: 456 | node_words.append(j) 457 | 458 | for label in synonyms: 459 | words = synonyms[label] 460 | for p in words: 461 | p=p.lower() 462 | words = p.split(' ') 463 | synonym_words = [] 464 | word_count = 0 465 | match_count = 0 466 | for j in words: 467 | if j not in prepositions and len(j)>=2: 468 | word_count += 1 469 | for k in node_words: 470 | if(k == j or (len(k) > 2 and levenshtein_ratio_and_distance(k, j) > 0.8)): 471 | match_count += 1 472 | percent = 0 473 | if(word_count>0):percent = (match_count/word_count) 474 | if Nodes[i].match_percent < percent: 475 | Nodes[i].match_percent = percent 476 | if(percent>0.7): 477 | key_match[label] = i 478 | return key_match 479 | 480 | 481 | def extract_non_table(labels, keymatch, Nodes, cur_node, label, vis): 482 | if cur_node == -1: 483 | return "" 484 | if vis[cur_node]: 485 | return "" 486 | vis[cur_node] = True 487 | a="" 488 | if cur_node not in vis: 489 | a = Nodes[cur_node].text_value + \ 490 | extract_non_table(labels, keymatch, Nodes, 491 | Nodes[cur_node].down, label, vis) 492 | return a 493 | 494 | 495 | def get_key(val, key_match): 496 | for key, value in key_match.items(): 497 | # print(val, value) 498 | if val == value: 499 | return key 500 | return "-1" 501 | 502 | def extract(labels, key_match, Nodes): 503 | vis = np.zeros((len(Nodes)), dtype=bool) 504 | data = {} 505 | for i in range(0, len(Nodes)): 506 | if Nodes[i].is_key and Nodes[i].table_number==-1: 507 | # print(str(Nodes[i])) 508 | label = get_key(str(i),key_match) 509 | if label == "-1": 510 | label = Nodes[i].text_value 511 | # print ("-1") 512 | else : 513 | label = labels[label] 514 | # print(label) 515 | # data[label] = "" 516 | a = "" 517 | a = extract_non_table(labels, key_match, Nodes, 518 | Nodes[i].right, label, vis) 519 | a += extract_non_table(labels, key_match, Nodes, 520 | Nodes[i].down, label, vis) 521 | if(':' in label): 522 | kk, vv = label.split(':',1) 523 | data[kk] = vv 524 | continue 525 | if(len(a)>0): 526 | data[label]=a 527 | return data 528 | 529 | -------------------------------------------------------------------------------- /invoice_images/Sample1/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample1/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample1/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample1/2.jpg -------------------------------------------------------------------------------- /invoice_images/Sample1/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample1/3.jpg -------------------------------------------------------------------------------- /invoice_images/Sample10/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample10/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample11/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample11/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample12/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample12/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample13/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample13/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample14/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample14/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample15/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample15/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample16/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample16/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample17/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample17/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample18/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample18/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample19/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample19/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample2/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample2/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample20/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample20/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample21/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample21/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample22/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample22/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample23/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample23/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample24/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample24/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample25/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample25/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample3/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample3/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample4/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample4/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample4/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample4/2.jpg -------------------------------------------------------------------------------- /invoice_images/Sample4/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample4/3.jpg -------------------------------------------------------------------------------- /invoice_images/Sample5/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample5/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample6/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample6/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample7/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample7/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample8/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample8/1.jpg -------------------------------------------------------------------------------- /invoice_images/Sample9/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Sample9/1.jpg -------------------------------------------------------------------------------- /invoice_images/X_lines.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/X_lines.jpg -------------------------------------------------------------------------------- /invoice_images/Y_lines.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/Y_lines.jpg -------------------------------------------------------------------------------- /invoice_images/dilated.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/dilated.jpg -------------------------------------------------------------------------------- /invoice_images/graph.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/graph.jpg -------------------------------------------------------------------------------- /invoice_images/lines_removed.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/lines_removed.jpg -------------------------------------------------------------------------------- /invoice_images/output.csv: -------------------------------------------------------------------------------- 1 | GST REG NO,17ABCDEFI2Z3GXYZ 2 | REF NO / DATE,PO1234567 / 1.7.2017 3 | GST INV NO/ DATE,11416603306 / 03.01.2018 4 | ORDER NO/ DATE,1234567 / 1.7.2017 5 | Z030 30 DAYS FROM INVOICE DATE,02.02.201811420) 6 | RECIEVER COMPANYIND,"111000, India09-Uttar PradeshO7AABBC8888G1L AZTABBCC9098G" 7 | RECIEVER COMPANYLUD,"111000, India09-Uttar PradeshO7AABBC8888G1AZI1" 8 | GSTIN/UNIQUE ID,PAN 9 | AMT,"17,908.565,014.442,060.1024,983.10" 10 | TOTAL INVOICE VALUE ( IN FIGURE ),"163,778.00" 11 | TOTAL INVOICE VALUE (IN WORDS ),ONE T.AKH SIXTYNo Of packaces : iff 12 | PO NUMHER , PO1234567 13 | 14 | 15 | MATERIAL,DESCRIPTION,HSN,QTY,UNIT PRICE,TOTAL,DISCOUNT,TAXABLE VALUE,IGST,,,,,, 16 | 512167,Dummy product for evaluation,75647000,12.066,"6,291.00","99,492.00",0.00,"99,492.00",iB,"17,908.56" 17 | 512168,Dummy product for evaluation,75647000,3.000,"9,286.00","27,858 00",o.oc,"27,858.90",18,"5,014.44" 18 | §12425,Dummy product for evaluation,73647000,3.000,"3,815.00","11,445.00",0.00,"12,445.00",18,"2,060.10" 19 | FREIGHT,,,,,,,,, 20 | INSURANCE,,,,,,,,, 21 | PACKING & FORWARDING CHARGES,,,,,,,,, 22 | TOTALS,,,19.000,,"138,795.00",0.00,"138,795.00",,"24,983.10" 23 | ,,,,,,,,, 24 | ,,,,,,,,, 25 | ,,,,,,,,, 26 | ,,,,,,,,,AMT 27 | -------------------------------------------------------------------------------- /invoice_images/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/output.png -------------------------------------------------------------------------------- /invoice_images/sample_output_original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piyushmathur17/invoice-extractor/6cf37789952df5bd94b4e444bf0a0d9a18e17e69/invoice_images/sample_output_original.png -------------------------------------------------------------------------------- /label_synonyms.csv: -------------------------------------------------------------------------------- 1 | 0,Supplier Address,Seller Address,Seller Addr.,Supplier Addr.,Seller's Address,Supplier's Address,supplier address,supplier add.,,,, 2 | 1,Seller GST No.,Seller GSTIN,Seller GSTN No.,Seller GST Number,Supplier GST No.,Supplier GSTIN,Supplier GSTN No.,Supplier GST Number,vendor gstin,vendor gstin/unique,, 3 | 2,Country of Origin,Source of origin,Shipping from,Shipping Place,Sent from,Ship from,,,,,, 4 | 3,Currency,INR,,,,,,,,,, 5 | 4,Description,Desc.,Additional Info,Invoice Info.,,,,,,,, 6 | 5,Total Invoice amount entered by WH operator,Total Amount,Total Amt.,Total Price,Tota,grand total,invoice amt,invoice amount,,,, 7 | 6,Total Invoice Quantity entered by WH operator,qty,quantity,total quantity,No of items,total items,,,,,, 8 | 7,Total TCS Collected,tax,tcs,tcs collected,,,,,,,, 9 | 8,Round Off Charges,round off,round off amount,rounded amount,,,,,,,, 10 | 9,PO Number,po number,purchase number,po,purchase order number,order no,order number,order no.,customer po no,curomer po number,, 11 | 10,Invoice Items Total Amount,Total Amount,Total Amt.,Total Price,Tota,,,,,,, 12 | 11,Invoice Items total quantity,qty,quantity,total quantity,No of items,,,,,,, 13 | 12,Buyer GSTIN Number,Buyer GSTIN Number,buyer gstin,gstin,gstn no.,gstn no,,,,,, 14 | 13,Ship to Address,ship to address,shipping address,delivery address,state,ship-to,,,,,, 15 | 14,S.No,S no,serial,serial no,serial number,serial no.,Sl,Sl. No,Sl no,,, 16 | 15,Product ID,product id,item id,title,description,description of goods,item description,,,,, 17 | 16,HSN,hsn,hsn/sac,sac,,,,,,,, 18 | 17,Title,,,,,,,,,,, 19 | 18,Quantity,qty,quantity,number of items,qty.,,,,,,, 20 | 19,Unit Price,price,item price,basic selling price,selling price,selling price(INR),,,,,, 21 | 20,Excise Duty,,,,,,,,,,, 22 | 21,Discount Percent,disc,discount%,discount %,off %,disc %,discount,discount percent,discou,,, 23 | 22,SGST Percent,SGST%,SGST,SGST Percent,SGST rate,,,,,,, 24 | 23,CGST Percent,CGST,CGST%,CGST Percent,SGST rate,,,,,,, 25 | 24,IGST Percent,IGST Percent,IGST Percent,IGST Percent,IGST rate,,,,,,, 26 | 25,Cess Percent,cess charge,cess,cess%,cess percent,cess %,cess rate,,,,, 27 | 26,TCS Percent,tcs charge,tcs,tcs%,tcs percent,tcs %,tcs rate,,,,, 28 | 27,Total Amount,Total Amount,Total Amt.,Total Price,Tota,grand total,total amt,invoice amount,invoice amt,payable amt,net payable,net amount 29 | 28,APP %,APP,APP %,APP%,,,,,,,, 30 | -------------------------------------------------------------------------------- /labels.csv: -------------------------------------------------------------------------------- 1 | Seller Address,0 2 | Seller GSTIN Number,1 3 | Country of Origin,2 4 | Currency,3 5 | Description,4 6 | Total Invoice amount entered by WH operator,5 7 | Total Invoice Quantity entered by WH operator,6 8 | Total TCS Collected,7 9 | Round Off Charges,8 10 | PO Number,9 11 | Invoice Items Total Amount,10 12 | Invoice Items total quantity,11 13 | Buyer GSTIN Number,12 14 | Ship to Address,13 15 | S.No,14 16 | Product ID,15 17 | HSN,16 18 | Title,17 19 | Quantity,18 20 | Unit Price,19 21 | Excise Duty,20 22 | Discount Percent,21 23 | SGST Percent,22 24 | CGST Percent,23 25 | IGST Percent,24 26 | Cess Percent,25 27 | TCS Percent,26 28 | Total Amount,27 29 | APP %,28 30 | -------------------------------------------------------------------------------- /merge_boxes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | 4 | 5 | #a utility function to assign each word a row 6 | def make_rows(contours, thresh_y = 0.6): 7 | contoursBBS = {} 8 | height_list=[] 9 | for contour in contours: 10 | [x, y, w, h] = cv2.boundingRect(contour) 11 | height_list.append(h) 12 | height_list.sort() 13 | #contours with height less than min_height will be discarded 14 | min_height = height_list[int(len(height_list)/2)]*0.6 15 | print("min_height: ",min_height) 16 | #finding suitable line height 17 | alpha = int(len(height_list)*0.3) 18 | line_height = 1.2*sum(height_list[alpha:len(height_list)-alpha])/(len(height_list)-2*alpha) 19 | 20 | for contour in contours: 21 | [x, y, w, h] = cv2.boundingRect(contour) 22 | if h< min_height : continue 23 | cnt= [x,y,w,h] 24 | search_key= y 25 | #check if current contour is part of any existing row 26 | if contoursBBS: 27 | text_row = min(contoursBBS.keys(), key = lambda key: abs(key-search_key)) 28 | #if diff btw nearest row and y is greater than the threshhold 29 | #if(abs(text_row-y) > h*thresh_y): 30 | if(abs(text_row-y) > line_height): 31 | contoursBBS[y]=[] 32 | contoursBBS[y].append(cnt) 33 | else : contoursBBS[text_row].append(cnt) 34 | #else make new row 35 | else: contoursBBS[y]=[cnt] 36 | 37 | #sort contours 38 | for row in contoursBBS: 39 | contoursBBS[row].sort(key = lambda x: x[0]) 40 | 41 | return contoursBBS 42 | 43 | def detect_line(rect,x1,x2,y1,y2,w1,w2,h1,h2): 44 | x1=x1+w1+1 45 | y=int((y1+h1)/2 + (y2+h2)/2) 46 | pos_edge=0 47 | neg_edge=0 48 | for i in range(x1,x2): 49 | if (int(rect[y][i][0])+int(rect[y][i][1])+int(rect[y][i][2]) - int(rect[y][i-2][0])-int(rect[y][i-2][1])-int(rect[y][i-2][2]))/2 >= 80 : pos_edge=1 50 | if (int(rect[y][i][0])+int(rect[y][i][1])+int(rect[y][i][2]) - int(rect[y][i-2][0])-int(rect[y][i-2][1])-int(rect[y][i-2][2]) ) /2 <= -80 : neg_edge=1 51 | if(pos_edge and neg_edge): 52 | print("line detected between ",x1+w1," ",x2) 53 | return True 54 | return False 55 | #a utility function to merge two words based on their nearness 56 | def merge_boxes(rect, contoursBBS, thresh_x = 0.3, thresh_y = 0.3): 57 | merge_cnt={} 58 | i=0 59 | for key in contoursBBS: 60 | j=1 61 | i=0 62 | de=[] 63 | merge_cnt[key]=[] 64 | [x1,y1,w1,h1]=contoursBBS[key][i] 65 | new_width = w1 66 | new_height = h1 67 | miny=y1 68 | #iterating through row to see if current contour can be merged with previous 69 | while j< len(contoursBBS[key]): 70 | 71 | [x2,y2,w2,h2]=contoursBBS[key][j] 72 | if( abs(y1-y2)=1 and levenshtein_ratio_and_distance(k,tex)>0.8 ): 155 | rect = cv2.rectangle(rect, (x, y), (x + w, y + h), (0, 0, 255), 1) 156 | key_nodes.append(node_number-1) 157 | # if(':' in text): 158 | # key, val = text.split(':',1) 159 | # break 160 | break 161 | text_val[node_number-1] = text 162 | end2 = time.time() 163 | tesstime += end2-end 164 | # Appending the text into file 165 | if text!="": 166 | if write_: file.write(text) 167 | file_.write(text) 168 | if write_: file_.write("\n") 169 | file.write("\n") 170 | 171 | tt = time.time()- tt 172 | print("croptime: ",croptime, " tesstime: ",tesstime," tt: ",tt) 173 | # Close the file 174 | file.close 175 | cv2.imwrite(save_dir+'boxed_'+file_name, rect) 176 | 177 | 178 | make_graph(rect,merge_cnt,key_nodes,column_contours, text_val, synonyms, labels) 179 | #cv2.imshow('ho hey',rect) 180 | #cv2.waitKey(0) 181 | #cv2.destroyAllWindows() 182 | 183 | def main(): 184 | 185 | path = 'F:/Flipkart/New folder/helli/invoice-extractor/invoice_images/' 186 | 187 | folders = listdir(path) 188 | #print(pdfs) 189 | for folder in folders: 190 | dir_path = path + folder + "/" 191 | images = listdir(dir_path) 192 | if(folder!="Sample3"): continue 193 | for image in images: 194 | if len(image.split('.')[0])>1:continue 195 | file_name = image 196 | #file_path = dir_path 197 | #mkdir(target) 198 | get_text(dir_path,file_name,write_ = False) 199 | 200 | if __name__ == '__main__': 201 | main() 202 | --------------------------------------------------------------------------------