# -*- coding: utf-8 -*-
# Repository layout:
# ├── code
# │   ├── Readme
# │   ├── renderer.py
# │   ├── FigCap.py
# │   ├── pdf_info.py
# │   └── xpdf_process.py
# ├── data
# │   └── Readme
# ├── README.md
# └── LICENSE
#
# /code/Readme:
# (empty)
#
# /data/Readme:
# (empty)
#
# /code/renderer.py:
import os
import re
import shutil
import subprocess
import sys
import tempfile

import numpy as np
from PIL import Image

#output_dpi = str(72)

# Explicit ImageMagick location; only used on Windows. On *nix Ghostscript
# ('gs') is expected to be reachable through PATH.
_IMAGEMAGICK_PATH = '/usr/pengyuan/others/ImageMagick-7.0.3-5-portable-Q16-x86/convert.exe'
# Increase this if you want higher resolution images (ImageMagick branch only).
_RASTER_SCALE = 3


def render_pdf(filename, customize_dpi=72):
    """
    Render every page of a PDF and return the pages as a list of PIL images.

    The document is rasterised into a temporary directory (ImageMagick on
    Windows, Ghostscript elsewhere), the resulting PNG files are loaded in
    natural page order, and the temporary directory is removed again.

    :param filename: path of the PDF file to render.
    :param customize_dpi: output resolution in dots per inch. Defaults to 72
        so that callers which pass only the filename keep working.
    :return: list of RGB ``PIL.Image`` objects, one per page. The list is
        empty when the external renderer produced no output.
    """
    output_dpi = str(customize_dpi)
    outputDir = tempfile.mkdtemp()
    try:
        # The command is passed as an argument list (no shell), so file names
        # containing spaces or shell metacharacters are handled safely.
        if os.name == 'nt':
            command = [_IMAGEMAGICK_PATH,
                       '-density', str(_RASTER_SCALE * 100),
                       '-resample', output_dpi,
                       '-set', 'colorspace', 'RGB',
                       filename, os.path.join(outputDir, 'image.png')]
        else:
            command = ['gs', '-q', '-sDEVICE=png16m',
                       '-o', os.path.join(outputDir, 'file-%02d.png'),
                       '-r' + output_dpi, filename]
        try:
            subprocess.call(command)
        except OSError as err:
            # Keep the original best-effort behaviour: a missing renderer
            # yields an empty page list instead of an exception.
            sys.stderr.write('render_pdf: could not run %s: %s\n' % (command[0], err))

        files = [f for f in os.listdir(outputDir)
                 if os.path.isfile(os.path.join(outputDir, f)) and not f.startswith('.')]
        images = []
        for f in natural_sort(files):
            if f.endswith('.png'):
                pageIm = Image.open(os.path.join(outputDir, f)).convert('RGB')
                pageIm.load()  # load into memory (also closes the file associated)
                images.append(pageIm)
        return images
    finally:
        # Always remove the scratch directory, even if loading a page failed.
        shutil.rmtree(outputDir)


def natural_sort(l):  # this is taken from stack overflow.
    """
    Sort strings with numeric values in natural ascending order,
    such that it does not go 1,11,2 etc.

    :param l: iterable of strings.
    :return: new sorted list; comparison is case-insensitive.
    """
    def alphanum_key(key):
        # re.split with a capturing group alternates text / digit-run chunks,
        # so odd positions are exactly the ASCII digit runs. Deciding by
        # position (instead of str.isdigit) avoids calling int() on
        # characters such as superscript digits that int() cannot parse.
        return [int(chunk) if index % 2 else chunk.lower()
                for index, chunk in enumerate(re.split('([0-9]+)', key))]
    return sorted(l, key=alphanum_key)


# /README.md:
# # PDFigCapX
# Website:
# https://www.eecis.udel.edu/~compbio/PDFigCapX
# Our paper at:
# https://doi.org/10.1093/bioinformatics/btz228
#
# ## Codes
# **Main function:**
# Command: python /code/FigCap.py
# Inputs:
# *input_path*: The folder contains PDF files that need to be parsed.
# *output_path*: The folder where parsing results will be saved.
# Outputs:
# For each document in the *input_path*, the main function will generate a corresponding folder with the same name as the original document in the *output_path*.
All extracted figures (in jpg format), captions (in text format) and their coordinate information (in json format) will be saved in the corresponding folder. 15 | 16 | **Explanations of submodules called by FigCap.py** 17 | **Submodule 1:** PDF parsing using xpdf (Lines 51-57 in FigCap.py) 18 | Inputs: 19 | *Xpdf_path*: The path to the xpdf code. In our implementation, the path is set to /usa/pengyuan/Documents/RESEARCH/xpdf-tools-linux-4.00/bin/pdftohtml. This code is also available online at: https://www.xpdfreader.com/pdftohtml-man.html. 20 | *pdf_path*: The document that needs to be parsed. 21 | *html_file_path*: The folder that contains all parsing results obtained by using xpdf. 22 | Outputs: 23 | A folder with the same name as the PDF document will be created in the *html_file_path*. For each page in the document, an HTML file containing the textual information and a text-stripped image will be saved in the newly created folder. 24 | 25 | **Submodule 2:** Obtain layout information (/code/pdf_info.py) 26 | Inputs: 27 | *pdf_path*: The path to the PDF file. 28 | *html_file_path*: The path to the folder generated by xpdf in Submodule 1. 29 | Outputs: 30 | A json file containing all layout information and textual information is saved in *html_file_path* and this information will be passed to Submodule 3. 31 | 32 | **Submodule 3:** Disambiguation of figures and captions (/code/xpdf_process.py) 33 | Inputs: 34 | *pdf_path*: The path to the PDF file. 35 | *html_file_path*: The path to the folder generated by xpdf in Submodule 1. 36 | Outputs: 37 | A python dictionary that contains the coordinate information of all identified figures and captions. 38 | 39 | **Submodule 4:** Extract figures and captions as files (Lines 78-120 in FigCap.py) 40 | Inputs: 41 | *data[pdf]*: The python dictionary that contains the coordinates of all identified figures and captions. The variable *pdf* is the ID of the PDF file.
# (/README.md, continued)
# *Output_path_file*: The path that figures and captions will be saved.
# Outputs:
# Figures (in jpg format), captions (in text format) and their coordinate information (in a json format) will be saved in the *output_path_folder* folder.
#
# ## Datasets
# ### GXD-200 dataset:
# Dataset path: /datasets/GXD200
# Description: The GXD-200 dataset contains 200 documents selected from a collection curated by Jackson Lab's Gene Expression Database. There are 1335 figures, 1298 figure-associated captions, and 1298 figure and caption pairs in this dataset.
# Ground-truth annotations: /datasets/GXD200/GT_GXD200.json
#
# ### PMC-200 dataset:
# Dataset path: /datasets/PMC200
# Description: The PMC-200 dataset contains 200 biomedical documents selected from the PubMed Central (PMC) Open Access Subset (2018). There are 1042 figures, 1032 captions, and 1032 figure and caption pairs in this dataset.
# Ground-truth annotations: /datasets/PMC200/GT_PMC200.json
#
# /code/FigCap.py:
"""
main code page
structure (xpdf_process):
1. Read pdfs from input folder
2. Figure and caption pair detection
    2.1. graphical content detection
    2.2 page segmentation
    2.3 figure detection
    2.4 caption association

3. Mess up pdf processing


Written by Pengyuan Li

Start from 19/10/2017
1.0 version 28/02/2018

Usage: python FigCap.py [input_path [output_path]]
Both paths fall back to the original hard-coded sample folder.
"""

import json
import os
import subprocess
import sys
import time
from pprint import pprint

import matplotlib.patches as patches
import matplotlib.pyplot as plt

import renderer
from xpdf_process import figures_captions_list

DEFAULT_DATA_PATH = '/eecis/shatkay/homes/pengyuan/Documents/RESEARCH/PDFigCapX/code/sample_data_for_Juan'
PDFTOHTML_PATH = '/usa/pengyuan/Documents/RESEARCH/PDFigCapX/xpdf-tools-linux-4.00/bin64/pdftohtml'
# Resolution handed to renderer.render_pdf. 72 mirrors the value left in a
# comment in renderer.py. NOTE(review): confirm the intended resolution.
RENDER_DPI = 72
MAX_ATTEMPTS = 5   # how often figure detection is retried per PDF
RETRY_DELAY = 5    # seconds to wait after a failed attempt


def _run_pdftohtml(pdf_file, html_dir):
    """Run xpdf's pdftohtml unless html_dir already exists; return True on success."""
    if os.path.isdir(html_dir):
        return True
    try:
        subprocess.check_output([PDFTOHTML_PATH, pdf_file, html_dir + '/'])
    except (subprocess.CalledProcessError, OSError):
        return False
    return True


def _detect_figures(input_path, pdf, xpdf_path):
    """Call figures_captions_list with retries; return (figures, info).

    When every attempt fails, an empty figure mapping and an info dict with
    fig_no_est == 0 are returned. The fallback is built fresh, so a failure
    can neither hit an unbound name nor alter the result of an earlier PDF.
    """
    for _ in range(MAX_ATTEMPTS):
        try:
            # NOTE(review): input_path is passed without a trailing separator,
            # exactly as before; verify how figures_captions_list joins it.
            return figures_captions_list(input_path, pdf, xpdf_path)
        except Exception:
            time.sleep(RETRY_DELAY)
            print(pdf)
            print("------\nChrome Error\n----------\n")
    return {}, {'fig_no_est': 0}


def _export_figures(figures, info, images, output_file_path):
    """Write cropped figures (.jpg) and captions (.txt); return the figure records."""
    records = []
    for figure in figures:
        # Keys look like 'page<N>.png'; strip prefix and extension to get N.
        page_no = int(figure[:-4][4:])
        page_fig = images[page_no - 1]
        # Scale factor between html layout coordinates and the rendered page.
        png_ratio = float(page_fig.size[1]) / info['page_height']
        for order_no, bbox in enumerate(figures[figure], 1):
            print(png_ratio)
            region, caption = bbox[0], bbox[1]
            stem = output_file_path + '/' + str(page_no) + '_' + str(order_no)
            record = {'page': page_no,
                      'region_bb': region,
                      'figure_type': 'Figure',
                      'page_width': info['page_width'],
                      'page_height': info['page_height'],
                      'caption_bb': [],
                      'caption_text': []}
            if len(caption) > 0:
                record['caption_bb'] = caption[0]
                record['caption_text'] = caption[1]
                with open(stem + '.txt', 'w') as capoutput:
                    capoutput.write(str(caption[1]))
            records.append(record)
            # region is [x, y, width, height]; crop() wants [left, top, right, bottom].
            fig_extracted = page_fig.crop([int(region[0] * png_ratio),
                                           int(region[1] * png_ratio),
                                           int((region[0] + region[2]) * png_ratio),
                                           int((region[1] + region[3]) * png_ratio)])
            fig_extracted.save(stem + '.jpg')
    return records


def main(input_path, output_path):
    """Extract figures and captions from every PDF found in input_path."""
    xpdf_path = output_path + '/xpdf/'
    log_file = output_path + '/log.text'
    if not os.path.isdir(xpdf_path):
        os.mkdir(xpdf_path)
    with open(log_file, 'w') as f_log:
        # Read each file in the input path
        for pdf in os.listdir(input_path):
            if not pdf.endswith('.pdf') or pdf.startswith('._'):
                continue
            pdf_file = input_path + '/' + pdf
            print(pdf_file)
            data = {pdf: {'figures': [], 'pages_annotated': []}}

            if not _run_pdftohtml(pdf_file, xpdf_path + pdf[:-4]):
                print("\nWrong " + pdf + "\n")
                f_log.write(pdf + '\n')
                continue

            images = renderer.render_pdf(pdf_file, RENDER_DPI)
            figures, info = _detect_figures(input_path, pdf, xpdf_path)
            data[pdf]['fig_no'] = info['fig_no_est']

            output_file_path = output_path + '/' + pdf[:-4]
            if not os.path.isdir(output_file_path):
                os.mkdir(output_file_path)

            data[pdf]['figures'] = _export_figures(figures, info, images, output_file_path)

            pprint(data)
            json_file = output_file_path + '/' + pdf[:-4] + '.json'
            with open(json_file, 'w') as outfile:
                json.dump(data, outfile)


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_DATA_PATH,
         sys.argv[2] if len(sys.argv) > 2 else DEFAULT_DATA_PATH)


# /LICENSE:
# Apache License
# Version 2.0, January 2004
# http://www.apache.org/licenses/
#
# TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
#
# 1. Definitions.
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License.
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
# (/LICENSE, continued)
# We also recommend that a
# file or class name and description of purpose be included on the
# same "printed page" as the copyright notice for easier
# identification within third-party archives.
#
# Copyright [yyyy] [name of copyright owner]
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# /code/pdf_info.py:
'''
pdf_info is to get the basic information from pdfs
info={
filename, height, width, page_no, figure_est_no, layout_bbox, text_mask
}
'''
import collections
from selenium import webdriver
from multiprocessing import Pool, TimeoutError
import time
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import sys
import cv2

CHROMEDRIVER_PATH = '/usa/pengyuan/Documents/RESEARCH/PDFigCapX/chromedriver/chromedriver'


def _read_page(browser, html_file):
    """Load one xpdf-generated html page in browser and return its content.

    Returns (page number, text boxes, image size) where each text box is
    [[x, y, width, height], text] and image size is (height, width) of the
    page's <img> element. Empty text elements are skipped.
    """
    browser.get(html_file)
    page_layout = browser.find_element_by_xpath("/html/body/img")
    img_size = (page_layout.size['height'], page_layout.size['width'])
    text_boxes = []
    for element in browser.find_elements_by_xpath("/html/body/div"):
        text = element.text
        if len(text) > 0:
            text_boxes.append([[element.location['x'], element.location['y'],
                                element.size['width'], element.size['height']], text])
    # File names look like 'page<N>.html'; strip prefix and extension.
    return int(os.path.basename(html_file)[4:-5]), text_boxes, img_size


def _by_frequency(values):
    """Return (value, count) pairs of values, most frequent first."""
    counts = collections.Counter(values)
    # Iterating set(values) and using a stable sort keeps the tie order of
    # the previous list.count based implementation, without its O(n^2) cost.
    return sorted([(v, counts[v]) for v in set(values)],
                  key=lambda x: x[1], reverse=True)


# Column width, middle gap, Maximum Figure number will be helpful
def pdf_info(html_file_path, pdf):
    """Collect basic layout information of a PDF from its xpdf html output.

    :param html_file_path: folder holding the 'page<N>.png' / 'page<N>.html'
        files that pdftohtml produced for this document.
    :param pdf: file name of the PDF (used for 'filename' and to name the
        json cache '<name>.json' inside html_file_path).
    :return: (info, html_info). info holds page size, dominant row
        height/width, column estimate and the margin boxes; html_info is the
        per-page list [page number, text boxes, image size], read from the
        json cache when present and otherwise scraped with Chrome and cached.
    :raises ValueError: if the sampled pages contain no text element longer
        than 30 characters, so no layout can be estimated.
    """
    info = {}
    # obtain file name
    info['filename'] = pdf
    # obtain page no
    for_counting = sorted(page for page in os.listdir(html_file_path)
                          if page.endswith('.png') and page.startswith('page'))
    page_no = len(for_counting)
    info['page_no'] = page_no

    # Obtain all html information
    html_info_json = html_file_path + '/' + pdf[:-4] + '.json'
    if os.path.isfile(html_info_json):
        with open(html_info_json) as json_data:
            html_info = json.load(json_data)
    else:
        html_info = []
        browser = webdriver.Chrome(CHROMEDRIVER_PATH)
        try:
            for page in for_counting:
                html_file = 'file://' + html_file_path + '/' + page[:-4] + '.html'
                html_info.append(list(_read_page(browser, html_file)))
        finally:
            # Always shut Chrome down, even when a page fails to load.
            browser.quit()
        with open(html_info_json, 'w') as outfile:
            json.dump(html_info, outfile)

    # obtain text layout
    row_width = []
    row_height = []
    column_no = 1
    columns = [0]
    left_point = []
    top_point = []
    right_point = []

    # For longer documents the first and the last page are left out of the
    # layout statistics.
    if page_no > 3:
        list_to_check = range(2, page_no)
    else:
        list_to_check = range(1, page_no + 1)
    for each_page_html in html_info:
        if each_page_html[0] in list_to_check:
            # Obtain page canvas region
            info['page_height'] = each_page_html[2][0]
            info['page_width'] = each_page_html[2][1]
            for element in each_page_html[1]:
                if len(element[1]) > 30:
                    row_width.append(element[0][2])
                    row_height.append(element[0][3])
                    left_point.append(element[0][0])
                    right_point.append(element[0][0] + element[0][2])
                    top_point.append(element[0][1])

    if not left_point:
        raise ValueError('no text rows longer than 30 characters found for '
                         + pdf + '; cannot estimate the page layout')

    point_left = _by_frequency(left_point)
    info['row_height'] = _by_frequency(row_height)[0][0]
    info['row_width'] = _by_frequency(row_width)[0][0]
    info['text_layout'] = (max(0, min(top_point)),
                           min(info['page_height'], max(top_point)))

    # Compute column no and position for each column: left edges that lie
    # within 10 units of each other are merged into one candidate.
    i = 0
    while i < len(point_left):
        j = i + 1
        while j < len(point_left):
            if abs(point_left[i][0] - point_left[j][0]) <= 10:
                point_left[i] = (point_left[i][0],
                                 point_left[i][1] + point_left[j][1])
                del point_left[j]
            else:
                j = j + 1
        i = i + 1
    point_left = sorted(point_left, key=lambda x: x[1], reverse=True)

    if float(point_left[0][1]) / len(left_point) > 0.75 \
            or float(info['row_width']) / info['page_width'] > 0.5:
        column_no = 1
        columns = [point_left[0][0]]
    else:
        column_no = 2
        # The second column starts at the most frequent left edge that is
        # more than one row width away from the dominant one.
        for i in range(1, len(point_left)):
            if abs(point_left[i][0] - point_left[0][0]) > info['row_width']:
                columns = [min(point_left[i][0], point_left[0][0]),
                           max(point_left[i][0], point_left[0][0])]
                break

    info['column_no'] = column_no
    info['columns'] = columns

    left_bar = min(left_point)
    right_bar = max(right_point)
    row_h = info['row_height']
    page_w = info['page_width']
    page_h = info['page_height']
    # pdf layout: margin boxes, each given as [x, y, width, height]
    if left_bar > 0 and left_bar < 20 * row_h:
        info['left_bbox'] = [0, 0, left_bar, page_h]
        right_edge = min(page_w - 2 * row_h, right_bar)
        info['right_bbox'] = [right_edge, 0, page_w - right_edge, page_h]
        text_top, text_bottom = info['text_layout']
        if text_top < 15 * row_h and text_bottom > 15 * row_h:
            info['top_bbox'] = [0, 0, page_w, text_top]
            info['down_bbox'] = [0, text_bottom, page_w, page_h - text_bottom]
        else:
            info['top_bbox'] = [0, 0, page_w, row_h]
            info['down_bbox'] = [0, page_h - row_h, page_w, row_h]
    else:
        info['left_bbox'] = [0, 0, row_h, page_h]
        info['right_bbox'] = [page_w - row_h, 0, row_h, page_h]
        info['top_bbox'] = [0, 0, page_w, row_h]
        info['down_bbox'] = [0, page_h - row_h, page_w, row_h]

    info['mess_up'] = False
    info['graph_layout'] = info['text_layout']

    return info, html_info


def read_each_html(x):
    """Scrape a single html page with its own Chrome instance.

    :param x: 'file://' URL of a 'page<N>.html' file.
    :return: (page number, text boxes, image size), see _read_page.
    """
    browser = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        return _read_page(browser, x)
    finally:
        browser.quit()


# /code/xpdf_process.py:
'''
The main code for figure and caption extraction (figures_captions_list)
1. Read pdfs from input folder (pdf_info)
2. Figure and caption pair detection

    2.1. graphical content detection
    2.2 page segmentation
    2.3 figure detetion
    2.4 caption association

3. Mess up pdf processing

'''

import subprocess
import os
import numpy as np
import matplotlib.pyplot as plt
import sys
import cv2
import codecs
import matplotlib.patches as patches
import scipy.misc
import re
from lxml import etree
from selenium import webdriver
from pdf_info import pdf_info



def figures_captions_list(input_path, pdf, output_path):
    # input: single pdf file
    # output: bounding box list of figures and captions
    pdf_filename = input_path + pdf
    html_file_path = output_path + pdf[:-4]
    # 1.
Read pdfs from input folder (pdf_info) 37 | info, html_boxes = pdf_info(html_file_path, pdf) 38 | # 2.1. graphical content detection 39 | cap_box, fig_box, info, table_box, text_box = box_detection(html_file_path, info, html_boxes) 40 | pre_figures, cap_regions = fig_cap_matching(cap_box, fig_box, info, table_box, text_box) 41 | figures, captions = evaluation(pre_figures, cap_regions, html_file_path, info, html_boxes) # Remove figure_table and figure caption in one box 42 | figures, captions = check_region(info, figures, captions) 43 | no_of_figures = sum([len(figures[x]) for x in figures]) 44 | no_of_caps = sum([len(cap_box[x]) for x in cap_box]) 45 | no_of_figs = sum([len(fig_box[x]) for x in fig_box]) 46 | #print info['filename'] 47 | #print info['mess_up'] 48 | #print info['fig_no_est'] 49 | 50 | # 51 | # print no_of_figures 52 | # if no_of_figures == no_of_caps: 53 | # figures, cap_regions = same_no_caps_est(cap_box, fig_box, info, table_box, text_box) 54 | # 55 | r = info['png_ratio'] 56 | # plt.close("all") 57 | # for i in range(info['page_no']): 58 | # page = 'page' + str(i + 1) + '.png' 59 | # img = cv2.imread(html_file_path + '/' + page) 60 | # fig, ax = plt.subplots(1) 61 | # ax.imshow(img) 62 | # for each_caption in cap_box[page]: 63 | # rect = patches.Rectangle((each_caption[0]*r, each_caption[1]*r), each_caption[2]*r, each_caption[3]*r, 64 | # linewidth=1, edgecolor='g', 65 | # facecolor='none') 66 | # ax.add_patch(rect) 67 | # 68 | # for each_fig in fig_box[page]: 69 | # #each_fig = each_fig[0] 70 | # rect = patches.Rectangle((each_fig[0]*r, each_fig[1]*r), each_fig[2]*r, each_fig[3]*r, 71 | # linewidth=2, edgecolor='b', 72 | # facecolor='none') 73 | # ax.add_patch(rect) 74 | # for each_cap_region in cap_regions[page]: 75 | # rect = patches.Rectangle((each_cap_region[1][0]*r, each_cap_region[1][1]*r), each_cap_region[1][2]*r, each_cap_region[1][3]*r, 76 | # linewidth=1, edgecolor='y', 77 | # facecolor='none') 78 | # ax.add_patch(rect) 79 | # for 
each_result in figures[page]: 80 | # each_result = each_result[0] 81 | # rect = patches.Rectangle((each_result[0]*r, each_result[1]*r), each_result[2]*r, each_result[3]*r, 82 | # linewidth=1, edgecolor='r', 83 | # facecolor='none') 84 | # ax.add_patch(rect) 85 | # plt.show() 86 | return figures, info 87 | 88 | 89 | 90 | def box_detection(html_file_path, info, html_boxes): 91 | fig_box = {} 92 | cap_box = {} 93 | word_box = {} 94 | cap_no_clue = [] 95 | table_box={} 96 | #browser = webdriver.Chrome('/home/pengyuan/Documents/FC_extraction/chromedriver') 97 | 98 | for page in sorted(os.listdir(html_file_path)): 99 | if page.endswith('.png') and page.startswith('page'): 100 | 101 | page_no = int(page[4:-4]) 102 | img = cv2.imread(html_file_path + '/' + page) 103 | # plt.imshow(img) 104 | png_size = img.shape 105 | if png_size[0] > png_size[1]: 106 | png_ratio = float(png_size[0]) / info['page_height'] 107 | else: 108 | png_ratio = float(png_size[0]) / info['page_width'] 109 | 110 | # Read each page html find "Fig" 111 | # f = codecs.open(html_file_path + '/' + page[:-4] + '.html', 'r') 112 | # text = f.readline() 113 | # html_file = 'file://' + html_file_path + '/' + page[:-4] + '.html' 114 | # browser.get(html_file) 115 | 116 | text = '' 117 | text_box = [] 118 | page_word_box = [] 119 | table_cap_box = [] 120 | div_no = 1 121 | for page_html in html_boxes: 122 | if page_html[0] == page_no: 123 | text_elements = page_html[1] 124 | 125 | for e in text_elements: 126 | text = e[1] 127 | #if e.size['width'] > info['row_width']-100: 128 | page_word_box.append([max(e[0][0]-info['row_height'], 0) 129 | , e[0][1] 130 | , e[0][2]+2*info['row_height'], 131 | e[0][3]]) 132 | if text.startswith('Table') or text.startswith('table') or text.startswith('Box'): 133 | table_cap_box.append([e[0][0], e[0][1], e[0][2], e[0][3]]) 134 | if text.startswith('Fig') or text.startswith('fig') or text.startswith('FIG'): 135 | #print text 136 | text_box.append([e[0][0], e[0][1], e[0][2], 
e[0][3]]) 137 | cap_no_clue.append(text) 138 | elif 'Fig' not in text and len(text) > 6: 139 | text = text[:6] 140 | idx1 = text.find('F') 141 | idx2 = text.find('i') 142 | idx3 = text.find('g') 143 | if idx1>= 0 and idx2>=0 and idx3>= 0 and idx2>idx1 and idx3>idx2: 144 | #print text 145 | text_box.append([e[0][0], e[0][1], e[0][2], e[0][3]]) 146 | # rect = patches.Rectangle((e.location['x'] * png_ratio, e.location['y'] * png_ratio), 147 | # e.size['width'] * png_ratio, 148 | # e.size['height'] * png_ratio, 149 | # linewidth=1, edgecolor='b', 150 | # facecolor='none') 151 | # ax.add_patch(rect) 152 | 153 | 154 | cap_box[page] = text_box 155 | table_box[page] = table_cap_box 156 | word_box[page] = page_word_box 157 | 158 | imgray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) 159 | ret, thresh = cv2.threshold(imgray, 240, 255, cv2.THRESH_BINARY_INV) 160 | kernel = np.ones((5, 5), np.uint8) 161 | dilation = cv2.dilate(thresh, kernel, iterations=1) 162 | contours, hierarchy = cv2.findContours(dilation,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE) 163 | new_thresh = np.zeros(thresh.shape, dtype=np.uint8) 164 | 165 | for cnt in contours: 166 | bbox = cv2.boundingRect(cnt) 167 | p_bbox = [int(float(x) / png_ratio) for x in bbox] 168 | box_image = 0 169 | for caption_box in text_box: 170 | box_image = box_image + overlap_ratio_based(caption_box, p_bbox) 171 | if box_image < 0.5: 172 | cv2.drawContours(new_thresh, [cnt], 0, 255, -1) 173 | 174 | contours, hierarchy = cv2.findContours(new_thresh, 175 | cv2.RETR_EXTERNAL, 176 | cv2.CHAIN_APPROX_SIMPLE) 177 | 178 | # scipy.misc.imsave('thresh.jpg', thresh) 179 | potential_bbox = [] 180 | # fig, ax = plt.subplots(1) 181 | # ax.imshow(img) 182 | for cnt in contours: 183 | bbox = cv2.boundingRect(cnt) 184 | thresh_for_figure = info['row_height'] * png_ratio*1.5#/ 2 modified on 0318 185 | if bbox[3] > thresh_for_figure and bbox[2] > thresh_for_figure: # Important to set, FIg threshold 186 | 187 | p_bbox = [int(float(x) / png_ratio) for x in 
bbox] 188 | # Format checking, to filter box that at top, down, left or right 189 | ol_left = overlap_ratio_based(p_bbox, info['left_bbox']) 190 | ol_right = overlap_ratio_based(p_bbox, info['right_bbox']) 191 | # Add filter for first page top sign 0110 192 | if page == 'page1.png': 193 | ol_top = overlap_ratio_based(p_bbox, [0, 0, info['page_width'], 194 | info['page_height'] / 4]) # First page box 195 | else: 196 | ol_top = overlap_ratio_based(p_bbox, info['top_bbox']) 197 | 198 | ol_down = overlap_ratio_based(p_bbox, info['down_bbox']) 199 | ol_sum = 0 200 | ol_sum = ol_down + ol_left + ol_right + ol_top 201 | if ol_sum < 0.1: 202 | potential_bbox.append(p_bbox) 203 | # rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2],bbox[3], 204 | # linewidth=1, edgecolor='r', 205 | # facecolor='none') 206 | # ax.add_patch(rect) 207 | 208 | fig_box[page] = potential_bbox 209 | 210 | # To check if the pdf is mess up 211 | if len(potential_bbox) > 1: 212 | obj_heights = np.array(potential_bbox)[:, 3] 213 | no_of_all = len(obj_heights) 214 | no_of_small = len([1 for obj_height in obj_heights if obj_height < 13 and obj_height > 4]) 215 | small_percent = float(no_of_small) / no_of_all 216 | if no_of_all > 300 and small_percent > 0.8: 217 | info['mess_up'] = True 218 | 219 | count = 0 220 | 221 | if info['mess_up'] == False:# Need to set carefully 222 | while count < len(potential_bbox):# ###Need to think about it..... 223 | flag = 0 224 | for each_text_box in page_word_box:# Remove fig box that cross the text box 225 | overlap = overlap_ratio_based(potential_bbox[count], each_text_box) 226 | if overlap > 0.3: 227 | flag = 1 228 | del potential_bbox[count] 229 | break 230 | if flag == 0: 231 | count = count +1 232 | else: 233 | while count < len(potential_bbox):# ###Need to think about it..... 
234 | flag = 0 235 | if potential_bbox[count][3]>12: 236 | for each_text_box in page_word_box:# Remove fig box that cross the text box 237 | overlap = overlap_ratio_based(potential_bbox[count], each_text_box) 238 | if overlap > 0.1: 239 | flag = 1 240 | del potential_bbox[count] 241 | break 242 | if flag == 0: 243 | count = count +1 244 | else: 245 | del potential_bbox[count] 246 | 247 | 248 | fig_box[page] = potential_bbox 249 | 250 | info['fig_no_est'] = fig_no_estimation(cap_no_clue) 251 | info['png_ratio'] = png_ratio 252 | return cap_box, fig_box, info, table_box, page_word_box 253 | 254 | def fig_no_estimation(fig_info): 255 | #print fig_info 256 | fig_no = 0 257 | temp_max = 0 258 | for clue in fig_info: 259 | if re.search(r'\d+', clue) is not None: 260 | temp_max = max(int(re.search(r'\d+', clue).group()), temp_max) 261 | fig_no = temp_max 262 | #print fig_no 263 | return fig_no 264 | 265 | def fig_cap_matching(cap_box, fig_box, info, table_box, text_box): 266 | # cap_box 267 | # fig_box 268 | # info 269 | figures = {} 270 | captions = {} 271 | fig_size_thresh = 30 272 | for i in range(info['page_no']): 273 | page = 'page' + str(i+1) + '.png' 274 | table_caps = table_box[page] 275 | 276 | p_captions = cap_box[page] 277 | p_figures = fig_box[page] 278 | for table_cap in table_caps:# To remove the table 279 | table_cap_box = [table_cap[0], table_cap[1]+table_cap[3], table_cap[2],4*info['row_height']]# Remove the table below 280 | p_figure_id = 0 281 | while p_figure_id < len(p_figures): 282 | p_figure = p_figures[p_figure_id] 283 | 284 | overlap = overlap_ratio_based(table_cap_box, p_figure) 285 | if overlap >0.1: 286 | del p_figures[p_figure_id] 287 | else: 288 | p_figure_id = p_figure_id + 1 289 | table_cap_box = [table_cap[0], table_cap[1]-4 * info['row_height'], table_cap[2], 290 | 4 * info['row_height']] # Remove the table below 291 | p_figure_id = 0 292 | while p_figure_id < len(p_figures): 293 | p_figure = p_figures[p_figure_id] 294 | 295 | overlap = 
overlap_ratio_based(table_cap_box, p_figure) 296 | if overlap > 0.1: 297 | del p_figures[p_figure_id] 298 | else: 299 | p_figure_id = p_figure_id + 1 300 | 301 | 302 | if len(p_figures) > 0 : 303 | if len(p_figures) ==1 and len(p_captions) ==1: 304 | if p_figures[0][2] > fig_size_thresh and p_figures[0][3] > fig_size_thresh:# size 305 | if bbox_distance(p_figures[0], p_captions[0]) < 50: # distance 306 | figures[page] = [[p_figures[0], p_captions[0]]] 307 | captions[page] = [[p_captions[0], [1, 1, info['page_width']-2, info['page_height']-2]]] 308 | if page not in figures.keys(): 309 | cap_regions = caption_regions(p_captions, p_figures, info) 310 | captions[page] = cap_regions 311 | figures[page] = label_subfig(info, p_figures, cap_regions, table_box) 312 | 313 | else: 314 | # sort captions by horizontal 315 | cap_regions = caption_regions(p_captions, p_figures, info) 316 | 317 | # Calculate the overlap of figures and cpations, the figures 318 | # belong to the same caption should have same label 319 | #print cap_regions 320 | # For the figures have the same label, compute their bounding 321 | # box 322 | captions[page] = cap_regions 323 | figures[page] = label_subfig(info, p_figures, cap_regions, table_box) 324 | if len(p_captions) == 0:# No caption situation 325 | sum_area = 0 326 | for p_object in p_figures: 327 | sum_area = sum_area + p_object[2] * p_object[3] 328 | 329 | page_width =info['page_width']-info['left_bbox'][2]-info['right_bbox'][2] 330 | page_height = info['page_height']-info['top_bbox'][3]-info['down_bbox'][3] 331 | if float(sum_area)/(page_width*page_height) > 0.2 and i>1: 332 | captions[page] = [[info['down_bbox'], [1, 1, info['page_width']-2, info['page_height']-2]]] 333 | figures[page] = label_subfig(info, p_figures, captions[page], table_box) 334 | else: 335 | captions[page] = [] 336 | figures[page] = [] 337 | 338 | return figures, captions 339 | 340 | def same_no_caps_est(cap_box, fig_box, info, table_box, text_box): 341 | 342 | cap_regions 
= {} 343 | figures = {} 344 | for page in cap_box: 345 | cap_regions[page] = [] 346 | if len(cap_box[page]) == 1: 347 | cap_regions[page].append([cap_box[page][0], [0, 0, info['page_width'], info['page_height']]]) 348 | if len(cap_box[page]) > 1: 349 | p_figures = fig_box[page] 350 | p_captions = cap_box[page] 351 | cap_regions[page] = caption_regions(p_captions, p_figures, info) 352 | # Calculate the overlap of figures and cpations, the figures 353 | # belong to the same caption should have same label 354 | #print cap_regions 355 | # For the figures have the same label, compute their bounding 356 | # box 357 | for page in cap_regions: 358 | p_figures = fig_box[page] 359 | p_cap_regions = cap_regions[page] 360 | figures[page] = label_subfig(info, p_figures, p_cap_regions, table_box) 361 | 362 | return figures, cap_regions 363 | 364 | def caption_regions(cap_box, fig_box, info): 365 | # sort captions by horizontal 366 | #print cap_box 367 | #whole_page = [1, 1, info['page_width'], info['page_height']] 368 | column_no = info['column_no'] 369 | columns = info['columns'] 370 | columns_point = [1] * column_no 371 | cap_regions = [] 372 | if len(cap_box) == 1: 373 | cap_regions.append([cap_box[0], [1, 1, info['page_width'] - 2*info['row_height'],info['page_height'] - 2*info['row_height']]]) 374 | # comment on 0318 for gxd 375 | ''' 376 | if column_no == 1: 377 | cap_regions.append([cap_box[0], [1, 1, info['page_width']-2, cap_box[0][1]]]) 378 | cap_regions.append([cap_box[0], [1, cap_box[0][1]+2*info['row_height'], info['page_width']-2, info['page_height']-cap_box[0][1]-3*info['row_height']]]) 379 | else: 380 | if cap_box[0][2] > info['row_width'] + 50 or (cap_box[0][0] < info['page_width'] / 2 and 381 | (cap_box[0][0] + cap_box[0][2]) > info['page_width'] / 2): 382 | cap_regions.append([cap_box[0], [1, 1, info['page_width'] - 2, cap_box[0][1]]]) 383 | cap_regions.append([cap_box[0], [1, cap_box[0][1] + 2 * info['row_height'], info['page_width'] - 2, 384 | 
info['page_height'] - cap_box[0][1] - 3 * info['row_height']]]) 385 | else: 386 | if cap_box[0][0]< columns[0] + 100 or cap_box[0][0] < columns[0] + info['row_width'] -100: 387 | cap_regions.append([cap_box[0], [1, 1, columns[0] + info['row_width'], cap_box[0][1]]]) 388 | cap_regions.append([cap_box[0], [1, cap_box[0][1] + 2 * info['row_height'], columns[0] + info['row_width'], 389 | info['page_height'] - cap_box[0][1] - 3 * info['row_height']]]) 390 | else: 391 | cap_regions.append([cap_box[0], [min(cap_box[0][0], columns[0] + info['row_width']+50), 1, columns[0] + info['row_width'], cap_box[0][1]]]) 392 | cap_regions.append([cap_box[0], [min(cap_box[0][0], columns[0] + info['row_width']+50), cap_box[0][1] + 2 * info['row_height'], columns[0] + info['row_width'], 393 | info['page_height'] - cap_box[0][1] - 3 * info['row_height']]]) 394 | ''' 395 | elif len(cap_box) >1: 396 | if column_no ==1: 397 | cap_sorted = sorted(cap_box, key=lambda x: x[1]) 398 | for cap_item in cap_sorted: 399 | region = [1, columns_point[0], info['page_width']-2, cap_item[1] - columns_point[0]] 400 | cap_regions.append([cap_item, region]) 401 | columns_point[0] = cap_item[1]+cap_item[3] 402 | cap_regions.append([cap_item, [1, columns_point[0], info['page_width']-2, info['page_height'] - columns_point[0]]]) 403 | else: 404 | cap_sorted = sorted(cap_box, key=lambda x: (x[1], x[0])) 405 | # caption parallel 406 | for cap_item in cap_sorted: 407 | no_cross_fig = 1 408 | if cap_item[2] > info['row_width']+50 or (cap_item[0] < info['page_width']/2 and 409 | (cap_item[0]+cap_item[2])>info['page_width']/2): 410 | no_cross_fig = 0 411 | region = [1, max(columns_point), info['page_width']-2, cap_item[1] - max(columns_point)] 412 | columns_point = [cap_item[1]+cap_item[3]] * column_no 413 | cap_regions.append([cap_item, region]) 414 | else: 415 | cap_y = cap_item[1] 416 | cap_x = cap_item[0] 417 | # for fig_item in fig_box:# To check if there are fig cross this caption 418 | # if (fig_item[1] < 
cap_y) & (fig_item[1] + fig_item[3] > cap_y): 419 | # no_cross_fig = 1 420 | # for other_cap in cap_sorted:# Caption parallel 421 | # if (abs(other_cap[1] - cap_y) 5* info['row_height']): 422 | # no_cross_fig = 1 423 | # no_cross_fig = 1 424 | # if (cap_item[0] + cap_item[2] > columns[0]+ info['row_width']+100) and (cap_item[0] < columns[0]+ info['row_width'] - 50): 425 | # no_cross_fig = 0 426 | # 427 | # if no_cross_fig == 0: 428 | # region = [1, max(columns_point), info['page_width']-2, cap_y - max(columns_point)] 429 | # columns_point = [cap_item[1] + cap_item[3]] * column_no 430 | # 431 | if no_cross_fig== 1: 432 | if cap_x < columns[0] + 100: 433 | region = [cap_x, columns_point[0], info['row_width'], cap_y - columns_point[0]] 434 | columns_point[0] = cap_y + cap_item[3] 435 | elif cap_x < columns[0] + info['row_width'] -100: 436 | region = [1, columns_point[0], columns[0] + info['row_width'], cap_y - columns_point[0]] 437 | columns_point[0] = cap_y + cap_item[3] 438 | else: 439 | region = [min(cap_x, columns[0] + info['row_width']+50), columns_point[1], info['page_width'] - min(cap_x, columns[0] + info['row_width']+50), cap_y - columns_point[1]] 440 | columns_point[1] = cap_y + cap_item[3] 441 | 442 | cap_regions.append([cap_item, region]) 443 | # Added to cover all area, for image below captions 444 | if no_cross_fig ==0: 445 | region = [1, max(columns_point), info['page_width'] - 2, info['page_height'] - max(columns_point)] 446 | cap_regions.append([cap_item, region]) 447 | else: 448 | cap_regions.append([cap_item, [0, columns_point[0], info['page_width']/2, info['page_height']-columns_point[0]-1]]) 449 | cap_regions.append([cap_item, [info['page_width']/2, columns_point[1], info['page_width'] / 2, 450 | info['page_height'] - columns_point[1] - 1]]) 451 | return cap_regions 452 | 453 | 454 | def label_subfig(info, figures, cap_regions, table_box): 455 | # region overlap 456 | # distance between all objects, thresh in 4 lines 457 | # objects under table box 
458 | label = range(len(cap_regions)) 459 | labeled_figures = {} 460 | fig_merged = [] 461 | for i in range(len(cap_regions)): 462 | labeled_figures[str(i)] = [] 463 | 464 | # Changed order, it may affect 465 | for figure in figures: 466 | for i in range(len(cap_regions)): 467 | overlap = overlap_ratio_based(figure, cap_regions[i][1]) 468 | cover = overlap_ratio_based(cap_regions[i][0], figure)# to check if the caption in in the figure 469 | if overlap > 0.2 and cover < 0.5:# The overlap need to set carefully 470 | labeled_figures[str(i)].append(figure) 471 | 472 | # check distance, to remove far objects 473 | #if cap_regions[i][0][1] < info['down_bbox'][1]: 474 | # cap_box = [cap_regions[i][0]] 475 | # fig_objects = labeled_figures[str(i)] 476 | # for_tr_graph = [0]*len(fig_objects) 477 | # increase = -1 478 | # while increase != 0: 479 | # increase = 0 480 | # for fig_no in range(len(fig_objects)): 481 | # if for_tr_graph[fig_no]==0: 482 | # for cap in cap_box: 483 | # dis = bbox_distance(fig_objects[fig_no], cap) 484 | # if dis < 6 * info['row_height']: 485 | # cap_box.append(fig_objects[fig_no]) 486 | # for_tr_graph[fig_no] = 1 487 | # increase = increase +1 488 | # break 489 | # del cap_box[0] 490 | # labeled_figures[str(i)]= cap_box 491 | 492 | 493 | 494 | for i in range(len(cap_regions)): 495 | if len(labeled_figures[str(i)]) > 0: 496 | if len(labeled_figures[str(i)]) < 2: 497 | if labeled_figures[str(i)][0][2] > 20 and labeled_figures[str(i)][0][2] > 20:# Fig Thresh 498 | fig_merged.append([labeled_figures[str(i)][0], cap_regions[i][0]]) 499 | else: 500 | x0 = [] 501 | x1 = [] 502 | y0 = [] 503 | y1 = [] 504 | sum_figure_area = 0 505 | for each_figure in labeled_figures[str(i)]: 506 | x0.append(each_figure[0]) 507 | y0.append(each_figure[1]) 508 | x1.append(each_figure[0] + each_figure[2]) 509 | y1.append(each_figure[1] + each_figure[3]) 510 | sum_figure_area = each_figure[2] * each_figure[3] + sum_figure_area 511 | new_fig = [min(x0), min(y0), 
max(x1)-min(x0), 512 | max(y1)-min(y0)] 513 | # if new_fig[2] > 2*info['row_height'] and new_fig[3] > 2*info['row_height']: 514 | # fig_merged.append(new_fig) 515 | 516 | if new_fig[2] > 20 and new_fig[3] > 20:# Fig Threshold 517 | #Check overlap ratio 518 | overlap_fig = float(sum_figure_area)/(new_fig[2]*new_fig[3]) 519 | if overlap_fig > 0.1: 520 | fig_merged.append([new_fig, cap_regions[i][0]]) 521 | # fileter small one 522 | 523 | return fig_merged 524 | 525 | def evaluation(prefigures, cap_regions, html_file_path, info, html_boxes): 526 | 527 | fig_cap_pair = prefigures 528 | figures = {} 529 | captions = {} 530 | for page in fig_cap_pair: 531 | figures[page] = [] 532 | captions[page] = [] 533 | for each_figcap in fig_cap_pair[page]: 534 | new_fig = each_figcap[0] 535 | caption_flag = overlap_ratio_based(info['down_bbox'], each_figcap[1]) 536 | if caption_flag > 0.8: 537 | figcap = each_figcap[0] 538 | if info['mess_up'] == False: 539 | for each_page_html in html_boxes: 540 | if each_page_html[0] == int(page[4:-4]): 541 | for element in each_page_html[1]: 542 | in_or_not = overlap_ratio_based(element[0], figcap) 543 | if in_or_not > 0.05: 544 | new_fig = merge_two_boxes(new_fig, element[0]) 545 | 546 | for element in each_page_html[1]: 547 | in_or_not = bbox_distance(element[0], each_figcap[0]) 548 | if in_or_not < info['row_height']/4: 549 | new_fig = merge_two_boxes(new_fig, element[0]) 550 | figures[page].append([new_fig, []]) 551 | captions[page].append([]) 552 | else: 553 | figures[page].append([each_figcap[0], []]) 554 | captions[page].append([]) 555 | else: 556 | 557 | x0 = min(each_figcap[0][0],each_figcap[1][0]) 558 | y0 = min(each_figcap[0][1], each_figcap[1][1]) 559 | x1 = max(each_figcap[0][0]+each_figcap[0][2], each_figcap[1][0]+each_figcap[1][2]) 560 | y1 = max(each_figcap[0][1] + each_figcap[0][3], each_figcap[1][1]) 561 | figcap = [x0, y0, x1-x0, y1-y0] 562 | cap_box = each_figcap[1] 563 | 564 | 565 | #print fig_cap_pair[page] 566 | if 
info['mess_up'] == False: 567 | for each_page_html in html_boxes: 568 | if each_page_html[0] == int(page[4:-4]): 569 | 570 | for element in each_page_html[1]: 571 | in_or_not = overlap_ratio_based(element[0], figcap) 572 | if in_or_not > 0.05: 573 | new_fig = merge_two_boxes(new_fig,element[0]) 574 | 575 | for element in each_page_html[1]: 576 | in_or_not = bbox_distance(element[0], each_figcap[0]) 577 | if in_or_not < info['row_height']/4: 578 | new_fig = merge_two_boxes(new_fig, element[0]) 579 | # for caption detection ~~~~~~~~~~~~~~~~~~~~~~~~ 580 | cap_detection_flag = 0 581 | cap_text = [] 582 | cap_gap = 0.5 * info['row_height']# modify to 0.75 0.5 583 | for element in each_page_html[1]: 584 | if element[0] == cap_box or cap_detection_flag == 1: 585 | if element[0] == cap_box: 586 | cap_detection_flag = 1 587 | cap_text.append(element[1]) 588 | first_line_box = cap_box 589 | moving_box = cap_box 590 | else: 591 | cap_gap = max(min(element[0][1]-first_line_box[1]-first_line_box[3], cap_gap), 3) 592 | current_gap = element[0][1]-moving_box[1]-moving_box[3] 593 | #print current_gap 594 | #print moving_box 595 | #print element[0] 596 | if current_gap>=max(0.5 * info['row_height'],cap_gap):# 0.75*info['row_height'] 597 | cap_detection_flag = 0 598 | elif (element[0][2]-first_line_box[2]>5*info['row_height'] or element[0][3] - first_line_box[3]>1) and current_gap-cap_gap>3: 599 | cap_detection_flag = 0 600 | 601 | if abs(first_line_box[0]-element[0][0])>10*info['row_height'] and cap_detection_flag == 0: 602 | cap_detection_flag = 1 603 | elif abs(first_line_box[0]-element[0][0])>10*info['row_height'] and cap_detection_flag == 1: 604 | cap_detection_flag = 1 605 | 606 | elif cap_detection_flag == 1: 607 | moving_box = element[0] 608 | cap_box = merge_two_boxes(cap_box, element[0]) 609 | cap_text.append(element[1]) 610 | 611 | # To determine where to stop 612 | # Finding separate captions 613 | if len(cap_text)==1 and (cap_text[0][-1].isdigit() or 
cap_text[-1][-1].isdigit()) and len(cap_text[0])<15: 614 | cap_detection_flag = 0 615 | cap_text_cp = cap_text 616 | cap_box_cp = cap_box 617 | cap_text = [] 618 | cap_gap = 0.5 * info['row_height'] # modify to 0.75 0.5 619 | next = 0 620 | for element in each_page_html[1]: 621 | if element[0] == cap_box or cap_detection_flag == 1: 622 | if next == 0: 623 | if element[0][1]>cap_box[1] and len(element[1])>30: 624 | next = 1 625 | cap_detection_flag = 1 626 | cap_text.append(element[1]) 627 | first_line_box = element[0] 628 | moving_box = element[0] 629 | cap_box = element[0] 630 | else: 631 | cap_detection_flag = 1 632 | 633 | else: 634 | cap_gap = max( 635 | min(element[0][1] - first_line_box[1] - first_line_box[3], cap_gap), 3) 636 | current_gap = element[0][1] - moving_box[1] - moving_box[3] 637 | #print current_gap 638 | #print moving_box 639 | #print element[0] 640 | if current_gap >= max(0.5 * info['row_height'],cap_gap): # 0.75*info['row_height'] 641 | cap_detection_flag = 0 642 | elif (element[0][2] - first_line_box[2] > 5 * info['row_height'] or 643 | element[0][3] - first_line_box[3] > 1) and current_gap - cap_gap > 3: 644 | cap_detection_flag = 0 645 | 646 | if abs(first_line_box[0] - element[0][0]) > 10 * info[ 647 | 'row_height'] and cap_detection_flag == 0: 648 | cap_detection_flag = 1 649 | elif abs(first_line_box[0] - element[0][0]) > 10 * info[ 650 | 'row_height'] and cap_detection_flag == 1: 651 | cap_detection_flag = 1 652 | 653 | elif cap_detection_flag == 1: 654 | moving_box = element[0] 655 | cap_box = merge_two_boxes(cap_box, element[0]) 656 | cap_text.append(element[1]) 657 | 658 | distance_before = bbox_distance(new_fig, cap_box_cp) 659 | distance_now = bbox_distance(new_fig, cap_box) 660 | # if distance_now > 2*distance_before + 2*cap_box_cp[3]: No distance control is better 661 | # cap_box = cap_box_cp 662 | # cap_text = cap_text_cp 663 | 664 | figures[page].append([new_fig, [cap_box, cap_text]]) 665 | 666 | captions[page].append([cap_box, 
cap_text]) 667 | else: 668 | 669 | for each_page_html in html_boxes: 670 | if each_page_html[0] == int(page[4:-4]): 671 | cap_detection_flag = 0 672 | cap_text = [] 673 | cap_gap = info['row_height'] 674 | for element in each_page_html[1]: 675 | if element[0] == cap_box or cap_detection_flag == 1: 676 | if element[0] == cap_box: 677 | cap_detection_flag = 1 678 | cap_text.append(element[1]) 679 | first_line_box = cap_box 680 | moving_box = cap_box 681 | else: 682 | cap_gap = max(min(element[0][1]-first_line_box[1]-first_line_box[3], cap_gap), 3) 683 | current_gap = element[0][1]-moving_box[1]-moving_box[3] 684 | #print current_gap 685 | #print moving_box 686 | #print element[0] 687 | if current_gap>=max(0.5 * info['row_height'],cap_gap):# 0.75*info['row_height'] 688 | cap_detection_flag = 0 689 | elif (element[0][2]-first_line_box[2]>5*info['row_height'] or element[0][3] - first_line_box[3]>1) and current_gap-cap_gap>3: 690 | cap_detection_flag = 0 691 | 692 | if abs(first_line_box[0]-element[0][0])>10*info['row_height'] and cap_detection_flag == 0: 693 | cap_detection_flag = 1 694 | elif abs(first_line_box[0]-element[0][0])>10*info['row_height'] and cap_detection_flag == 1: 695 | cap_detection_flag = 1 696 | 697 | elif cap_detection_flag == 1: 698 | moving_box = element[0] 699 | cap_box = merge_two_boxes(cap_box, element[0]) 700 | cap_text.append(element[1]) 701 | if first_line_box[2]-element[0][2]>5*info['row_height'] and element[1].endswith('.'): 702 | cap_detection_flag = 0 703 | captions[page].append([cap_box, cap_text]) 704 | figures[page].append([each_figcap[0], [cap_box, cap_text]]) 705 | 706 | # 707 | # for page in figures: 708 | # if len(figures[page])>0: 709 | # img = cv2.imread(html_file_path + '/' + page) 710 | # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) 711 | # png_size = img.shape 712 | # if png_size[0] > png_size[1]: 713 | # png_ratio = float(png_size[0]) / info['page_height'] 714 | # else: 715 | # png_ratio = float(png_size[0]) / 
info['page_width'] 716 | # bbox_no = 0 717 | # while bbox_no < len(figures[page]): 718 | # each_bbox = figures[page][bbox_no] 719 | # each_figure = img[int(each_bbox[1]*png_ratio):int((each_bbox[3]+each_bbox[1])*png_ratio), 720 | # int(each_bbox[0] * png_ratio):int((each_bbox[2]+each_bbox[0]) * png_ratio)] 721 | # each_figure = cv2.resize(each_figure, (200, 200)) 722 | # laplacian = cv2.Laplacian(each_figure, cv2.CV_64F) 723 | # sobelx = cv2.Sobel(each_figure, cv2.CV_64F, 1, 0, ksize=5) 724 | # sobely = cv2.Sobel(each_figure, cv2.CV_64F, 0, 1, ksize=5) 725 | # img_complexity = entropy(sobelx) + entropy(sobely) 726 | # print img_complexity 727 | # if img_complexity > 0.5: ##### need to set carefully 728 | # bbox_no = bbox_no + 1 729 | # else: 730 | # del figures[page][bbox_no] 731 | 732 | 733 | return figures, captions 734 | 735 | def check_region(info, figures, captions): 736 | final_figures = figures 737 | final_captions = captions 738 | for page in figures: 739 | for each_figure in figures[page]: 740 | if len(each_figure[1])>0: 741 | caption_overlap_ratio = overlap_ratio_based(each_figure[1][0], each_figure[0]) 742 | 743 | if (each_figure[1][0][0]+each_figure[1][0][2]) > info['right_bbox'][0]: 744 | each_figure[1][0][2] = info['right_bbox'][0]- each_figure[1][0][0] 745 | # for two column documents 746 | if each_figure[0][2] > 1.5 *info['row_width'] and each_figure[1][0][1] > each_figure[0][1] + each_figure[0][3] \ 747 | and each_figure[1][0][0] + each_figure[1][0][2]< each_figure[0][0]+ each_figure[0][2]/2 \ 748 | and each_figure[1][0][3] >3*info['row_height']: 749 | each_figure[1][0][2] = 2*each_figure[1][0][2] + 2*info['row_height'] 750 | 751 | if caption_overlap_ratio > 0.8: 752 | # spliting caption box and the figure box 753 | # top caption 754 | if each_figure[1][0][1]>=each_figure[0][1] and (each_figure[1][0][1]-each_figure[0][1])<2*info['row_height']\ 755 | and each_figure[1][0][0]each_figure[0][0]+each_figure[0][2]/2 \ 756 | and 
each_figure[0][1]+each_figure[0][3]-each_figure[1][0][1]-each_figure[1][0][3] > 5*info['row_height']: 757 | each_figure[0] = [each_figure[0][0], each_figure[1][0][1]+each_figure[1][0][3], 758 | each_figure[0][2], each_figure[0][1]+each_figure[0][3]-each_figure[1][0][1]-each_figure[1][0][3]] 759 | # down caption 760 | elif each_figure[0][1]+each_figure[0][3]>=each_figure[1][0][1]+each_figure[1][0][3] and (each_figure[0][1]+each_figure[0][3]-each_figure[1][0][1]-each_figure[1][0][3]) < 2 * info['row_height'] \ 761 | and each_figure[1][0][0] < each_figure[0][0] + each_figure[0][2] / 2 and \ 762 | each_figure[1][0][0] + each_figure[1][0][2] > each_figure[0][0] + each_figure[0][2] / 2 \ 763 | and each_figure[0][1] + each_figure[0][3] - each_figure[1][0][1] > 5 * info['row_height']: 764 | each_figure[0] = [each_figure[0][0], each_figure[0][1], 765 | each_figure[0][2], 766 | each_figure[0][1] + each_figure[0][3] - each_figure[1][0][1]] 767 | # right caption 768 | elif each_figure[1][0][0]+ each_figure[1][0][2]<= each_figure[0][0] + each_figure[0][2] and (each_figure[0][0] + each_figure[0][2] - each_figure[1][0][0]- each_figure[1][0][2]) < 2 * info['row_height'] \ 769 | and each_figure[1][0][0] > each_figure[0][0] + each_figure[0][2]/2 \ 770 | and each_figure[1][0][0] - each_figure[0][0] > 5 * info['row_height']: 771 | each_figure[0] = [each_figure[0][0], each_figure[0][1], 772 | each_figure[1][0][0] - each_figure[0][0], 773 | each_figure[0][3]] 774 | # left caption 775 | elif each_figure[1][0][0] >= each_figure[0][0] and (each_figure[1][0][0] - each_figure[0][0]) < 2 *info['row_height'] \ 776 | and each_figure[1][0][0] + each_figure[1][0][2]< each_figure[0][0] + each_figure[0][2] / 2\ 777 | and each_figure[0][0]+each_figure[0][2] - each_figure[1][0][0] > 5 * info['row_height']: 778 | each_figure[0] = [each_figure[1][0][0], each_figure[0][1], 779 | each_figure[0][0]+each_figure[0][2] - each_figure[1][0][0], 780 | each_figure[0][3]] 781 | 782 | return figures, captions 783 
def merge_boxes(figures, cap_regions, table_box, info):
    """Merge the figure boxes that belong to the same caption region.

    Every box is ``[x, y, width, height]``.  A figure is assigned to a
    caption region when more than half of the figure's own area lies inside
    that region (see ``overlap_ratio_based``).  If a figure qualifies for
    several regions, the region with the highest index wins.

    Args:
        figures: sequence of figure boxes.
        cap_regions: sequence of items whose element ``[1]`` is the region
            box used for the overlap test.
        table_box: accepted for interface compatibility; not used yet.
        info: accepted for interface compatibility; not used yet.

    Returns:
        A list with one bounding box per caption region that has at least
        one figure assigned to it, in caption-region order.  Figures that
        match no region are not included.
    """
    # NOTE(review): distance-based merging and table handling were planned
    # here but never completed; only the region-overlap grouping is done.

    # label[j] is the index of the caption region owning figure j, or -1.
    label = [-1] * len(figures)
    for j, figure in enumerate(figures):
        for i, cap_region in enumerate(cap_regions):
            if overlap_ratio_based(figure, cap_region[1]) > 0.5:
                label[j] = i

    fig_merged = []
    for i in range(len(cap_regions)):
        group = [figures[no] for no, assigned in enumerate(label)
                 if assigned == i]
        if not group:
            continue
        # Copy the first box so the result never aliases the input boxes.
        merged = list(group[0])
        for box in group[1:]:
            merged = merge_two_boxes(merged, box)
        fig_merged.append(merged)
    return fig_merged


def overlap_ratio_based(box1, box2):
    """Return the fraction of ``box1``'s area that is covered by ``box2``.

    Both boxes are ``[x, y, width, height]``.  The ratio is relative to
    ``box1`` only, so the function is not symmetric.  A ``box1`` with zero
    area yields ``0.0``.
    """
    box1_x0 = box1[0]
    box1_y0 = box1[1]
    box1_x1 = box1[0] + box1[2]
    box1_y1 = box1[1] + box1[3]

    box2_x0 = box2[0]
    box2_y0 = box2[1]
    box2_x1 = box2[0] + box2[2]
    box2_y1 = box2[1] + box2[3]

    # Intersection area; each factor is clamped to 0 for disjoint boxes.
    intersection = max(0, min(box1_x1, box2_x1) - max(box1_x0, box2_x0)) * \
        max(0, min(box1_y1, box2_y1) - max(box1_y0, box2_y0))
    box1_area = box1[2] * box1[3]
    if box1_area == 0:
        return 0.0
    return float(intersection) / box1_area


def bbox_distance(bbox1, bbox2):
    """Return the gap between two ``[x, y, width, height]`` boxes.

    When the boxes are separated along both axes, the result is the
    Manhattan distance between their two nearest corners.  When they are
    separated along one axis only, it is the gap along that axis.  Boxes
    that touch or intersect give ``0``.
    """
    x1 = bbox1[0]
    y1 = bbox1[1]
    x1b = bbox1[0] + bbox1[2]
    y1b = bbox1[1] + bbox1[3]
    x2 = bbox2[0]
    y2 = bbox2[1]
    x2b = bbox2[0] + bbox2[2]
    y2b = bbox2[1] + bbox2[3]

    # Position of bbox2 relative to bbox1, expressed in coordinate terms.
    left = x2b < x1      # bbox2 ends before bbox1 starts on x
    right = x1b < x2     # bbox2 starts after bbox1 ends on x
    bottom = y2b < y1    # bbox2 ends before bbox1 starts on y
    top = y1b < y2       # bbox2 starts after bbox1 ends on y

    if top and left:
        return manhattan_dist((x1, y1b), (x2b, y2))
    elif left and bottom:
        return manhattan_dist((x1, y1), (x2b, y2b))
    elif bottom and right:
        return manhattan_dist((x1b, y1), (x2, y2b))
    elif right and top:
        return manhattan_dist((x1b, y1b), (x2, y2))
    elif left:
        return x1 - x2b
    elif right:
        return x2 - x1b
    elif bottom:
        return y1 - y2b
    elif top:
        return y2 - y1b
    else:  # rectangles intersect or touch
        return 0


def manhattan_dist(a, b):
    """Return the sum of absolute coordinate differences between ``a`` and ``b``.

    The sequences are paired with ``zip``, so extra trailing elements of the
    longer one are ignored.
    """
    return sum(abs(p - q) for p, q in zip(a, b))


def merge_two_boxes(bbox1, bbox2):
    """Return the smallest ``[x, y, width, height]`` box enclosing both inputs."""
    x0 = min(bbox1[0], bbox2[0])
    y0 = min(bbox1[1], bbox2[1])
    x1 = max(bbox1[0] + bbox1[2], bbox2[0] + bbox2[2])
    y1 = max(bbox1[1] + bbox1[3], bbox2[1] + bbox2[3])
    return [x0, y0, x1 - x0, y1 - y0]