├── code
    ├── Readme
    ├── renderer.py
    ├── FigCap.py
    ├── pdf_info.py
    └── xpdf_process.py
├── data
    └── Readme
├── README.md
└── LICENSE


/code/Readme:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/data/Readme:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/code/renderer.py:
--------------------------------------------------------------------------------
 1 | import shutil
 2 | import os, sys, re
 3 | from PIL import Image
 4 | import numpy as np
 5 | import tempfile
 6 | 
 7 | #output_dpi = str(72)
 8 | 
 9 | 
def render_pdf(filename, customize_dpi=72):
    """
        Render every page of a PDF and return the pages as a list of images.

        The document is rasterised by an external program into a temporary
        directory: ImageMagick ``convert`` on Windows (``os.name == 'nt'``),
        Ghostscript ``gs`` everywhere else.  The PNG files that were produced
        are loaded in natural page order as RGB PIL images, and the temporary
        directory is removed afterwards, even if loading an image fails.

        filename      -- path of the PDF file to render.
        customize_dpi -- output resolution in DPI.  Defaults to 72 so that
                         callers passing only the filename keep working.

        Returns a list of PIL Image objects, one per rendered page.  The list
        is empty when the external renderer produced no PNG output.
    """
    # Local import: the module header does not import subprocess.
    import subprocess

    output_dpi = str(customize_dpi)
    outputDir = tempfile.mkdtemp()

    # The command is built as an argument list (no shell involved), so file
    # names containing spaces or shell metacharacters are passed verbatim.
    if os.name == 'nt':
        rasterScale = 3  # increase this if you want higher resolution images
        rasterDensity = str(rasterScale * 100)
        # Explicit path on Windows; on 'nix the tool is looked up on PATH.
        imagemagickPath = '/usr/pengyuan/others/ImageMagick-7.0.3-5-portable-Q16-x86/convert.exe'
        command = [imagemagickPath,
                   '-density', rasterDensity,
                   '-resample', output_dpi,
                   '-set', 'colorspace', 'RGB',
                   filename,
                   os.path.join(outputDir, 'image.png')]
    else:
        command = ['gs', '-q', '-sDEVICE=png16m',
                   '-o', os.path.join(outputDir, 'file-%02d.png'),
                   '-r' + output_dpi,
                   filename]

    images = []
    try:
        try:
            subprocess.call(command)
        except OSError as err:
            # Renderer executable missing or not runnable.  The previous
            # os.system call only yielded a non-zero status in that case, so
            # stay best-effort: report it and return no pages.
            sys.stderr.write('render_pdf: cannot run %s: %s\n' % (command[0], err))

        files = [f for f in os.listdir(outputDir)
                 if os.path.isfile(os.path.join(outputDir, f)) and not f.startswith('.')]
        for f in natural_sort(files):
            if f.endswith('.png'):
                pageIm = Image.open(os.path.join(outputDir, f)).convert('RGB')
                pageIm.load()  # load into memory (also closes the file associated)
                images.append(pageIm)
    finally:
        # Always remove the scratch directory, also when an image fails to load.
        shutil.rmtree(outputDir)
    return images
52 | 
53 | 
def natural_sort(l):  # approach taken from Stack Overflow
    """
        Return the strings in `l` as a new list in natural ascending order.

        Each string is split into digit runs and text runs; digit runs are
        compared as integers and text runs case-insensitively, so 'file-2'
        comes before 'file-10' instead of after it.
    """
    def _natural_key(name):
        # re.split with a capturing group keeps the digit runs in the result.
        parts = []
        for chunk in re.split('([0-9]+)', name):
            if chunk.isdigit():
                parts.append(int(chunk))
            else:
                parts.append(chunk.lower())
        return parts

    ordered = list(l)
    ordered.sort(key=_natural_key)
    return ordered
62 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # PDFigCapX
 2 | Website:  
 3 | https://www.eecis.udel.edu/~compbio/PDFigCapX  
 4 | Our paper at:  
 5 | https://doi.org/10.1093/bioinformatics/btz228  
 6 | 
 7 | ## Codes 
 8 | **Main function:**  
 9 | Command: python /code/FigCap.py  
10 | Inputs:  
11 | *input_path*: The folder containing the PDF files that need to be parsed.  
12 | *output_path*: The folder where parsing results will be saved.  
13 | Outputs:  
14 | For each document in the *input_path*, the main function will generate a corresponding folder with the same name as the original document in the *output_path*. All extracted figures (in jpg format), captions (in text format) and their coordinate information (in json format) will be saved in the corresponding folder.  
15 | 
16 | **Explanations of submodules called by FigCap.py**  
17 | **Submodule 1:** PDF parsing using xpdf (Lines 51-57 in FigCap.py)  
18 | Inputs:  
19 | *Xpdf_path*: The path to the xpdf code. In our implementation, the path is set to /usa/pengyuan/Documents/RESEARCH/xpdf-tools-linux-4.00/bin/pdftohtml. This code is also available online at: https://www.xpdfreader.com/pdftohtml-man.html.  
20 | *pdf_path*: The document that needs to be parsed.  
21 | *html_file_path*: The folder that will contain all parsing results obtained by using xpdf.  
22 | Outputs:  
23 | A folder with the same name as the PDF document will be created in the *html_file_path*. For each page in the document, an HTML file containing the textual information and a text-stripped image will be saved in the newly created folder.  
24 | 
25 | **Submodule 2:** Obtain layout information (/code/pdf_info.py)  
26 | Inputs:  
27 | *pdf_path*: The path to the PDF file.  
28 | *html_file_path*: The path to the folder generated by xpdf in Submodule 1.  
29 | Outputs:  
30 | A json file containing all layout information and textual information is saved in *html_file_path* and this information will be passed to Submodule 3.  
31 | 
32 | **Submodule 3:** Disambiguation of figures and captions (/code/xpdf_process.py)  
33 | Inputs:  
34 | *pdf_path*: The path to the PDF file.  
35 | *html_file_path*: The path to the folder generated by xpdf in Submodule 1.  
36 | Outputs:  
37 | A python dictionary that contains the coordinate information of all identified figures and captions.  
38 | 
39 | **Submodule 4:** Extract figures and captions as files (Lines 78-120 in FigCap.py)  
40 | Inputs:  
41 | *data[pdf]*: The python dictionary that contains the coordinates of all identified figures and captions. The variable *pdf* is the ID of the PDF file.  
42 | *Output_path_file*: The path where figures and captions will be saved.  
43 | Outputs:  
44 | Figures (in jpg format), captions (in text format) and their coordinate information (in json format) will be saved in the *Output_path_file* folder.  
45 | 
46 | ## Datasets
47 | ### GXD-200 dataset:  
48 | Dataset path: /datasets/GXD200  
49 | Description: The GXD-200 dataset contains 200 documents selected from a collection curated by Jackson Lab's Gene Expression Database. There are 1335 figures, 1298 figure-associated captions, and 1298 figure and caption pairs in this dataset.  
50 | Ground-truth annotations: /datasets/GXD200/GT_GXD200.json  
51 | 
52 | ### PMC-200 dataset:  
53 | Dataset path: /datasets/PMC200  
54 | Description: The PMC-200 dataset contains 200 biomedical documents selected from the PubMed Central (PMC) Open Access Subset (2018). There are 1042 figures, 1032 captions, and 1032 figure and caption pairs in this dataset.  
55 | Ground-truth annotations: /datasets/PMC200/GT_PMC200.json  
56 | 


--------------------------------------------------------------------------------
/code/FigCap.py:
--------------------------------------------------------------------------------
  1 | """
  2 | main code page
  3 | structure (xpdf_process):
  4 | 1. Read pdfs from input folder
  5 | 2. Figure and caption pair detection
  6 |     2.1. graphical content detection
  7 |     2.2 page segmentation
  8 |     2.3 figure detection
  9 |     2.4 caption association
 10 | 
 11 | 3. Mess up pdf processing
 12 | 
 13 | 
 14 | Written by Pengyuan Li
 15 | 
 16 | Start from 19/10/2017
 17 | 1.0 version 28/02/2018
 18 | 
 19 | """
 20 | 
 21 | import os
 22 | import json
 23 | from pprint import pprint
 24 | import renderer
 25 | import matplotlib.pyplot as plt
 26 | import matplotlib.patches as patches
 27 | from xpdf_process import figures_captions_list
 28 | import subprocess
 29 | import os
 30 | import time
 31 | 
 32 | if __name__ == "__main__":
 33 | 
 34 |     input_path = '/eecis/shatkay/homes/pengyuan/Documents/RESEARCH/PDFigCapX/code/sample_data_for_Juan'
 35 |     output_path = '/eecis/shatkay/homes/pengyuan/Documents/RESEARCH/PDFigCapX/code/sample_data_for_Juan'
 36 |     xpdf_path = output_path +'/xpdf/'  
 37 |     log_file = output_path + '/log.text'
 38 |     f_log = open(log_file, 'w') 
 39 |     if not os.path.isdir(xpdf_path):
 40 |         os.mkdir(xpdf_path)
 41 | # Read each files in the input path
 42 |     for pdf in os.listdir(input_path):
 43 |         if pdf.endswith('.pdf') and (not pdf.startswith('._')):
 44 |             data = {}
 45 |             print input_path+pdf
 46 |             images = renderer.render_pdf(input_path + '/' + pdf)
 47 |             data[pdf] = {}
 48 |             data[pdf]['figures'] = []
 49 |             data[pdf]['pages_annotated'] = []
 50 |             pdf_flag = 0
 51 |             try:
 52 |                 if not os.path.isdir(xpdf_path+pdf[:-4]):
 53 |                     std_out = subprocess.check_output(["/usa/pengyuan/Documents/RESEARCH/PDFigCapX/xpdf-tools-linux-4.00/bin64/pdftohtml", input_path+'/'+pdf, xpdf_path+pdf[:-4]+'/'])
 54 |             except:
 55 |                 print "\nWrong "+pdf+"\n"
 56 |                 f_log.write(pdf+'\n')
 57 |                 pdf_flag = 1
 58 | 
 59 |             if pdf_flag == 0:
 60 |                 flag = 0
 61 |                 wrong_count = 0
 62 |                 while flag==0 and wrong_count<5:
 63 |                     try:
 64 |                         figures, info = figures_captions_list(input_path, pdf, xpdf_path)
 65 |                         flag = 1
 66 | 
 67 |                     except:
 68 |                         wrong_count = wrong_count +1
 69 |                         time.sleep(5)
 70 |                         print pdf
 71 |                         info['fig_no_est']=0
 72 |                         figures = []
 73 |                         print "------\nChrome Error\n----------\n"
 74 |         
 75 |                 data[pdf]['fig_no'] = info['fig_no_est']
 76 | 
 77 |                 output_file_path = output_path +'/' + pdf[:-4]
 78 |                 if not os.path.isdir(output_file_path):
 79 |                     os.mkdir(output_file_path)      
 80 | 
 81 |                 for figure in figures:
 82 |                     page_no = int(figure[:-4][4:])
 83 |                     page_fig= images[page_no -1]
 84 |                     rendered_size = page_fig.size
 85 | 
 86 |                     bboxes = figures[figure]
 87 |                     order_no = 0
 88 |                     for bbox in bboxes:
 89 |                         order_no = order_no + 1
 90 |                         png_ratio = float(rendered_size[1])/info['page_height']
 91 |                         print(png_ratio)
 92 | 
 93 |                         if len(bbox[1])>0:
 94 |                             data[pdf]['figures'].append({'page': page_no,
 95 |                                           'region_bb': bbox[0],
 96 |                                          'figure_type': 'Figure',
 97 |                                         'page_width': info['page_width'],
 98 |                                         'page_height': info['page_height'],
 99 |                                         'caption_bb': bbox[1][0],
100 |                                         'caption_text': bbox[1][1]
101 |                                          })
102 |                             with open(output_file_path+'/'+str(page_no)+'_'+str(order_no)+'.txt', 'w') as capoutput:
103 |                                 capoutput.write(str(bbox[1][1]))
104 |                                 capoutput.close
105 |                         else:
106 |                             data[pdf]['figures'].append({'page': page_no,
107 |                                                      'region_bb': bbox[0],
108 |                                                      'figure_type': 'Figure',
109 |                                                      'page_width': info['page_width'],
110 |                                                      'page_height': info['page_height'],
111 |                                                      'caption_bb': [],
112 |                                                      'caption_text': []
113 |                                                      })
114 |                         fig_extracted = page_fig.crop([int(bbox[0][0]*png_ratio), int(bbox[0][1]*png_ratio), 
115 |                                         int((bbox[0][0]+bbox[0][2])*png_ratio), int((bbox[0][1]+bbox[0][3])*png_ratio)])
116 |                         fig_extracted.save(output_file_path+'/'+str(page_no)+'_'+str(order_no)+'.jpg')
117 | 
118 |                 pprint(data)
119 |                 json_file = output_file_path+'/'+ pdf[:-4]+'.json'
120 |                 with open(json_file, 'w') as outfile:
121 |                     json.dump(data, outfile)
122 | 
123 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/code/pdf_info.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | pdf_info is to get the basic information from pdfs
  3 | info={
  4 | filename, height, width, page_no, figure_est_no, layout_bbox, text_mask
  5 | }
  6 | '''
  7 | from selenium import webdriver
  8 | from multiprocessing import Pool, TimeoutError
  9 | import time
 10 | import os
 11 | import json
 12 | import numpy as np
 13 | import matplotlib.pyplot as plt
 14 | import sys
 15 | import cv2
 16 | 
 17 |     # Column width, middle gap, Maximum Figure number will be helpful
 18 | def pdf_info(html_file_path, pdf):
 19 |     # Get the pdf info by parsing html
 20 |     
 21 |     info = {}
 22 | # obtain file name
 23 |     info['filename'] = pdf
 24 | # obtain page no
 25 |     for_counting = []
 26 |     for page in os.listdir(html_file_path):
 27 |         if page.endswith('.png') & page.startswith('page'):
 28 |             for_counting.append(page)
 29 |     page_no = len(for_counting)
 30 |     for_counting = sorted(for_counting)
 31 |     info['page_no'] = page_no
 32 | # Obtain all html information
 33 |     list_of_htmls = []
 34 |     html_info = []
 35 |     html_info_json = html_file_path+'/' + pdf[:-4] + '.json'
 36 |     if os.path.isfile(html_info_json):
 37 |         with open(html_info_json) as json_data:
 38 |             html_info = json.load(json_data)
 39 |     else:
 40 |         browser = webdriver.Chrome('/usa/pengyuan/Documents/RESEARCH/PDFigCapX/chromedriver/chromedriver')
 41 |         for page_id in range(page_no):
 42 |             page = for_counting[page_id]
 43 |             html_file = 'file://' + html_file_path + '/' + page[:-4] + '.html'
 44 |             list_of_htmls.append(html_file)
 45 |             browser.get(html_file)
 46 |             page_layout = browser.find_element_by_xpath("/html/body/img")
 47 |             img_size = (page_layout.size['height'], page_layout.size['width'])
 48 |             text_elements = browser.find_elements_by_xpath("/html/body/div")
 49 |             text_boxes = []
 50 |             for element in text_elements:
 51 |                 text = element.text
 52 |                 if len(text) > 0:
 53 |                     text_boxes.append([[element.location['x'], element.location['y'], element.size['width'], element.size['height']], text])
 54 |             html_info.append([int(os.path.basename(html_file)[4:-5]), text_boxes, img_size])
 55 |         browser.quit()
 56 |             #html_info.append(read_each_html(html_file))
 57 |         with open(html_file_path+'/' + pdf[:-4] + '.json', 'w') as outfile:
 58 |             json.dump(html_info, outfile)
 59 |     #multithread = Pool(4)
 60 |     #html_info = multithread.map(read_each_html, list_of_htmls)
 61 |     #multithread.close()
 62 |     #multithread.join()
 63 | # obtain text layout
 64 |     row_width = []
 65 |     row_height = []
 66 |     column_no = 1
 67 |     columns = [0]
 68 |     left_point = []
 69 |     top_point = []
 70 |     right_point = []
 71 | 
 72 |     if page_no > 3:
 73 |         list_to_check = range(2, page_no)
 74 |     else:
 75 |         list_to_check = range(1, page_no+1)
 76 |     for each_page_html in html_info:
 77 |         if each_page_html[0] in list_to_check:
 78 |             #print each_page_html[0]
 79 | # Obtain page convas region
 80 |             info['page_height'] = each_page_html[2][0]
 81 |             info['page_width'] = each_page_html[2][1]
 82 |             for element in each_page_html[1]:
 83 |                 if len(element[1]) > 30:
 84 |                     row_width.append(element[0][2])
 85 |                     row_height.append(element[0][3])
 86 |                     left_point.append(element[0][0])
 87 |                     right_point.append(element[0][0]+element[0][2])
 88 |                     top_point.append(element[0][1])
 89 |     point_left = sorted([(i, left_point.count(i)) for i in set(
 90 |         left_point)], key=lambda x: x[1], reverse=True)
 91 |     width_row = sorted([(i, row_width.count(i)) for i in set(
 92 |         row_width)], key=lambda x: x[1], reverse=True)
 93 |     height_row = sorted([(i, row_height.count(i)) for i in set(
 94 |         row_height)], key=lambda x: x[1], reverse=True)
 95 |     info['row_height'] = height_row[0][0]
 96 |     info['row_width'] = width_row[0][0]
 97 |     info['text_layout'] = (max(0, min(top_point)),
 98 |                            min(info['page_height'],max(top_point)))
 99 | 
100 |     # Compute column no and position for each column
101 |     i = 0
102 |     while i < len(point_left):
103 |         j = i + 1
104 |         while j < len(point_left):
105 |             if abs(point_left[i][0] - point_left[j][0]) <= 10:
106 |                 point_left[i] = (point_left[i][0], point_left[i][1] +
107 |                                  point_left[j][1])
108 |                 del point_left[j]
109 |             else:
110 |                 j = j + 1
111 |         i = i + 1
112 |     point_left = sorted(point_left, key=lambda x: x[1], reverse=True)
113 | 
114 |     if float(point_left[0][1]) / len(left_point) > 0.75 \
115 |             or float(info['row_width']) / info['page_width'] > 0.5:
116 |         column_no = 1
117 |         columns = [point_left[0][0]]
118 |     else:  # float(point_left[1][1]) / len(left_point) > 0.2:  # Need to
119 |         # correct, it may cause numbe below 0
120 |         column_no = 2  # int(float((info['page_width'] - 2*point_left[0][0]))/info['row_width'])
121 | 
122 |         for i in range(1, len(point_left)):
123 |             if abs(point_left[i][0] - point_left[0][0]) > info['row_width']:
124 |                 columns = [min(point_left[i][0], point_left[0][0]),
125 |                            max(point_left[i][0], point_left[0][0])]
126 |                 break
127 | 
128 |     info['column_no'] = column_no
129 |     info['columns'] = columns
130 | 
131 |     left_bar = min(left_point)
132 |     right_bar = max(right_point)
133 |     # pdf layout
134 |     if left_bar > 0 and left_bar < 20 * info['row_height']:
135 |         info['left_bbox'] = [0, 0, left_bar, info['page_height']]
136 |         info['right_bbox'] = [min(info['page_width'] - 2 * info['row_height'],right_bar),
137 |                               0, info['page_width'] - min(info['page_width'] - 2 * info['row_height'],right_bar), info['page_height']]
138 |         if info['text_layout'][0] < 15 * info['row_height'] and info['text_layout'][1] > 15 * info['row_height']:
139 |             info['top_bbox'] = [0, 0, info['page_width'], info['text_layout'][0]]
140 |             info['down_bbox'] = [0, info['text_layout'][1], info['page_width'],
141 |                                  info['page_height'] - info['text_layout'][1]]
142 |         else:
143 |             info['top_bbox'] = [0, 0, info['page_width'], info['row_height']]
144 |             info['down_bbox'] = [0, info['page_height'] - info['row_height'], info['page_width'],
145 |                                  info['row_height']]
146 |     else:
147 |         info['left_bbox'] = [0, 0, info['row_height'], info['page_height']]
148 |         info['right_bbox'] = [info['page_width'] - info['row_height'], 0, info['row_height'], info['page_height']]
149 |         info['top_bbox'] = [0, 0, info['page_width'], info['row_height']]
150 |         info['down_bbox'] = [0, info['page_height'] - info['row_height'], info['page_width'], info['row_height']]
151 | 
152 |     #print info['left_bbox']
153 |     #print info['right_bbox']
154 |     #print info['top_bbox']
155 |     #print info['down_bbox']
156 |     info['mess_up'] = False
157 |     info['graph_layout'] = info['text_layout']
158 | 
159 |     return info, html_info
160 | '''
161 |            
162 | 
163 |     
164 |         page_layout = browser.find_element_by_xpath("/html/body/img")
165 |         info['page_height'] = page_layout.size['height']
166 |         info['page_width'] = page_layout.size['width']
167 | 
168 |         text_elements = browser.find_elements_by_xpath("/html/body/div")
169 |         for element in text_elements:
170 |             if len(element.text) > 30:
171 |                 row_width.append(element.size['width'])
172 |                 row_height.append(element.size['height'])
173 |                 left_point.append(element.location['x'])
174 |                 top_point.append(element.location['y'])
175 | 
176 |     point_left = sorted([(i, left_point.count(i)) for i in set(
177 |             left_point)], key=lambda x: x[1], reverse=True)
178 |     width_row = sorted([(i, row_width.count(i)) for i in set(
179 |             row_width)], key=lambda x: x[1], reverse=True)
180 |     height_row = sorted([(i, row_height.count(i)) for i in set(
181 |             row_height)], key=lambda x: x[1], reverse=True)
182 |     info['row_height'] = height_row[0][0]
183 |     info['row_width'] = width_row[0][0]
184 |     info['text_layout'] = (max(0, min(top_point)),
185 |                                min(info['page_height'],
186 |                                    max(top_point)))
187 |     # Compute column no and position for each column
188 |     i = 0
189 |     while i < len(point_left):
190 |         j = i + 1
191 |         while j < len(point_left):
192 |             if abs(point_left[i][0] - point_left[j][0]) <= 10:
193 |                 point_left[i] = (point_left[i][0], point_left[i][1] +
194 |                                      point_left[j][1])
195 |                 del point_left[j]
196 |             else:
197 |                 j = j + 1
198 |         i = i + 1
199 |     point_left = sorted(point_left, key=lambda x: x[1], reverse=True)
200 | 
201 |     if float(point_left[0][1]) / len(left_point) > 0.75\
202 |                 or float(info['row_width'])/info['page_width'] > 0.5:
203 |         column_no = 1
204 |         columns = [point_left[0][0]]
205 |     else:  # float(point_left[1][1]) / len(left_point) > 0.2:  # Need to
206 |             # correct, it may cause numbe below 0
207 |         column_no = 2 #int(float((info['page_width'] - 2*point_left[0][0]))/info['row_width'])
208 | 
209 |         for i in range(1, len(point_left)):
210 |             if abs(point_left[i][0] - point_left[0][0]) > info['row_width']:
211 |                 columns = [min(point_left[i][0], point_left[0][0]),
212 |                        max(point_left[i][0], point_left[0][0])]
213 |                 break
214 | 
215 |     info['column_no'] = column_no
216 |     info['columns'] = columns
217 | 
218 |     left_bar = min(left_point)
219 |     # pdf layout
220 |     if left_bar >0 and left_bar < 20*info['row_height']:
221 |         info['left_bbox'] = [0, 0, left_bar, info['page_height']]
222 |         info['right_bbox'] = [info['page_width']-2*info['row_height'],
223 |                               0, 2*info['row_height'], info['page_height']]
224 |         if info['text_layout'][0] < 15*info['row_height'] and info['text_layout'][1] > 15*info['row_height']:
225 |             info['top_bbox'] = [0, 0, info['page_width'], info['text_layout'][0]]
226 |             info['down_bbox'] = [0, info['text_layout'][1], info['page_width'], info['page_height']-info['text_layout'][1]]
227 |         else:
228 |             info['top_bbox'] = [0, 0, info['page_width'], info['row_height']]
229 |             info['down_bbox'] = [0, info['page_height']-info['row_height'], info['page_width'],
230 |                                  info['row_height']]
231 |     else:
232 |         info['left_bbox'] = [0, 0, info['row_height'], info['page_height']]
233 |         info['right_bbox'] = [0, info['page_width'] - info['row_height'], info['row_height'], info['page_height']]
234 |         info['top_bbox'] = [0, 0, info['page_width'], info['row_height']]
235 |         info['down_bbox'] = [0, info['page_height']-info['row_height'], info['page_width'], info['row_height']]
236 | 
237 |     print info['left_bbox']
238 |     print info['right_bbox']
239 |     print info['top_bbox']
240 |     print info['down_bbox']
241 |     # graph layout
242 |     #
243 |     # if page_no >1:
244 |     #     previous_page = for_counting[list_to_check[0]]
245 |     #     previous_img = cv2.imread(html_file_path + '/' + previous_page)
246 |     #     previous_img = previous_img <240
247 |     #
248 |     #     for page_id in list_to_check[1:]:
249 |     #         page = for_counting[page_id]
250 |     #         img = cv2.imread(html_file_path + '/' + page)
251 |     #         img = img <240
252 |     #         result = img & previous_img
253 |     #         temp_result = result[:, :, 0]
254 |     #         previous_img = result
255 |     #         # xor pages to find the top/bottom line
256 |     #     sum_result = [ sum(each_row) for each_row in temp_result]
257 |     #     sum_result = [i for i in range(len(sum_result)) if sum_result[i] > 0]
258 |     #     top_point = min(sum_result)
259 |     #     bottom_point = max(sum_result)
260 |     #
261 |     #     info['graph_layout'] = info['text_layout']
262 |     # else:ue
263 |     #     info['graph_layout'] = info['text_layout']#(top, down)
264 |     info['mess_up'] = False
265 |     info['graph_layout'] = info['text_layout']
266 | '''
267 | 
268 | 
269 | 
def read_each_html(x):
    """Load one rendered page in the browser and collect its text boxes.

    NOTE(review): ``browser`` is not created in this function (the
    constructor calls that used to be here are commented out in the original),
    so it is presumably a module-level selenium WebDriver -- confirm.  The
    browser is quit before returning, even when a lookup fails.

    Args:
        x: Location handed to ``browser.get``.  Its base name must look like
            ``page<N>.html``: the text between the first four and the last
            five characters is parsed as the page number.

    Returns:
        A tuple ``(page_no, text_boxes, img_size)`` where ``text_boxes`` is a
        list of ``[[x, y, width, height], text]`` for every ``/html/body/div``
        element with non-empty text, and ``img_size`` is ``(height, width)``
        of the ``/html/body/img`` element.
    """
    try:
        browser.get(x)
        page_layout = browser.find_element_by_xpath("/html/body/img")
        img_size = (page_layout.size['height'], page_layout.size['width'])

        text_boxes = []
        for element in browser.find_elements_by_xpath("/html/body/div"):
            text = element.text
            if len(text) > 0:
                # Read each property once instead of once per coordinate.
                location = element.location
                size = element.size
                text_boxes.append([[location['x'], location['y'],
                                    size['width'], size['height']], text])
    finally:
        # Always release the browser; previously an exception above leaked it.
        browser.quit()

    return int(os.path.basename(x)[4:-5]), text_boxes, img_size
286 | 


--------------------------------------------------------------------------------
/code/xpdf_process.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | The main code for figure and caption extraction (figures_captions_list)
  3 | 1. Read pdfs from input folder  (pdf_info)
  4 | 2. Figure and caption pair detection
  5 | 
  6 |     2.1. graphical content detection
  7 |     2.2 page segmentation
  8 |     2.3 figure detection
  9 |     2.4 caption association
 10 | 
 11 | 3. Mess up pdf processing
 12 | 
 13 | '''
 14 | 
 15 | import subprocess
 16 | import os
 17 | import numpy as np
 18 | import matplotlib.pyplot as plt
 19 | import sys
 20 | import cv2
 21 | import codecs
 22 | import matplotlib.patches as patches
 23 | import scipy.misc
 24 | import re
 25 | from lxml import etree
 26 | from selenium import webdriver
 27 | from pdf_info import pdf_info
 28 | 
 29 | 
 30 | 
 31 | def figures_captions_list(input_path, pdf, output_path):
 32 | # input: single pdf file
 33 | # output: bounding box list of figures and captions
 34 |     pdf_filename = input_path + pdf
 35 |     html_file_path = output_path + pdf[:-4]
 36 | # 1. Read pdfs from input folder  (pdf_info)
 37 |     info, html_boxes = pdf_info(html_file_path, pdf)
 38 | #  2.1. graphical content detection
 39 |     cap_box, fig_box, info, table_box, text_box = box_detection(html_file_path, info, html_boxes)
 40 |     pre_figures, cap_regions = fig_cap_matching(cap_box, fig_box, info, table_box, text_box)
 41 |     figures, captions = evaluation(pre_figures, cap_regions, html_file_path, info, html_boxes) # Remove figure_table and figure caption in one box
 42 |     figures, captions = check_region(info, figures, captions)
 43 |     no_of_figures = sum([len(figures[x]) for x in figures])
 44 |     no_of_caps = sum([len(cap_box[x]) for x in cap_box])
 45 |     no_of_figs = sum([len(fig_box[x]) for x in fig_box])
 46 |     #print info['filename']
 47 |     #print info['mess_up']
 48 |     #print info['fig_no_est']
 49 | 
 50 |     #
 51 |     # print no_of_figures
 52 |     # if no_of_figures == no_of_caps:
 53 |     #     figures, cap_regions = same_no_caps_est(cap_box, fig_box, info, table_box, text_box)
 54 |     #
 55 |     r = info['png_ratio']
 56 |     # plt.close("all")
 57 |     # for i in range(info['page_no']):
 58 |     #     page = 'page' + str(i + 1) + '.png'
 59 |     #     img = cv2.imread(html_file_path + '/' + page)
 60 |     #     fig, ax = plt.subplots(1)
 61 |     #     ax.imshow(img)
 62 |     #     for each_caption in cap_box[page]:
 63 |     #         rect = patches.Rectangle((each_caption[0]*r, each_caption[1]*r), each_caption[2]*r, each_caption[3]*r,
 64 |     #                                  linewidth=1, edgecolor='g',
 65 |     #                                  facecolor='none')
 66 |     #         ax.add_patch(rect)
 67 |     #
 68 |     #     for each_fig in fig_box[page]:
 69 |     #         #each_fig = each_fig[0]
 70 |     #         rect = patches.Rectangle((each_fig[0]*r, each_fig[1]*r), each_fig[2]*r, each_fig[3]*r,
 71 |     #                                  linewidth=2, edgecolor='b',
 72 |     #                                  facecolor='none')
 73 |     #         ax.add_patch(rect)
 74 |     #     for each_cap_region in cap_regions[page]:
 75 |     #         rect = patches.Rectangle((each_cap_region[1][0]*r, each_cap_region[1][1]*r), each_cap_region[1][2]*r, each_cap_region[1][3]*r,
 76 |     #                                  linewidth=1, edgecolor='y',
 77 |     #                                  facecolor='none')
 78 |     #         ax.add_patch(rect)
 79 |     #     for each_result in figures[page]:
 80 |     #         each_result = each_result[0]
 81 |     #         rect = patches.Rectangle((each_result[0]*r, each_result[1]*r), each_result[2]*r, each_result[3]*r,
 82 |     #                                  linewidth=1, edgecolor='r',
 83 |     #                                  facecolor='none')
 84 |     #         ax.add_patch(rect)
 85 |     #     plt.show()
 86 |     return figures, info
 87 | 
 88 | 
 89 | 
 90 | def box_detection(html_file_path, info, html_boxes):
 91 |     fig_box = {}
 92 |     cap_box = {}
 93 |     word_box = {}
 94 |     cap_no_clue = []
 95 |     table_box={}
 96 |     #browser = webdriver.Chrome('/home/pengyuan/Documents/FC_extraction/chromedriver')
 97 | 
 98 |     for page in sorted(os.listdir(html_file_path)):
 99 |         if page.endswith('.png') and page.startswith('page'):
100 |         
101 |             page_no = int(page[4:-4])
102 |             img = cv2.imread(html_file_path + '/' + page)
103 |             # plt.imshow(img)
104 |             png_size = img.shape
105 |             if png_size[0] > png_size[1]:
106 |                 png_ratio = float(png_size[0]) / info['page_height']
107 |             else:
108 |                 png_ratio = float(png_size[0]) / info['page_width']
109 | 
110 |             # Read each page html find "Fig"
111 |             # f = codecs.open(html_file_path + '/' + page[:-4] + '.html', 'r')
112 |             # text = f.readline()
113 |             # html_file = 'file://' + html_file_path + '/' + page[:-4] + '.html'
114 |             # browser.get(html_file)
115 | 
116 |             text = ''
117 |             text_box = []
118 |             page_word_box = []
119 |             table_cap_box = []
120 |             div_no = 1
121 |             for page_html in html_boxes:
122 |                 if page_html[0] == page_no:
123 |                     text_elements = page_html[1]
124 | 
125 |             for e in text_elements:
126 |                 text = e[1]
127 |                 #if e.size['width'] > info['row_width']-100:
128 |                 page_word_box.append([max(e[0][0]-info['row_height'], 0)
129 |                                         , e[0][1]
130 |                                         , e[0][2]+2*info['row_height'],
131 |                                      e[0][3]])
132 |                 if text.startswith('Table') or text.startswith('table') or text.startswith('Box'):
133 |                     table_cap_box.append([e[0][0], e[0][1], e[0][2], e[0][3]])
134 |                 if text.startswith('Fig') or text.startswith('fig') or text.startswith('FIG'):
135 |                     #print text
136 |                     text_box.append([e[0][0], e[0][1], e[0][2], e[0][3]])
137 |                     cap_no_clue.append(text)
138 |                 elif 'Fig' not in text and len(text) > 6:
139 |                     text = text[:6]
140 |                     idx1 = text.find('F')
141 |                     idx2 = text.find('i')
142 |                     idx3 = text.find('g')
143 |                     if idx1>= 0 and idx2>=0 and idx3>= 0 and idx2>idx1 and idx3>idx2:
144 |                         #print text
145 |                         text_box.append([e[0][0], e[0][1], e[0][2], e[0][3]])
146 |                     # rect = patches.Rectangle((e.location['x'] * png_ratio, e.location['y'] * png_ratio),
147 |                     #                          e.size['width'] * png_ratio,
148 |                     #                          e.size['height'] * png_ratio,
149 |                     #                          linewidth=1, edgecolor='b',
150 |                     #                          facecolor='none')
151 |                     # ax.add_patch(rect)
152 | 
153 | 
154 |             cap_box[page] = text_box
155 |             table_box[page] = table_cap_box
156 |             word_box[page] = page_word_box
157 |         
158 |             imgray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
159 |             ret, thresh = cv2.threshold(imgray, 240, 255, cv2.THRESH_BINARY_INV)
160 |             kernel = np.ones((5, 5), np.uint8)
161 |             dilation = cv2.dilate(thresh, kernel, iterations=1)
162 |             contours, hierarchy = cv2.findContours(dilation,cv2.RETR_TREE,cv2.CHAIN_APPROX_SIMPLE)
163 |             new_thresh = np.zeros(thresh.shape, dtype=np.uint8)
164 |       
165 |             for cnt in contours:
166 |                 bbox = cv2.boundingRect(cnt)
167 |                 p_bbox = [int(float(x) / png_ratio) for x in bbox]
168 |                 box_image = 0
169 |                 for caption_box in text_box:
170 |                     box_image = box_image + overlap_ratio_based(caption_box, p_bbox)
171 |                 if box_image < 0.5:
172 |                     cv2.drawContours(new_thresh, [cnt], 0, 255, -1)
173 | 
174 |             contours, hierarchy = cv2.findContours(new_thresh,
175 |                                                           cv2.RETR_EXTERNAL,
176 |                                                           cv2.CHAIN_APPROX_SIMPLE)
177 | 
178 |             # scipy.misc.imsave('thresh.jpg', thresh)
179 |             potential_bbox = []
180 |             # fig, ax = plt.subplots(1)
181 |             # ax.imshow(img)
182 |             for cnt in contours:
183 |                 bbox = cv2.boundingRect(cnt)
184 |                 thresh_for_figure = info['row_height'] * png_ratio*1.5#/ 2  modified on 0318
185 |                 if bbox[3] > thresh_for_figure and bbox[2] > thresh_for_figure:  # Important to set, FIg threshold
186 | 
187 |                     p_bbox = [int(float(x) / png_ratio) for x in bbox]
188 |                     # Format checking, to filter box that at top, down, left or right
189 |                     ol_left = overlap_ratio_based(p_bbox, info['left_bbox'])
190 |                     ol_right = overlap_ratio_based(p_bbox, info['right_bbox'])
191 |                     # Add filter for first page top sign 0110
192 |                     if page == 'page1.png':
193 |                         ol_top = overlap_ratio_based(p_bbox, [0, 0, info['page_width'],
194 |                                                               info['page_height'] / 4])  # First page box
195 |                     else:
196 |                         ol_top = overlap_ratio_based(p_bbox, info['top_bbox'])
197 | 
198 |                     ol_down = overlap_ratio_based(p_bbox, info['down_bbox'])
199 |                     ol_sum = 0
200 |                     ol_sum = ol_down + ol_left + ol_right + ol_top
201 |                     if ol_sum < 0.1:
202 |                         potential_bbox.append(p_bbox)
203 |                         # rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2],bbox[3],
204 |                         #                         linewidth=1, edgecolor='r',
205 |                         #                        facecolor='none')
206 |                         # ax.add_patch(rect)
207 | 
208 |             fig_box[page] = potential_bbox
209 | 
210 |             # To check if the pdf is mess up
211 |             if len(potential_bbox) > 1:
212 |                 obj_heights = np.array(potential_bbox)[:, 3]
213 |                 no_of_all = len(obj_heights)
214 |                 no_of_small = len([1 for obj_height in obj_heights if obj_height < 13 and obj_height > 4])
215 |                 small_percent = float(no_of_small) / no_of_all
216 |                 if no_of_all > 300 and small_percent > 0.8:
217 |                     info['mess_up'] = True
218 | 
219 |             count = 0
220 | 
221 |             if info['mess_up'] == False:# Need to set carefully
222 |                 while count < len(potential_bbox):# ###Need to think about it.....
223 |                     flag = 0
224 |                     for each_text_box in page_word_box:# Remove fig box that cross the text box
225 |                         overlap = overlap_ratio_based(potential_bbox[count], each_text_box)
226 |                         if overlap > 0.3:
227 |                             flag = 1
228 |                             del potential_bbox[count]
229 |                             break
230 |                     if flag == 0:
231 |                         count = count +1
232 |             else:
233 |                 while count < len(potential_bbox):# ###Need to think about it.....
234 |                     flag = 0
235 |                     if potential_bbox[count][3]>12:
236 |                         for each_text_box in page_word_box:# Remove fig box that cross the text box
237 |                             overlap = overlap_ratio_based(potential_bbox[count], each_text_box)
238 |                             if overlap > 0.1:
239 |                                 flag = 1
240 |                                 del potential_bbox[count]
241 |                                 break
242 |                         if flag == 0:
243 |                             count = count +1
244 |                     else:
245 |                         del potential_bbox[count]
246 | 
247 | 
248 |             fig_box[page] = potential_bbox
249 | 
250 |     info['fig_no_est'] = fig_no_estimation(cap_no_clue)
251 |     info['png_ratio'] = png_ratio
252 |     return cap_box, fig_box, info, table_box, page_word_box
253 | 
def fig_no_estimation(fig_info):
    """Estimate the number of figures from the caption texts.

    The first run of digits in each caption text is read as a figure number;
    the largest such number is returned, or 0 when no text has digits.
    """
    first_number = re.compile(r'\d+')
    largest = 0
    for clue in fig_info:
        hit = first_number.search(clue)
        if hit is None:
            continue
        number = int(hit.group())
        if number > largest:
            largest = number
    return largest
264 | 
def fig_cap_matching(cap_box, fig_box, info, table_box, text_box):
    """Associate the candidate figure boxes of every page with captions.

    Pages are visited as ``page1.png`` .. ``page<info['page_no']>.png``.
    Figure candidates that overlap the band of four row heights directly
    below or above a table caption are removed first (in place, so the lists
    inside ``fig_box`` shrink as well).  ``text_box`` is not used.

    Returns:
        ``(figures, captions)``, two dicts keyed by page file name.
    """
    min_side = 30  # a lone figure must be wider and taller than this
    figures = {}
    captions = {}

    def discard_overlapping(region, candidates):
        # Slice assignment keeps the caller's list object (shared with fig_box).
        candidates[:] = [box for box in candidates
                         if not overlap_ratio_based(region, box) > 0.1]

    for page_idx in range(info['page_no']):
        page = 'page' + str(page_idx + 1) + '.png'
        table_caps = table_box[page]
        p_captions = cap_box[page]
        p_figures = fig_box[page]

        # Tables are not figures: clear the area below and above each caption.
        for table_cap in table_caps:
            below = [table_cap[0], table_cap[1] + table_cap[3], table_cap[2],
                     4 * info['row_height']]
            discard_overlapping(below, p_figures)
            above = [table_cap[0], table_cap[1] - 4 * info['row_height'],
                     table_cap[2], 4 * info['row_height']]
            discard_overlapping(above, p_figures)

        if not p_figures:
            captions[page] = []
            figures[page] = []
            continue

        # Shortcut: exactly one big-enough figure next to exactly one caption.
        if len(p_figures) == 1 and len(p_captions) == 1:
            only_fig = p_figures[0]
            only_cap = p_captions[0]
            if (only_fig[2] > min_side and only_fig[3] > min_side
                    and bbox_distance(only_fig, only_cap) < 50):
                figures[page] = [[only_fig, only_cap]]
                captions[page] = [[only_cap,
                                   [1, 1, info['page_width'] - 2,
                                    info['page_height'] - 2]]]

        # General case (and fallback when the shortcut did not apply):
        # figures that fall into the same caption region share a label.
        if page not in figures:
            regions = caption_regions(p_captions, p_figures, info)
            captions[page] = regions
            figures[page] = label_subfig(info, p_figures, regions, table_box)

        # No caption on the page: if the objects cover enough of the text
        # area (and page_idx > 1), treat the bottom margin box as the caption.
        if len(p_captions) == 0:
            sum_area = sum(obj[2] * obj[3] for obj in p_figures)
            page_width = (info['page_width'] - info['left_bbox'][2]
                          - info['right_bbox'][2])
            page_height = (info['page_height'] - info['top_bbox'][3]
                           - info['down_bbox'][3])
            if float(sum_area) / (page_width * page_height) > 0.2 and page_idx > 1:
                captions[page] = [[info['down_bbox'],
                                   [1, 1, info['page_width'] - 2,
                                    info['page_height'] - 2]]]
                figures[page] = label_subfig(info, p_figures, captions[page],
                                             table_box)

    return figures, captions
339 | 
def same_no_caps_est(cap_box, fig_box, info, table_box, text_box):
    """Build caption regions per page and label the figures inside them.

    A page with one caption gets the whole page as that caption's region, a
    page with several captions is split by ``caption_regions``, and a page
    without captions gets no region.  ``text_box`` is not used.

    Returns:
        ``(figures, cap_regions)``, two dicts keyed like ``cap_box``.
    """
    cap_regions = {}
    for page in cap_box:
        page_caps = cap_box[page]
        how_many = len(page_caps)
        if how_many == 1:
            whole_page = [0, 0, info['page_width'], info['page_height']]
            cap_regions[page] = [[page_caps[0], whole_page]]
        elif how_many > 1:
            cap_regions[page] = caption_regions(page_caps, fig_box[page], info)
        else:
            cap_regions[page] = []

    # Figures overlapping the same caption region receive the same label.
    figures = {}
    for page in cap_regions:
        figures[page] = label_subfig(info, fig_box[page], cap_regions[page],
                                     table_box)

    return figures, cap_regions
363 | 
def caption_regions(cap_box, fig_box, info):
    """Split one page into the regions searched for each caption's figure.

    Boxes and regions are ``[x, y, width, height]`` lists.  With a single
    caption the region is (almost) the whole page.  With several captions the
    page is cut from top to bottom: each caption owns the space between the
    previous caption in its column and itself, and the space left under the
    last caption is appended as extra region(s) tied to the last caption.
    ``fig_box`` is not used.

    Returns:
        A list of ``[caption_box, region]`` pairs; empty without captions.
    """
    n_cols = info['column_no']
    col_starts = info['columns']
    cursor = [1] * n_cols  # next free y position in each column
    regions = []

    if len(cap_box) == 0:
        return regions

    if len(cap_box) == 1:
        margin = 2 * info['row_height']
        regions.append([cap_box[0],
                        [1, 1, info['page_width'] - margin,
                         info['page_height'] - margin]])
        return regions

    if n_cols == 1:
        for cap in sorted(cap_box, key=lambda box: box[1]):
            top = cursor[0]
            regions.append([cap, [1, top, info['page_width'] - 2, cap[1] - top]])
            cursor[0] = cap[1] + cap[3]
        # Remaining space under the last caption.
        regions.append([cap, [1, cursor[0], info['page_width'] - 2,
                              info['page_height'] - cursor[0]]])
        return regions

    last_spans_page = False
    for cap in sorted(cap_box, key=lambda box: (box[1], box[0])):
        cap_x = cap[0]
        cap_y = cap[1]
        # A caption wider than a text row, or crossing the page centre,
        # belongs to a figure that spans all columns.
        last_spans_page = (
            cap[2] > info['row_width'] + 50
            or (cap_x < info['page_width'] / 2
                and (cap_x + cap[2]) > info['page_width'] / 2))
        if last_spans_page:
            top = max(cursor)
            region = [1, top, info['page_width'] - 2, cap_y - top]
            cursor = [cap_y + cap[3]] * n_cols
        elif cap_x < col_starts[0] + 100:
            region = [cap_x, cursor[0], info['row_width'], cap_y - cursor[0]]
            cursor[0] = cap_y + cap[3]
        elif cap_x < col_starts[0] + info['row_width'] - 100:
            region = [1, cursor[0], col_starts[0] + info['row_width'],
                      cap_y - cursor[0]]
            cursor[0] = cap_y + cap[3]
        else:
            left = min(cap_x, col_starts[0] + info['row_width'] + 50)
            region = [left, cursor[1], info['page_width'] - left,
                      cap_y - cursor[1]]
            cursor[1] = cap_y + cap[3]
        regions.append([cap, region])

    # Cover the rest of the page, for images placed below their caption.
    if last_spans_page:
        top = max(cursor)
        regions.append([cap, [1, top, info['page_width'] - 2,
                              info['page_height'] - top]])
    else:
        regions.append([cap, [0, cursor[0], info['page_width'] / 2,
                              info['page_height'] - cursor[0] - 1]])
        regions.append([cap, [info['page_width'] / 2, cursor[1],
                              info['page_width'] / 2,
                              info['page_height'] - cursor[1] - 1]])
    return regions
452 | 
453 | 
def label_subfig(info, figures, cap_regions, table_box):
    """Group figure boxes by caption region and merge each group into one box.

    A figure box is attached to a caption when more than 20% of the figure
    lies inside the caption's search region and less than 50% of the caption
    box lies inside the figure (i.e. the caption is not swallowed by it).
    A figure may be attached to several captions.

    Args:
        info: Unused; kept so existing callers keep working.
        figures: Iterable of figure boxes ``[x, y, width, height]``.
        cap_regions: List of ``[caption_box, search_region]`` pairs.
        table_box: Unused; kept so existing callers keep working.

    Returns:
        A list of ``[figure_box, caption_box]`` pairs, in caption order.
        Captions whose figure is 20 units or smaller in either dimension,
        or whose merged box is filled to 10% or less by its parts, are
        omitted.
    """
    min_side = 20  # Fig threshold: boxes this small or smaller are dropped
    fig_merged = []

    for cap_region in cap_regions:
        caption_box = cap_region[0]
        search_region = cap_region[1]

        members = []
        for figure in figures:
            overlap = overlap_ratio_based(figure, search_region)
            # Share of the caption covered by the figure: used to detect a
            # caption that lies inside the candidate figure.
            cover = overlap_ratio_based(caption_box, figure)
            if overlap > 0.2 and cover < 0.5:  # The overlap need to set carefully
                members.append(figure)

        if not members:
            continue

        if len(members) == 1:
            single = members[0]
            # Both dimensions must pass the threshold (the width used to be
            # tested twice and the height never).
            if single[2] > min_side and single[3] > min_side:
                fig_merged.append([single, caption_box])
            continue

        left = min(box[0] for box in members)
        top = min(box[1] for box in members)
        right = max(box[0] + box[2] for box in members)
        bottom = max(box[1] + box[3] for box in members)
        new_fig = [left, top, right - left, bottom - top]

        if new_fig[2] > min_side and new_fig[3] > min_side:
            # Reject merged boxes that are mostly empty space between parts.
            parts_area = sum(box[2] * box[3] for box in members)
            fill_ratio = float(parts_area) / (new_fig[2] * new_fig[3])
            if fill_ratio > 0.1:
                fig_merged.append([new_fig, caption_box])

    return fig_merged
524 | 
def _grow_figure_box(new_fig, elements, figcap, fig_box, row_height):
    """Expand new_fig with page elements that overlap figcap or nearly touch fig_box."""
    for element in elements:
        if overlap_ratio_based(element[0], figcap) > 0.05:
            new_fig = merge_two_boxes(new_fig, element[0])
    for element in elements:
        if bbox_distance(element[0], fig_box) < row_height / 4:
            new_fig = merge_two_boxes(new_fig, element[0])
    return new_fig


def _continue_caption(element, first_line_box, moving_box, cap_box, cap_text,
                      cap_gap, active, row_height,
                      stop_after_short_sentence=False):
    """Decide whether one page element continues the caption being collected.

    Appends the element's text to cap_text when it is accepted and returns the
    updated ``(cap_gap, moving_box, cap_box, active)`` state.
    """
    box = element[0]
    # Line spacing estimate: the smallest gap seen below the first caption
    # line, never less than 3.
    cap_gap = max(min(box[1] - first_line_box[1] - first_line_box[3], cap_gap), 3)
    current_gap = box[1] - moving_box[1] - moving_box[3]

    if current_gap >= max(0.5 * row_height, cap_gap):
        active = False
    elif ((box[2] - first_line_box[2] > 5 * row_height
           or box[3] - first_line_box[3] > 1)
          and current_gap - cap_gap > 3):
        # Noticeably wider or taller line after a larger-than-usual gap.
        active = False

    if abs(first_line_box[0] - box[0]) > 10 * row_height:
        # Horizontally far from the caption: skip the element without
        # ending the caption.
        active = True
    elif active:
        moving_box = box
        cap_box = merge_two_boxes(cap_box, box)
        cap_text.append(element[1])
        if (stop_after_short_sentence
                and first_line_box[2] - box[2] > 5 * row_height
                and element[1].endswith('.')):
            # A clearly shorter line ending with a full stop closes the caption.
            active = False
    return cap_gap, moving_box, cap_box, active


def _collect_caption(elements, cap_box, row_height, initial_gap,
                     stop_after_short_sentence=False):
    """Collect the caption that starts at the element whose box equals cap_box.

    Returns ``(cap_box, cap_text)``: the merged caption box and its text lines.
    """
    cap_text = []
    cap_gap = initial_gap
    active = False
    first_line_box = None
    moving_box = None
    for element in elements:
        if element[0] == cap_box:
            active = True
            cap_text.append(element[1])
            first_line_box = cap_box
            moving_box = cap_box
        elif active:
            cap_gap, moving_box, cap_box, active = _continue_caption(
                element, first_line_box, moving_box, cap_box, cap_text,
                cap_gap, active, row_height, stop_after_short_sentence)
    return cap_box, cap_text


def _collect_detached_caption(elements, label_box, row_height):
    """Collect caption text that is laid out separately from its short label.

    Scanning starts at the element whose box equals label_box; the first
    later element that lies lower on the page and has more than 30 characters
    becomes the first caption line. Returns ``(cap_box, cap_text)``; cap_text
    is empty and cap_box is label_box when no such line exists.
    """
    cap_box = label_box
    cap_text = []
    cap_gap = 0.5 * row_height
    active = False
    started = False
    first_line_box = None
    moving_box = None
    for element in elements:
        box = element[0]
        if not (box == cap_box or active):
            continue
        if not started:
            active = True
            if box[1] > cap_box[1] and len(element[1]) > 30:
                started = True
                cap_text.append(element[1])
                first_line_box = box
                moving_box = box
                cap_box = box
        else:
            cap_gap, moving_box, cap_box, active = _continue_caption(
                element, first_line_box, moving_box, cap_box, cap_text,
                cap_gap, active, row_height)
    return cap_box, cap_text


def evaluation(prefigures, cap_regions, html_file_path, info, html_boxes):
    """Refine figure boxes and collect full caption boxes/text per page.

    Args:
        prefigures: Dict mapping a page key to a list of
            ``[figure_box, caption_box]`` pairs (boxes are
            ``[x, y, width, height]``). The page number is parsed with
            ``int(page[4:-4])`` -- presumably keys look like ``page<N>.png``;
            verify against the caller.
        cap_regions: Unused; kept so existing callers keep working.
        html_file_path: Unused; kept so existing callers keep working.
        info: Dict providing ``'down_bbox'``, ``'mess_up'`` and
            ``'row_height'``.
        html_boxes: Iterable of ``[page_number, elements]`` where each element
            is indexed as ``element[0]`` (box) and ``element[1]`` (text).

    Returns:
        ``(figures, captions)``: dicts keyed like ``prefigures``. Each figure
        entry is ``[figure_box, [caption_box, caption_lines]]`` and each
        caption entry ``[caption_box, caption_lines]``; pairs whose caption
        lies (more than 80%) inside ``info['down_bbox']`` get ``[]`` instead.
        The caption box and text list are shared between both structures.
    """
    figures = {}
    captions = {}
    for page in prefigures:
        figures[page] = []
        captions[page] = []
        for each_figcap in prefigures[page]:
            fig_box = each_figcap[0]
            cap_box = each_figcap[1]
            new_fig = fig_box
            caption_flag = overlap_ratio_based(info['down_bbox'], cap_box)
            # Kept as an equality test so that only False/0 selects the
            # HTML-assisted path, exactly as before.
            use_html = info['mess_up'] == False

            if caption_flag > 0.8:
                # No caption is reported for this pair; only the figure box
                # is grown from the page elements.
                if use_html:
                    for each_page_html in html_boxes:
                        if each_page_html[0] == int(page[4:-4]):
                            new_fig = _grow_figure_box(
                                new_fig, each_page_html[1], fig_box, fig_box,
                                info['row_height'])
                figures[page].append([new_fig, []])
                captions[page].append([])
                continue

            # Box spanning the figure and reaching the caption's top edge
            # (the caption's own height is not included).
            x0 = min(fig_box[0], cap_box[0])
            y0 = min(fig_box[1], cap_box[1])
            x1 = max(fig_box[0] + fig_box[2], cap_box[0] + cap_box[2])
            y1 = max(fig_box[1] + fig_box[3], cap_box[1])
            figcap = [x0, y0, x1 - x0, y1 - y0]

            for each_page_html in html_boxes:
                if each_page_html[0] != int(page[4:-4]):
                    continue
                elements = each_page_html[1]
                row_height = info['row_height']

                if use_html:
                    new_fig = _grow_figure_box(new_fig, elements, figcap,
                                               fig_box, row_height)
                    cap_box, cap_text = _collect_caption(
                        elements, cap_box, row_height, 0.5 * row_height)

                    # Finding separate captions: a lone short line ending in
                    # a digit is treated as a label whose text sits elsewhere.
                    # The slice keeps an empty text line from raising.
                    if (len(cap_text) == 1
                            and cap_text[0][-1:].isdigit()
                            and len(cap_text[0]) < 15):
                        label_text = cap_text
                        cap_box, cap_text = _collect_detached_caption(
                            elements, cap_box, row_height)
                        if not cap_text:
                            # No body line found: keep the label text rather
                            # than reporting a caption with no text.
                            cap_text = label_text

                    figures[page].append([new_fig, [cap_box, cap_text]])
                    captions[page].append([cap_box, cap_text])
                else:
                    cap_box, cap_text = _collect_caption(
                        elements, cap_box, row_height, row_height,
                        stop_after_short_sentence=True)
                    captions[page].append([cap_box, cap_text])
                    figures[page].append([fig_box, [cap_box, cap_text]])

    return figures, captions
734 | 
def check_region(info, figures, captions):
    """Clip caption boxes and cut captions out of the figure boxes they sit in.

    For every figure entry ``[figure_box, [caption_box, ...]]`` that has a
    caption:

    * the caption box is clipped (in place) at ``info['right_bbox'][0]``;
    * a tall caption under the left half of a figure wider than 1.5 columns
      is widened (in place);
    * when more than 80% of the caption lies inside the figure box, the
      figure box is replaced by the part on the far side of the caption
      (caption at the top, bottom, right or left edge of the figure).

    Boxes are ``[x, y, width, height]`` with y growing downwards.

    Args:
        info: Dict providing ``'row_height'``, ``'row_width'`` and
            ``'right_bbox'``.
        figures: Dict mapping page key to a list of figure entries; modified
            in place.
        captions: Returned unchanged by this function itself.

    Returns:
        ``(figures, captions)``, the same objects that were passed in.
    """
    for page in figures:
        for each_figure in figures[page]:
            if len(each_figure[1]) == 0:
                continue

            row_height = info['row_height']
            fig = each_figure[0]
            cap = each_figure[1][0]
            # Measured before the caption box is adjusted below.
            caption_overlap_ratio = overlap_ratio_based(cap, fig)

            text_right = info['right_bbox'][0]
            if cap[0] + cap[2] > text_right:
                cap[2] = text_right - cap[0]

            fig_right = fig[0] + fig[2]
            fig_bottom = fig[1] + fig[3]
            fig_mid_x = fig[0] + fig[2] / 2

            # for two column documents
            if (fig[2] > 1.5 * info['row_width']
                    and cap[1] > fig_bottom
                    and cap[0] + cap[2] < fig_mid_x
                    and cap[3] > 3 * row_height):
                cap[2] = 2 * cap[2] + 2 * row_height

            if caption_overlap_ratio > 0.8:
                # Splitting the caption box and the figure box.
                cap_right = cap[0] + cap[2]
                cap_bottom = cap[1] + cap[3]
                spans_centre = cap[0] < fig_mid_x and cap_right > fig_mid_x

                # top caption: keep what lies below the caption
                if (cap[1] >= fig[1]
                        and cap[1] - fig[1] < 2 * row_height
                        and spans_centre
                        and fig_bottom - cap[1] - cap[3] > 5 * row_height):
                    each_figure[0] = [fig[0], cap_bottom, fig[2],
                                      fig_bottom - cap[1] - cap[3]]
                # down caption: keep what lies above the caption. The new
                # height is the distance from the figure top to the caption
                # top (it used to be the height of the caption strip).
                elif (fig_bottom >= cap_bottom
                        and fig_bottom - cap[1] - cap[3] < 2 * row_height
                        and spans_centre
                        and cap[1] - fig[1] > 5 * row_height):
                    each_figure[0] = [fig[0], fig[1], fig[2], cap[1] - fig[1]]
                # right caption: keep what lies left of the caption
                elif (cap_right <= fig_right
                        and fig_right - cap[0] - cap[2] < 2 * row_height
                        and cap[0] > fig_mid_x
                        and cap[0] - fig[0] > 5 * row_height):
                    each_figure[0] = [fig[0], fig[1], cap[0] - fig[0], fig[3]]
                # left caption: keep what lies right of the caption. The
                # figure now starts at the caption's right edge (it used to
                # start at its left edge, leaving the caption inside).
                elif (cap[0] >= fig[0]
                        and cap[0] - fig[0] < 2 * row_height
                        and cap_right < fig_mid_x
                        and fig_right - cap_right > 5 * row_height):
                    each_figure[0] = [cap_right, fig[1],
                                      fig_right - cap_right, fig[3]]

    return figures, captions
783 | 
def merge_boxes(figures, cap_regions, table_box, info):
    """Merge the figure boxes that belong to the same caption region.

    A figure box is assigned to a caption region when more than half of the
    figure lies inside the region; if several regions qualify, the last one
    wins. All boxes assigned to one region are replaced by their bounding box.

    The earlier draft of this function raised on ordinary list input
    (indexing a list with a list of indices, and calling the builtin
    ``min`` on a 2-D array) and never returned a result. Its unfinished
    pairwise-distance computation, whose result was never used, is omitted.

    Args:
        figures: Sequence of figure boxes ``[x, y, width, height]``.
        cap_regions: List of ``[caption_box, search_region]`` pairs.
        table_box: Unused; kept so existing callers keep working.
        info: Unused; kept so existing callers keep working.

    Returns:
        A list with one ``[x, y, width, height]`` box per caption region that
        received at least one figure, in caption order.
    """
    label = [-1] * len(figures)
    for j, figure in enumerate(figures):
        for i, cap_region in enumerate(cap_regions):
            if overlap_ratio_based(figure, cap_region[1]) > 0.5:
                label[j] = i

    fig_merged = []
    for i in range(len(cap_regions)):
        check_box = [figures[no] for no, owner in enumerate(label) if owner == i]
        if not check_box:
            continue
        if len(check_box) == 1:
            fig_merged.append(check_box[0])
            continue
        left = min(box[0] for box in check_box)
        top = min(box[1] for box in check_box)
        right = max(box[0] + box[2] for box in check_box)
        bottom = max(box[1] + box[3] for box in check_box)
        fig_merged.append([left, top, right - left, bottom - top])

    return fig_merged
831 | 
832 | 
def overlap_ratio_based(box1, box2):
    """Return the share of box1's area that is covered by box2.

    Both boxes are ``[x, y, width, height]``. The ratio is relative to box1
    only, so the function is not symmetric. Returns 0 when box1 has no area.
    """
    left1, top1 = box1[0], box1[1]
    right1, bottom1 = left1 + box1[2], top1 + box1[3]

    left2, top2 = box2[0], box2[1]
    right2, bottom2 = left2 + box2[2], top2 + box2[3]

    # Extent of the intersection along each axis, clamped at zero when the
    # boxes do not overlap on that axis.
    shared_width = max(0, min(right1, right2) - max(left1, left2))
    shared_height = max(0, min(bottom1, bottom2) - max(top1, top2))
    shared_area = shared_width * shared_height

    area1 = box1[2] * box1[3]
    if area1 == 0:
        return 0
    return float(shared_area) / area1
855 | 
def bbox_distance(bbox1, bbox2):
    """Return the gap between two ``[x, y, width, height]`` boxes.

    Boxes that overlap or touch are at distance 0. When the boxes are
    separated along one axis only, the gap along that axis is returned; when
    they are separated along both, the Manhattan distance between the two
    nearest corners is returned.
    """
    ax0, ay0 = bbox1[0], bbox1[1]
    ax1, ay1 = ax0 + bbox1[2], ay0 + bbox1[3]
    bx0, by0 = bbox2[0], bbox2[1]
    bx1, by1 = bx0 + bbox2[2], by0 + bbox2[3]

    # Position of bbox2 relative to bbox1 (strictly separated only).
    b_is_left = bx1 < ax0
    b_is_right = ax1 < bx0
    b_is_above = by1 < ay0   # smaller y values than bbox1
    b_is_below = ay1 < by0   # larger y values than bbox1

    # Separated on both axes: measure between the facing corners.
    corner_cases = (
        (b_is_below and b_is_left, (ax0, ay1), (bx1, by0)),
        (b_is_left and b_is_above, (ax0, ay0), (bx1, by1)),
        (b_is_above and b_is_right, (ax1, ay0), (bx0, by1)),
        (b_is_right and b_is_below, (ax1, ay1), (bx0, by0)),
    )
    for applies, corner_a, corner_b in corner_cases:
        if applies:
            return manhattan_dist(corner_a, corner_b)

    # Separated on one axis only: the gap along that axis.
    edge_cases = (
        (b_is_left, ax0 - bx1),
        (b_is_right, bx0 - ax1),
        (b_is_above, ay0 - by1),
        (b_is_below, by0 - ay1),
    )
    for applies, gap in edge_cases:
        if applies:
            return gap

    # rectangles intersect
    return 0
887 | 
def manhattan_dist(a, b):
    """Return the sum of absolute coordinate differences between a and b.

    Coordinates are paired positionally; extra coordinates of the longer
    sequence are ignored.
    """
    total = 0
    for coord_a, coord_b in zip(a, b):
        total += abs(coord_a - coord_b)
    return total
890 | 
def merge_two_boxes(bbox1, bbox2):
    """Return the smallest ``[x, y, width, height]`` box enclosing both boxes."""
    pair = (bbox1, bbox2)
    left = min(box[0] for box in pair)
    top = min(box[1] for box in pair)
    right = max(box[0] + box[2] for box in pair)
    bottom = max(box[1] + box[3] for box in pair)
    return [left, top, right - left, bottom - top]
897 | 


--------------------------------------------------------------------------------