├── tests ├── bench.odt ├── run_dumps.sh ├── memory.py ├── dump_file.py ├── dump_file_pdfminer.py └── dump_file_gi_poppler.py ├── pdfparser ├── .gitignore ├── __init__.py └── poppler.pyx ├── test_docs ├── test1.odt └── test1.pdf ├── .github └── workflows │ └── python-package.yml ├── .gitignore ├── setup.py └── README.md /tests/bench.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/izderadicka/pdfparser/HEAD/tests/bench.odt -------------------------------------------------------------------------------- /pdfparser/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | /libpoppler.so.58 3 | /poppler.cpp 4 | /poppler.so 5 | -------------------------------------------------------------------------------- /pdfparser/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.1.3' 2 | 3 | def version(): 4 | return __version__ -------------------------------------------------------------------------------- /test_docs/test1.odt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/izderadicka/pdfparser/HEAD/test_docs/test1.odt -------------------------------------------------------------------------------- /test_docs/test1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/izderadicka/pdfparser/HEAD/test_docs/test1.pdf -------------------------------------------------------------------------------- /tests/run_dumps.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | echo 'PDF parsing with libpoppler' 3 | time python -d dump_file.py test_docs/test1.pdf > /dev/null 4 | echo 'PDF parsing with pdfminer' 5 | time python tests/dump_file_pdfminer.py test_docs/test1.pdf > /dev/null 6 | 
-------------------------------------------------------------------------------- /tests/memory.py: -------------------------------------------------------------------------------- 1 | import pdfparser.poppler as pdf 2 | import sys 3 | import os.path 4 | import psutil 5 | 6 | def get_mem(): 7 | p=psutil.Process() 8 | m=p.memory_info() 9 | return m.vms 10 | test_doc=os.path.join(os.path.dirname(__file__), '../test_docs/test1.pdf') 11 | 12 | 13 | start_mem=get_mem() 14 | for counti in range(10000): 15 | imem=get_mem() 16 | 17 | d=pdf.Document(test_doc) 18 | 19 | pages=d.no_of_pages 20 | for p in d: 21 | pg_info= p.page_no, p.size 22 | for f in p: 23 | for b in f: 24 | bbox= b.bbox.as_tuple() 25 | for l in b: 26 | line_info = l.text.encode('UTF-8'), l.bbox.as_tuple() 27 | #assert l.char_fonts.comp_ratio < 1.0 28 | for i in range(len(l.text)): 29 | char_info = l.text[i].encode('UTF-8'), l.char_bboxes[i].as_tuple(), \ 30 | l.char_fonts[i].name, l.char_fonts[i].size, l.char_fonts[i].color, 31 | incr=(get_mem()-imem) 32 | if incr>0: 33 | print 'Iter no. 
%d'%counti, 34 | print 'Memory: %d' % get_mem(), 'Increase %d' % (get_mem()-imem) 35 | 36 | print "Final memory increase %d" % ( get_mem() - start_mem) -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install dependencies, run tests with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python build and test 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["2.7","3.6","3.7", "3.8", "3.9"] 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v2 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | # python -m pip install flake8 pytest 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | sudo apt-get update 33 | sudo apt-get install -y libtool pkg-config gettext fontconfig libfontconfig1-dev cmake libzip-dev libjpeg-dev 34 | git config --global http.sslVerify "false" 35 | git clone --branch poppler-0.62.0 --depth 1 https://anongit.freedesktop.org/git/poppler/poppler.git poppler_src 36 | cd poppler_src/ 37 | cmake -DENABLE_SPLASH=OFF -DENABLE_UTILS=OFF -DENABLE_LIBOPENJPEG=none . 38 | make 39 | cp libpoppler.so.?? ../pdfparser/ 40 | cp cpp/libpoppler-cpp.so.? ../pdfparser 41 | cd .. 42 | pip install cython 43 | POPPLER_ROOT=poppler_src python setup.py build_ext --inplace 44 | - name: Test on one document 45 | run: | 46 | cd tests 47 | PYTHONPATH=.. 
python dump_file.py ../test_docs/test1.pdf | grep zasahovat 48 | -------------------------------------------------------------------------------- /tests/dump_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | 4 | import pdfparser.poppler as pdf 5 | import argparse 6 | import sys 7 | 8 | p=argparse.ArgumentParser() 9 | p.add_argument('document', help='Document file') 10 | p.add_argument('--char-details', action='store_true', help='print character details') 11 | p.add_argument('-f', '--first-page', type=int, help='first page') 12 | p.add_argument('-l', '--last-page', type=int, help='first page') 13 | p.add_argument('--phys-layout', action='store_true', help='Physical Layout - param for text analysis') 14 | p.add_argument('--fixed-pitch', type=float, default=0.0, help='Fixed pitch - param for text analysis - app. max space size') 15 | p.add_argument('-q', '--quiet', action='store_true', help='Silence all output from poppler') 16 | args=p.parse_args() 17 | file_name = args.document if sys.version_info[0] <= 2 else bytes(args.document, "utf-8") 18 | d=pdf.Document(file_name, args.phys_layout, args.fixed_pitch, args.quiet) # @UndefinedVariable 19 | fp=args.first_page or 1 20 | lp=args.last_page or d.no_of_pages 21 | print('No of pages', d.no_of_pages) 22 | for p in d: 23 | if p.page_no< fp or p.page_no>lp: 24 | continue 25 | print ('Page', p.page_no, 'size =', p.size) 26 | for f in p: 27 | print (' '*1,'Flow') 28 | for b in f: 29 | print (' '*2,'Block', 'bbox=', b.bbox.as_tuple()) 30 | for l in b: 31 | print (' '*3, l.text.encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)'% l.bbox.as_tuple()) 32 | #assert l.char_fonts.comp_ratio < 1.0 33 | if args.char_details: 34 | for i in range(len(l.text)): 35 | print (l.text[i].encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)'% l.char_bboxes[i].as_tuple(),\ 36 | l.char_fonts[i].name, l.char_fonts[i].size, l.char_fonts[i].color, 
) 37 | print() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /poppler_src 2 | /.project 3 | /.pydevproject 4 | /.externalToolBuilders/ 5 | /build/ 6 | /pdfparser/libpoppler* 7 | 8 | ### Linux ### 9 | *~ 10 | 11 | # temporary files which can be created if a process still has a handle open of a deleted file 12 | .fuse_hidden* 13 | 14 | # KDE directory preferences 15 | .directory 16 | 17 | # Linux trash folder which might appear on any partition or disk 18 | .Trash-* 19 | 20 | 21 | ### Python ### 22 | # Byte-compiled / optimized / DLL files 23 | __pycache__/ 24 | *.py[cod] 25 | *$py.class 26 | 27 | # C extensions 28 | *.so 29 | 30 | # Distribution / packaging 31 | .Python 32 | env/ 33 | build/ 34 | develop-eggs/ 35 | dist/ 36 | downloads/ 37 | eggs/ 38 | .eggs/ 39 | lib/ 40 | lib64/ 41 | parts/ 42 | sdist/ 43 | var/ 44 | *.egg-info/ 45 | .installed.cfg 46 | *.egg 47 | 48 | # PyInstaller 49 | # Usually these files are written by a python script from a template 50 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
51 | *.manifest 52 | *.spec 53 | 54 | # Installer logs 55 | pip-log.txt 56 | pip-delete-this-directory.txt 57 | 58 | # Unit test / coverage reports 59 | htmlcov/ 60 | .tox/ 61 | .coverage 62 | .coverage.* 63 | .cache 64 | nosetests.xml 65 | coverage.xml 66 | *,cover 67 | .hypothesis/ 68 | 69 | # Translations 70 | *.mo 71 | *.pot 72 | 73 | # Django stuff: 74 | *.log 75 | local_settings.py 76 | 77 | # Flask instance folder 78 | instance/ 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # IPython Notebook 87 | .ipynb_checkpoints 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pycharm 93 | .idea 94 | 95 | ### C++ ### 96 | # Compiled Object files 97 | *.slo 98 | *.lo 99 | *.o 100 | *.obj 101 | 102 | # Precompiled Headers 103 | *.gch 104 | *.pch 105 | 106 | # Compiled Dynamic libraries 107 | *.so 108 | *.dylib 109 | *.dll 110 | 111 | # Fortran module files 112 | *.mod 113 | 114 | # Compiled Static libraries 115 | *.lai 116 | *.la 117 | *.a 118 | *.lib 119 | 120 | # Executables 121 | *.exe 122 | *.out 123 | *.app 124 | .~lock* 125 | -------------------------------------------------------------------------------- /tests/dump_file_pdfminer.py: -------------------------------------------------------------------------------- 1 | from pdfminer.pdfparser import PDFParser 2 | from pdfminer.pdfpage import PDFPage 3 | from pdfminer.pdfdocument import PDFDocument 4 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 5 | from pdfminer.converter import PDFPageAggregator 6 | from pdfminer.layout import LAParams, LTTextBox,LTChar, LTFigure 7 | import sys 8 | 9 | class PdfMinerWrapper(object): 10 | """ 11 | Usage: 12 | with PdfMinerWrapper('2009t.pdf') as doc: 13 | for page in doc.get_pages(): 14 | """ 15 | def __init__(self, pdf_doc, pdf_pwd=""): 16 | self.pdf_doc = pdf_doc 17 | self.pdf_pwd = pdf_pwd 18 | def __enter__(self): 19 | #open the pdf file 20 | self.fp = open(self.pdf_doc, 'rb') 21 | # create a parser 
object associated with the file object 22 | parser = PDFParser(self.fp) 23 | # create a PDFDocument object that stores the document structure 24 | doc = PDFDocument(parser, password=self.pdf_pwd) 25 | # connect the parser and document objects 26 | parser.set_document(doc) 27 | self.doc=doc 28 | return self 29 | 30 | def _parse_pages(self): 31 | rsrcmgr = PDFResourceManager() 32 | laparams = LAParams(char_margin=3.5, all_texts = True) 33 | device = PDFPageAggregator(rsrcmgr, laparams=laparams) 34 | interpreter = PDFPageInterpreter(rsrcmgr, device) 35 | 36 | for page in PDFPage.create_pages(self.doc): 37 | interpreter.process_page(page) 38 | # receive the LTPage object for this page 39 | layout = device.get_result() 40 | # layout is an LTPage object which may contain child objects like LTTextBox, LTFigure, LTImage, etc. 41 | yield layout 42 | def __iter__(self): 43 | return iter(self._parse_pages()) 44 | 45 | def __exit__(self, _type, value, traceback): 46 | self.fp.close() 47 | 48 | 49 | def main(): 50 | with PdfMinerWrapper(sys.argv[1]) as doc: 51 | for page in doc: 52 | print 'Page no.', page.pageid, 'Size', (page.height, page.width) 53 | for tbox in page: 54 | if not isinstance(tbox, LTTextBox): 55 | continue 56 | print ' '*1, 'Block', 'bbox=(%0.2f, %0.2f, %0.2f, %0.2f)'% tbox.bbox 57 | for obj in tbox: 58 | print ' '*2, obj.get_text().encode('UTF-8')[:-1], '(%0.2f, %0.2f, %0.2f, %0.2f)'% tbox.bbox 59 | for c in obj: 60 | if not isinstance(c, LTChar): 61 | continue 62 | print c.get_text().encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)'% c.bbox, c.fontname, c.size, 63 | print 64 | 65 | 66 | 67 | if __name__=='__main__': 68 | main() -------------------------------------------------------------------------------- /tests/dump_file_gi_poppler.py: -------------------------------------------------------------------------------- 1 | from gi.repository import Poppler, GLib 2 | import ctypes 3 | import sys 4 | import os.path 5 | lib_poppler = 
def get_page_layout(page):
    """Return the text-layout rectangles of *page* as a list of tuples.

    Calls ``poppler_page_get_text_layout`` from libpoppler-glib directly
    through ctypes and converts every returned rectangle into an
    ``(x1, y1, x2, y2)`` tuple. The list is empty when the call reports
    that the page has no text.

    Raises AssertionError if *page* is not a ``Poppler.Page`` or if the
    library reports text but hands back no rectangles.
    """
    assert isinstance(page, Poppler.Page)
    # Unwrap the raw C pointer behind the introspected object so it can be
    # passed to the ctypes prototype.
    capsule = page.__gpointer__
    page_addr = PyCapsule_GetPointer(capsule, None)
    # Out-parameters: the rectangle array and its length, filled in by the call.
    rectangles = LP_Poppler_Rectangle()
    n_rectangles = ctypes.c_uint(0)
    has_text = poppler_page_get_text_layout(page_addr, ctypes.byref(rectangles), ctypes.byref(n_rectangles))
    try:
        result = []
        if has_text:
            assert n_rectangles.value > 0, "n_rectangles.value > 0: {}".format(n_rectangles.value)
            assert rectangles, "rectangles: {}".format(rectangles)
            for i in range(n_rectangles.value):
                r = rectangles[i]
                result.append((r.x1, r.y1, r.x2, r.y2))
        return result
    finally:
        # Release the array whenever a non-NULL pointer came back, including
        # when one of the asserts above fails.
        # NOTE(review): presumably the array is caller-owned and must be
        # released with g_free - confirm against the Poppler GLib docs.
        if rectangles:
            GLib.free(ctypes.addressof(rectangles.contents))
# https://gist.github.com/smidm/ff4a2c079fed97a92e9518bd3fa4797c
def pkgconfig(*packages, **kw):
    """
    Query pkg-config for library compile and linking options. Return configuration in distutils
    Extension format.

    Usage:

    pkgconfig('opencv')

    pkgconfig('opencv', 'libavformat')

    pkgconfig('opencv', optional='--static')

    pkgconfig('opencv', config=c)

    returns e.g.

    {'extra_compile_args': [],
     'extra_link_args': [],
     'include_dirs': ['/usr/include/ffmpeg'],
     'libraries': ['avformat'],
     'library_dirs': []}

    Intended use:

    distutils.core.Extension('pyextension', sources=['source.cpp'], **c)

    Set PKG_CONFIG_PATH environment variable for nonstandard library locations.

    Keyword arguments:
      config   -- dict to extend in place (and return); a new dict by default
      optional -- extra pkg-config option(s), whitespace separated

    Raises OSError if pkg-config cannot be executed and
    subprocess.CalledProcessError if it exits with an error.

    based on work of Micah Dowty (http://code.activestate.com/recipes/502261-python-distutils-pkg-config/)
    """
    config = kw.setdefault('config', {})
    # Split so that the default '' contributes no argument at all (it used to
    # be passed to pkg-config as an empty argv entry) and several options can
    # be given in one string.
    optional_args = kw.setdefault('optional', '').split()
    # {<distutils Extension key>: (<pkg-config option>, <length of the flag prefix to strip>), ...}
    flag_map = {'include_dirs': ('--cflags-only-I', 2),
                'library_dirs': ('--libs-only-L', 2),
                'libraries': ('--libs-only-l', 2),
                'extra_compile_args': ('--cflags-only-other', 0),
                'extra_link_args': ('--libs-only-other', 0),
                }
    for package in packages:
        for distutils_key, (pkg_option, n) in flag_map.items():
            cmd = ['pkg-config'] + optional_args + [pkg_option, package]
            items = subprocess.check_output(cmd).decode('utf8').split()
            config.setdefault(distutils_key, []).extend([i[n:] for i in items])
    return config


# Major version number, optionally prefixed the way source directories are
# commonly named ("poppler-0" in "poppler-0.72.0").
_POPPLER_MAJOR_RE = re.compile(r'(?:poppler[-_]?)?(\d+)$')


# Poppler 0.72.0+ GooString.h uses c_str() instead of getCString()
def use_poppler_cstring(path):
    """Tell whether *path* names poppler 0.72.0 or newer.

    Every component of *path* is inspected, last one first. A component
    counts as a version when it has exactly three dot-separated parts whose
    first part is a number (optionally prefixed with ``poppler-``), e.g.
    ``0.72.0`` or ``poppler-0.72.0``. A bare version string such as
    ``'21.03.0'`` is accepted too.

    Returns True as soon as a component with major > 0 or minor >= 72 is
    found, False otherwise. Dotted components that are not versions
    (``my.project.dir``) are skipped; they used to raise ValueError.
    """
    for el in path.split(os.path.sep)[::-1]:
        version = el.split('.')
        if len(version) != 3:
            continue
        major = _POPPLER_MAJOR_RE.match(version[0])
        if major is None:
            continue
        try:
            if int(major.group(1)) > 0 or int(version[1]) >= 72:
                return True
        except ValueError:
            # Non-numeric minor part: not a version, keep looking.
            continue
    return False
# Mac OS build fix:
mac_compile_args = ["-std=c++11", "-stdlib=libc++", "-mmacosx-version-min=10.7"]
POPPLER_ROOT = os.environ.get('POPPLER_ROOT', None)
if POPPLER_ROOT:
    # Build against a locally compiled poppler tree; the shared libraries are
    # shipped inside the package and found at run time through $ORIGIN.
    POPPLER_CPP_LIB_DIR = os.path.join(POPPLER_ROOT, 'cpp/')
    poppler_ext = Extension('pdfparser.poppler', ['pdfparser/poppler.pyx'], language='c++',
                            extra_compile_args=mac_compile_args if sys.platform == 'darwin' else ["-std=c++11"],
                            include_dirs=[POPPLER_ROOT, os.path.join(POPPLER_ROOT, 'poppler')],
                            library_dirs=[POPPLER_ROOT, POPPLER_CPP_LIB_DIR],
                            runtime_library_dirs=['$ORIGIN'],
                            libraries=['poppler','poppler-cpp'],
                            cython_compile_time_env={'USE_CSTRING': use_poppler_cstring(POPPLER_ROOT)})
    package_data = {'pdfparser': ['*.so.*', 'pdfparser/*.so.*']}
else:
    # Build against the system-wide poppler described by pkg-config.
    poppler_config = pkgconfig("poppler", "poppler-cpp")
    # Mac OS build fix:
    if sys.platform == 'darwin':
        poppler_config.setdefault('extra_compile_args', []).extend(mac_compile_args)
        poppler_config.setdefault('extra_link_args', []).extend(mac_compile_args)

    # Ask pkg-config for the poppler version first: a system include dir such
    # as /usr/include/poppler carries no version in its path, which made
    # USE_CSTRING come out False even for poppler >= 0.72.
    try:
        poppler_version = subprocess.check_output(
            ['pkg-config', '--modversion', 'poppler']).decode('utf8').strip()
    except (OSError, subprocess.CalledProcessError):
        poppler_version = ''
    if poppler_version:
        use_cstring = use_poppler_cstring(poppler_version)
    else:
        # Previous heuristic: look for a version in the first include dir
        # (works for e.g. Homebrew's .../Cellar/poppler/X.Y.Z/include/poppler).
        # Tolerate pkg-config reporting no -I flags at all.
        include_dirs = poppler_config.get('include_dirs') or ['']
        use_cstring = use_poppler_cstring(include_dirs[0])

    poppler_config.setdefault('cython_compile_time_env', {}).update({
        'USE_CSTRING': use_cstring
    })
    poppler_ext = Extension('pdfparser.poppler', ['pdfparser/poppler.pyx'], language='c++', **poppler_config)
    package_data = {}

# get version from package
pkg_file = os.path.join(os.path.split(__file__)[0], 'pdfparser', '__init__.py')
with open(pkg_file) as version_source:
    m = re.search(r"__version__\s*=\s*'([\d.]+)'", version_source.read())
if not m:
    # print(sys.stderr, ...) printed the stream object to stdout instead of
    # writing the message to stderr.
    print('Cannot find version of package', file=sys.stderr)
    sys.exit(1)
version = m.group(1)

setup(name='pdfparser',
      version = version,
      classifiers=[
        # How mature is this project? Common values are
        #   3 - Alpha
        #   4 - Beta
        #   5 - Production/Stable
        'Development Status :: 4 - Beta',

        # Indicate who your project is intended for
        'Intended Audience :: Developers',
        'Topic :: Software Development :: PDF Parsing',

        'License :: OSI Approved :: GPLv3',

        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        ],
      description="python bindings for poppler",
      long_description="Binding for libpoppler with a focus on fast text extraction from PDF documents.",
      keywords='poppler pdf parsing mining extracting',
      url='https://github.com/izderadicka/pdfparser',
      install_requires=['cython', ],
      packages=['pdfparser', ],
      package_data=package_data,
      include_package_data=True,
      cmdclass={"build_ext": build_ext},
      ext_modules=[poppler_ext],  # a workaround since Extension is an old-style class
      # removed cythonize for the list in ext_modules
      zip_safe=False
      )
12 | 13 | 14 | Binding is written in [cython](http://cython.org/). 15 | 16 | Requires recent libpoppler >= 0.40 - so I'd recommend to build it from source to get latest library, 17 | but it works also with recent libpoppler library present in common linux distributions (then it requires 18 | dev package to build). See below for installation instructions. 19 | 20 | 21 | Available under GPL v3 or any later version license (libpoppler is also GPL). 22 | 23 | ## How to install 24 | 25 | Below are some instructions to install this package 26 | 27 | ### CentOS 7 - system-wide libpoppler (pkg-config method) 28 | 29 | Install the poppler-devel package (Tested with version 0.26.5-16.el7) 30 | 31 | yum install poppler-devel poppler-cpp-devel 32 | 33 | Install cython 34 | 35 | pip install cython 36 | 37 | Install the repo 38 | 39 | pip install git+https://github.com/izderadicka/pdfparser 40 | 41 | ### CentOS 7 - self compiled method 42 | 43 | Clone this repo and enter into the root folder 44 | 45 | cd /git/repos/ 46 | git clone https://github.com/izderadicka/pdfparser.git 47 | cd pdfparser 48 | 49 | Clone the poppler repo and install (similar to build_poppler.sh) 50 | 51 | yum install openjpeg2-devel libjpeg-turbo-devel cmake 52 | git clone --depth 1 https://anongit.freedesktop.org/git/poppler/poppler.git poppler_src 53 | cd poppler_src 54 | cmake -DENABLE_SPLASH=OFF -DENABLE_UTILS=OFF -DENABLE_LIBOPENJPEG=none . 55 | make 56 | cp libpoppler.so.?? ../pdfparser/ 57 | cp cpp/libpoppler-cpp.so.? ../pdfparser 58 | cd .. 
59 | POPPLER_ROOT=poppler_src python setup.py install 60 | 61 | 62 | ### Debian like - self compiled method (with local poppler library) 63 | 64 | ``` 65 | git clone --depth 1 https://github.com/izderadicka/pdfparser.git 66 | cd pdfparser 67 | ./build_poppler.sh 68 | pip install cython 69 | POPPLER_ROOT=poppler_src ./setup.py install 70 | #test that it works 71 | python tests/dump_file.py test_docs/test1.pdf 72 | ``` 73 | 74 | ### Debian like - system wide libpoppler 75 | ``` 76 | sudo apt-get update 77 | sudo apt-get install -y libpoppler-private-dev libpoppler-cpp-dev 78 | pip install cython 79 | pip install git+https://github.com/izderadicka/pdfparser 80 | ``` 81 | 82 | ### Mac OS 83 | ``` 84 | pip install cython 85 | pip install git+https://github.com/izderadicka/pdfparser 86 | ``` 87 | 88 | ## Usage 89 | 90 | See `tests/dump_file.py` for available arguments and some basic example. 91 | ``` 92 | usage: dump_file.py [-h] [--char-details] [-f FIRST_PAGE] [-l LAST_PAGE] 93 | [--phys-layout] [--fixed-pitch FIXED_PITCH] [-q] 94 | document 95 | 96 | positional arguments: 97 | document Document file 98 | 99 | optional arguments: 100 | -h, --help show this help message and exit 101 | --char-details print character details 102 | -f FIRST_PAGE, --first-page FIRST_PAGE 103 | first page 104 | -l LAST_PAGE, --last-page LAST_PAGE 105 | first page 106 | --phys-layout Physical Layout - param for text analysis 107 | --fixed-pitch FIXED_PITCH 108 | Fixed pitch - param for text analysis - app. 
max space size 109 | -q, --quiet Silence all output from poppler 110 | ``` 111 | 112 | ## Speed comparisons 113 | 114 | | | pdfparser | pdfminer |speed-up factor| 115 | | --------------------------- | ------------- | ------------- |---------------| 116 | | tiny document (half page) | 0.033s | 0.121s | 3.6 x | 117 | | small document (5 pages) | 0.141s | 0.810s | 5.7 x | 118 | | medium document (55 pages) | 1.166s | 10.524s | 9.0 x | 119 | | large document (436 pages) | 10.581s | 108.095s | 10.2 x | 120 | 121 | 122 | pdfparser code used in test 123 | 124 | import pdfparser.poppler as pdf 125 | import sys 126 | 127 | d=pdf.Document(sys.argv[1]) 128 | 129 | print('No of pages', d.no_of_pages) 130 | for p in d: 131 | print('Page', p.page_no, 'size =', p.size) 132 | for f in p: 133 | print(' '*1,'Flow') 134 | for b in f: 135 | print(' '*2,'Block', 'bbox=', b.bbox.as_tuple()) 136 | for l in b: 137 | print(' '*3, l.text.encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)'% l.bbox.as_tuple()) 138 | #assert l.char_fonts.comp_ratio < 1.0 139 | for i in range(len(l.text)): 140 | print(l.text[i].encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)'% l.char_bboxes[i].as_tuple(),\ 141 | l.char_fonts[i].name, l.char_fonts[i].size, l.char_fonts[i].color,) 142 | print() 143 | 144 | ## How to modify parsing algorithm? 145 | 146 | As you probably know, PDF is a document format intended for printing, so all logical structure of the text 147 | is lost (paragraphs, columns, tables, etc.). libpoppler is trying to reconstruct some of this logical 148 | structure of the document back by comparing physical positions of characters on the page and their mutual 149 | distances and reconstructing back words, lines, paragraphs, columns. 150 | 151 | 152 | The component responsible for this reconstruction is the C++ class `TextOutputDev` (in poppler/TextOutputDev.cc). 153 | It uses many constants for this job; the vast majority of them are hardcoded. 
154 | Actually the only parameter that is available to Python code is the combination of parameters `phys_layout` and 155 | `fixed_pitch`, which influences how text is ordered into columns. If you set `phys_layout` to True and 156 | `fixed_pitch` to a value > 0, then `fixed_pitch` will be used as the maximum distance between words in a line and 157 | the minimum distance between columns (in pixels). I think `phys_layout` also influences the order of boxes 158 | in page iteration. However, the influence of these parameters is not quite straightforward - so you'll need to 159 | experiment to see how it works in your case. 160 | 161 | 162 | Another problem I encountered is vertical spacing between lines in a single box (paragraph) - this parameter 163 | is unfortunately fixed in libpoppler - it's the constant `maxLineSpacingDelta` in poppler/TextOutputDev.cc, which 164 | is set to 1.5 (font size). If you need to accept bigger line spacing in a paragraph, then you'll have to change it 165 | in C++ code and recompile libpoppler (in this case I recommend making the library local to the pdfparser package). 166 | I've tried with value 2.0 and it seems to work fine. 
167 | 168 | -------------------------------------------------------------------------------- /pdfparser/poppler.pyx: -------------------------------------------------------------------------------- 1 | from libcpp cimport bool 2 | from libcpp.string cimport string 3 | from cpython cimport bool as PyBool 4 | from cpython.object cimport Py_EQ, Py_NE 5 | 6 | ctypedef bool GBool 7 | DEF PRECISION=1e-6 8 | 9 | cdef extern from "cpp/poppler-version.h" namespace "poppler": 10 | cdef string version_string() 11 | 12 | def poppler_version(): 13 | return version_string() 14 | 15 | cdef extern from "GlobalParams.h": 16 | GlobalParams *globalParams 17 | cdef cppclass GlobalParams: 18 | void setErrQuiet(bool) 19 | bool getErrQuiet() 20 | # we need to init globalParams - just once during program run 21 | globalParams = new GlobalParams() 22 | 23 | IF USE_CSTRING: 24 | cdef extern from "goo/GooString.h": 25 | cdef cppclass GooString: 26 | GooString(const char *sA) 27 | int getLength() 28 | const char *c_str() 29 | char getChar(int i) 30 | ELSE: 31 | cdef extern from "goo/GooString.h": 32 | cdef cppclass GooString: 33 | GooString(const char *sA) 34 | int getLength() 35 | const char *getCString() 36 | char getChar(int i) 37 | cdef extern from "OutputDev.h": 38 | cdef cppclass OutputDev: 39 | pass 40 | 41 | cdef extern from 'Annot.h': 42 | cdef cppclass Annot: 43 | pass 44 | 45 | cdef extern from "PDFDoc.h": 46 | cdef cppclass PDFDoc: 47 | int getNumPages() 48 | void displayPage(OutputDev *out, int page, 49 | double hDPI, double vDPI, int rotate, 50 | GBool useMediaBox, GBool crop, GBool printing, 51 | GBool (*abortCheckCbk)(void *data) = NULL, 52 | void *abortCheckCbkData = NULL, 53 | GBool (*annotDisplayDecideCbk)(Annot *annot, void *user_data) = NULL, 54 | void *annotDisplayDecideCbkData = NULL, GBool copyXRef = False) 55 | double getPageMediaWidth(int page) 56 | double getPageMediaHeight(int page) 57 | 58 | cdef extern from "PDFDocFactory.h": 59 | cdef cppclass PDFDocFactory: 60 
# Value passed as both the horizontal and the vertical DPI when rendering pages.
cdef double RESOLUTION=72.0


cdef class Document:
    """A PDF document opened through poppler; iterating it yields Page objects.

    The object is its own iterator and keeps a single page cursor, so the
    pages can be iterated over only once per Document instance. Individual
    pages can also be requested with get_page().
    """
    cdef:
        PDFDoc *_doc              # poppler document, deleted in __dealloc__
        int _pg                   # number of the page returned last by __next__ (0 = none yet)
        PyBool phys_layout        # handed to TextOutputDev as physLayoutA
        double fixed_pitch        # handed to TextOutputDev as fixedPitchA
    # fname       - path of the document as a C string (bytes on Python 3)
    # phys_layout - physical-layout flag for the text analysis
    # fixed_pitch - fixed-pitch value for the text analysis
    # quiet       - when true, calls globalParams.setErrQuiet(True); that
    #               object is module-global, so the setting is not limited to
    #               this document
    def __cinit__(self, char *fname, PyBool phys_layout=False, double fixed_pitch=0, PyBool quiet=False):
        self._doc=PDFDocFactory().createPDFDoc(GooString(fname))
        self._pg=0
        self.phys_layout=phys_layout
        self.fixed_pitch=fixed_pitch

        if quiet:
            globalParams.setErrQuiet(True)

    # Release the underlying poppler document.
    def __dealloc__(self):
        if self._doc != NULL:
            del self._doc

    property no_of_pages:
        """Number of pages, as reported by PDFDoc.getNumPages()."""
        def __get__(self):
            return self._doc.getNumPages()

    # Render page `page_no` into `dev` at RESOLUTION dpi in both directions:
    # rotate=0, useMediaBox=True, crop=False, printing=False.
    cdef void render_page(self, int page_no, OutputDev *dev):
        self._doc.displayPage(dev, page_no, RESOLUTION, RESOLUTION, 0, True, False, False)

    # Return the media-box size of page `page_no` as a (width, height) tuple.
    cdef object get_page_size(self, page_no):
        cdef double w,h
        w=self._doc.getPageMediaWidth(page_no)
        h= self._doc.getPageMediaHeight(page_no)
        return (w,h)

    def __iter__(self):
        return self

    def get_page(self, int pg):
        """Return a new Page for page number `pg`; no range check is done here.

        __next__ passes numbers starting at 1.
        """
        return Page(pg, self)

    # Advance the cursor and return the next page; page numbers start at 1.
    def __next__(self):
        if self._pg >= self.no_of_pages:
            raise StopIteration()
        self._pg+=1
        return self.get_page(self._pg)



cdef class Page:
    """One rendered page; iterating it yields its text flows (Flow objects).

    The page is its own iterator with a single flow cursor, so the flows can
    be iterated over only once per Page instance.
    """
    cdef:
        int page_no               # page number this object was created for
        TextPage *page            # text extracted by TextOutputDev, released via decRefCnt()
        Document doc              # owning document; reference kept for the size lookup
        TextFlow *curr_flow       # next flow to be returned by __next__

    # Render page `page_no` of `doc` through a temporary TextOutputDev (no
    # output file, rawOrder=False, append=False), take the resulting TextPage
    # from it and position the cursor on its first flow.
    def __cinit__(self, int page_no, Document doc):
        cdef TextOutputDev *dev
        self.page_no=page_no
        dev = new TextOutputDev(NULL, doc.phys_layout, doc.fixed_pitch, False, False);
        doc.render_page(page_no, dev)
        self.page= dev.takeText()
        # The device is only needed for rendering; the TextPage stays referenced here.
        del dev
        self.curr_flow = self.page.getFlows()
        self.doc=doc

    # Drop this object's reference to the TextPage.
    def __dealloc__(self):
        if self.page != NULL:
            self.page.decRefCnt()

    def __iter__(self):
        return self

    # Wrap the current flow and move the cursor to the following one.
    # Flow's constructor reads self.curr_flow, so it has to be created
    # before the cursor is advanced.
    def __next__(self):
        cdef Flow f
        if not self.curr_flow:
            raise StopIteration()
        f=Flow(self)
        self.curr_flow=self.curr_flow.getNext()
        return f

    property page_no:
        """Page number this object was created with."""
        def __get__(self):
            return self.page_no

    property size:
        """Size of page as (width, height)"""
        def __get__(self):
            return self.doc.get_page_size(self.page_no)
pg): 210 | self.flow=pg.curr_flow 211 | self.curr_block=self.flow.getBlocks() 212 | 213 | def __iter__(self): 214 | return self 215 | 216 | def __next__(self): 217 | cdef Block b 218 | if not self.curr_block: 219 | raise StopIteration() 220 | b=Block(self) 221 | self.curr_block=self.curr_block.getNext() 222 | return b 223 | 224 | cdef class Block: 225 | cdef: 226 | TextBlock *block 227 | TextLine *curr_line 228 | 229 | def __cinit__(self, Flow flow): 230 | self.block= flow.curr_block 231 | self.curr_line=self.block.getLines() 232 | 233 | #TODO - do we need to delete blocks, lines ... or are they destroyed with page? 234 | # def __dealloc__(self): 235 | # if self.block != NULL: 236 | # del self.block 237 | 238 | def __iter__(self): 239 | return self 240 | 241 | def __next__(self): 242 | cdef Line l 243 | if not self.curr_line: 244 | raise StopIteration() 245 | l=Line(self) 246 | self.curr_line=self.curr_line.getNext() 247 | return l 248 | 249 | property bbox: 250 | def __get__(self): 251 | cdef double x1,y1,x2,y2 252 | self.block.getBBox(&x1, &y1, &x2, &y2) 253 | return BBox(x1,y1,x2,y2) 254 | 255 | cdef class BBox: 256 | cdef double x1, y1, x2, y2 257 | 258 | def __cinit__(self, double x1, double y1, double x2, double y2 ): 259 | self.x1=x1 260 | self.x2=x2 261 | self.y1=y1 262 | self.y2=y2 263 | 264 | def as_tuple(self): 265 | return self.x1,self.y1, self.x2, self.y2 266 | 267 | def __getitem__(self, i): 268 | if i==0: 269 | return self.x1 270 | elif i==1: 271 | return self.y1 272 | elif i==2: 273 | return self.x2 274 | elif i==3: 275 | return self.y2 276 | raise IndexError() 277 | 278 | property x1: 279 | def __get__(self): 280 | return self.x1 281 | def __set__(self, double val): 282 | self.x1=val 283 | 284 | property x2: 285 | def __get__(self): 286 | return self.x2 287 | def __set__(self, double val): 288 | self.x2=val 289 | 290 | property y1: 291 | def __get__(self): 292 | return self.y1 293 | def __set__(self, double val): 294 | self.y1=val 295 | 296 | 
property y2: 297 | def __get__(self): 298 | return self.y2 299 | def __set__(self, double val): 300 | self.y2=val 301 | 302 | cdef class Color: 303 | cdef: 304 | double r,b,g 305 | 306 | def __cinit__(self, double r, double g, double b): 307 | self.r = r 308 | self.g = g 309 | self.b = b 310 | 311 | 312 | def as_tuple(self): 313 | return self.r,self.g, self.b 314 | 315 | property r: 316 | def __get__(self): 317 | return self.r 318 | 319 | 320 | property g: 321 | def __get__(self): 322 | return self.g 323 | 324 | 325 | property b: 326 | def __get__(self): 327 | return self.b 328 | 329 | 330 | def __str__(self): 331 | return 'r:%0.2f g:%0.2f, b:%0.2f' % self.as_tuple() 332 | 333 | def __richcmp__(x, y, op): 334 | if isinstance(x, Color) and isinstance(y, Color) and (op == Py_EQ or op == Py_NE): 335 | eq = abs(x.r - y.r) < PRECISION and \ 336 | abs(x.g -y.g) < PRECISION and \ 337 | abs(x.b -y.b) < PRECISION 338 | return eq if op == Py_EQ else not eq 339 | return NotImplemented 340 | 341 | 342 | cdef class FontInfo: 343 | cdef: 344 | unicode name 345 | double size 346 | Color color 347 | 348 | def __cinit__(self, unicode name, double size, Color color): 349 | nparts=name.split('+',1) 350 | self.name=nparts[-1] 351 | self.size=size 352 | self.color=color 353 | 354 | property name: 355 | def __get__(self): 356 | return self.name 357 | def __set__(self, unicode val): 358 | self.name=val 359 | 360 | property size: 361 | def __get__(self): 362 | return self.size 363 | def __set__(self, double val): 364 | self.size=val 365 | 366 | property color: 367 | def __get__(self): 368 | return self.color 369 | def __set__(self, Color val): 370 | self.color=val 371 | 372 | def __richcmp__(x, y, op): 373 | if isinstance(x, FontInfo) and isinstance(y, FontInfo) and (op == Py_EQ or op == Py_NE): 374 | eq = x.name == y.name and \ 375 | abs(x.size -y.size) < PRECISION and \ 376 | x.color == y.color 377 | return eq if op == Py_EQ else not eq 378 | return NotImplemented 379 | 380 | 381 | 382 
cdef class CompactListIterator:
    """Iterator over a CompactList's logical sequence of values.

    ``index`` holds, for every logical position, the offset of its value in
    ``items``.
    """
    cdef:
        list index
        list items
        int pos

    def __cinit__(self, list index, list items):
        self.pos = 0
        self.index = index
        self.items = items

    def __iter__(self):
        # Iterator protocol: an iterator must be iterable itself, so that
        # iter(iter(cl)) and ``for x in iter(cl)`` work.
        return self

    def __next__(self):
        if self.pos >= len(self.index):
            raise StopIteration()
        i = self.items[self.index[self.pos]]
        self.pos += 1
        return i


cdef class CompactList:
    """Append-only sequence that stores runs of equal consecutive values once.

    ``items`` holds the distinct consecutive values; ``index`` maps every
    logical position to an offset in ``items``.
    """
    cdef:
        list index
        list items

    def __init__(self):
        self.index = []
        self.items = []

    def append(self, v):
        """Append v, reusing the last stored item when it compares equal to v."""
        cdef long last
        last = len(self.items) - 1
        if last >= 0 and self.items[last] == v:
            self.index.append(last)
        else:
            self.items.append(v)
            self.index.append(last + 1)

    def __getitem__(self, idx):
        """Return the value at position idx, or a list of values for a slice."""
        if isinstance(idx, slice):
            return [self.items[i] for i in self.index[idx]]
        return self.items[self.index[idx]]

    def __len__(self):
        return len(self.index)

    def __iter__(self):
        return CompactListIterator(self.index, self.items)

    property comp_ratio:
        """Number of stored items divided by the logical length.

        Raises ZeroDivisionError when the list is empty.
        """
        def __get__(self):
            return float(len(self.items)) / len(self.index)


cdef class Line:
    """One line of text of a Block.

    Text, per-character bounding boxes and per-character font info are
    extracted once, when the object is initialised; the properties only
    return those cached values.
    """
    cdef:
        TextLine *line
        double x1, y1, x2, y2
        unicode _text
        list _bboxes
        CompactList _fonts

    def __cinit__(self, Block block not None):
        # Reject None and an exhausted block up front; both would otherwise
        # end in a NULL dereference inside _get_text().
        if block.curr_line == NULL:
            raise ValueError('block has no current line')
        self.line = block.curr_line

    def __init__(self, Block block):
        self._text = u''
        self.x1 = 0
        self.y1 = 0
        self.x2 = 0
        self.y2 = 0
        self._bboxes = []
        self._fonts = CompactList()
        self._get_text()
        # one bounding box per character of the text
        assert len(self._text) == len(self._bboxes)

    def _get_text(self):
        """Walk the words of the line and fill text, char bboxes, fonts and line bbox."""
        cdef:
            TextWord *w
            GooString *s
            double bx1, bx2, by1, by2
            list words = []
            int i, wlen
            bint first_word = True
            BBox last_bbox
            FontInfo last_font
            Color word_color
            double r, g, b

        w = self.line.getWords()
        while w:
            wlen = w.getLength()
            assert wlen > 0
            # Colour is queried per word, so fetch it once instead of once
            # per character.
            w.getColor(&r, &g, &b)
            word_color = Color(r, g, b)
            # gets bounding boxes for all characters
            # and font info
            for i in range(wlen):
                w.getCharBBox(i, &bx1, &by1, &bx2, &by2)
                last_bbox = BBox(bx1, by1, bx2, by2)
                # if the previous entry is a space, stretch its right end up
                # to the start of this word
                if i == 0 and words and words[-1] == u' ':
                    self._bboxes[-1].x2 = last_bbox.x1

                self._bboxes.append(last_bbox)
                font_name = w.getFontName(i)
                if font_name != 0:
                    IF USE_CSTRING:
                        font_name_cstr = font_name.c_str()
                    ELSE:
                        font_name_cstr = font_name.getCString()
                else:
                    font_name_cstr = b"unknown"  # in rare cases font name can be NULL

                last_font = FontInfo(font_name_cstr.decode('UTF-8', 'replace'),  # In rare cases font name is not UTF-8
                                     w.getFontSize(),
                                     word_color
                                     )
                self._fonts.append(last_font)
            # and then text as UTF-8 bytes
            s = w.getText()
            IF USE_CSTRING:
                s_cstr = s.c_str()
            ELSE:
                s_cstr = s.getCString()
            words.append(s_cstr.decode('UTF-8'))  # decoded to python unicode string
            del s
            # must have same amount of bboxes and characters in word
            assert len(words[-1]) == wlen
            # Calculate the line bbox. It is seeded from the first word rather
            # than using 0 as a "not set" marker, so coordinates that are
            # exactly 0 or negative are handled correctly.
            w.getBBox(&bx1, &by1, &bx2, &by2)
            if first_word:
                self.x1 = bx1
                self.y1 = by1
                self.x2 = bx2
                self.y2 = by2
                first_word = False
            else:
                if bx1 < self.x1:
                    self.x1 = bx1
                if by1 < self.y1:
                    self.y1 = by1
                if bx2 > self.x2:
                    self.x2 = bx2
                if by2 > self.y2:
                    self.y2 = by2
            # add space after word if necessary; it gets a zero-width box at
            # the right edge of the last character and that character's font
            if w.hasSpaceAfter():
                words.append(u' ')
                self._bboxes.append(BBox(last_bbox.x2, last_bbox.y1, last_bbox.x2, last_bbox.y2))
                self._fonts.append(last_font)
            w = w.getNext()
        self._text = u''.join(words)

    property bbox:
        def __get__(self):
            return BBox(self.x1, self.y1, self.x2, self.y2)

    property text:
        def __get__(self):
            return self._text

    property char_bboxes:
        def __get__(self):
            return self._bboxes

    property char_fonts:
        def __get__(self):
            return self._fonts

--------------------------------------------------------------------------------