├── .github └── workflows │ ├── cd.yml │ └── ci.yml ├── .gitignore ├── README.md ├── config.yml ├── manifest.yml ├── pdf_segmenter.py ├── requirements.txt └── tests ├── __init__.py ├── conftest.py ├── data ├── cats_are_awesome.pdf ├── cats_are_awesome_img.pdf ├── cats_are_awesome_text.pdf ├── test_img_0.jpg └── test_img_1.jpg ├── integration ├── __init__.py └── test_exec.py ├── requirements.txt └── unit ├── __init__.py └── test_exec.py /.github/workflows/cd.yml: -------------------------------------------------------------------------------- 1 | name: CD 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | release: 8 | types: 9 | - created 10 | workflow_dispatch: 11 | # pull_request: 12 | # uncomment the above to test CD in a PR 13 | 14 | jobs: 15 | call-external: 16 | uses: jina-ai/workflows-executors/.github/workflows/cd.yml@master 17 | with: 18 | event_name: ${{ github.event_name }} 19 | secrets: 20 | jinahub_token: ${{ secrets.JINAHUB_TOKEN }} 21 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | call-external: 7 | uses: jina-ai/workflows-executors/.github/workflows/ci.yml@master -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | docs/api/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | docs/.python-version 85 | 86 | # celery beat schedule file 87 | celerybeat-schedule 88 | 89 | # SageMath parsed files 90 | *.sage.py 91 | 92 | # Environments 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | .python-version 110 | 111 | # mypy 112 | .mypy_cache/ 113 | .dmypy.json 114 | dmypy.json 115 | 116 | # Pyre type checker 117 | .pyre/ 118 | .idea/ 119 | toy*.py 120 | .DS_Store 121 | post/ 122 | toy*.ipynb 123 | *.c 124 | .nes_cache 125 | toy*.yml 126 | *.tmp 127 | 128 | shell/jina-wizard.sh 129 | /junit/ 130 | /tests/junit/ 131 | /docs/chapters/proto/docs.md 132 | 133 | **/.jina/ 134 | 135 | # Local model downloads 136 | .cache 137 | assets 138 | benchmark.txt 139 | dump.txt 140 | import.txt 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ✨ PDFSegmenter 2 | 3 | PDFSegmenter is an Executor used for extracting images and text as chunks from PDF data. It stores each images and text of each page as chunks separately, with their respective mime types. It uses the [pdfplumber](https://github.com/jsvine/pdfplumber) library. 4 | 5 | ## Loading data 6 | 7 | The `PDFSegmenter` expects data to be found in the `Document`'s `.blob` attribute. This can be loaded from a PDF file like so 8 | 9 | ```python 10 | from docarray import DocumentArray, Document 11 | from jina import Flow 12 | 13 | doc = DocumentArray([Document(uri='cats_are_awesome.pdf')]) # adjust to your own pdf 14 | doc[0].load_uri_to_blob() 15 | print(doc[0]) 16 | 17 | f = Flow().add( 18 | uses='jinahub+docker://PDFSegmenter', 19 | ) 20 | with f: 21 | resp = f.post(on='/craft', inputs=doc) 22 | print(f'{[c.mime_type for c in resp[0].chunks]}') 23 | ``` 24 | 25 | 26 | ``` 27 | >> # notice `.blob` field is set 28 | >> ['image/*', 'image/*', 'text/plain'] # we get both images and text from a PDF 29 | ``` 30 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | jtype: PDFSegmenter 2 | metas: 3 | py_modules: 4 | - pdf_segmenter.py 5 | -------------------------------------------------------------------------------- /manifest.yml: -------------------------------------------------------------------------------- 1 | manifest_version: 1 2 | name: PDFSegmenter 3 | description: Executor to extract text and images from PDFs 4 | url: https://github.com/jina-ai/executor-pdfsegmenter 5 | keywords: [pdf, text, image] 6 | -------------------------------------------------------------------------------- /pdf_segmenter.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (c) 2020-2023 Jina AI Limited. All rights reserved." 2 | __license__ = "Apache-2.0" 3 | 4 | import io 5 | from typing import List 6 | 7 | import fitz 8 | import numpy as np 9 | import pdfplumber 10 | from jina import Document, DocumentArray, Executor, requests 11 | from jina.logging.logger import JinaLogger 12 | 13 | 14 | class PDFSegmenter(Executor): 15 | def __init__( 16 | self, 17 | *args, 18 | **kwargs, 19 | ): 20 | """ 21 | :class:`PDFSegmenter` Extracts data (text and images) from PDF files. 22 | Stores images (`mime_type`=image/*) on chunk level ('c') and text segments (`mime_type`=text/plain) 23 | on chunk level ('c') in the root ('r') Document. 24 | """ 25 | super().__init__(*args, **kwargs) 26 | self.logger = JinaLogger(context=self.__class__.__name__) 27 | 28 | @requests 29 | def craft(self, docs: DocumentArray, **kwargs): 30 | """ 31 | Read PDF files. Extracts data from them. 32 | Checks if the input is a string of the filename, 33 | or if it's the file in bytes. 34 | It will then extract the data from the file, creating a list for images, 35 | and text. 36 | 37 | :param docs: Array of Documents. 38 | """ 39 | for doc in docs: 40 | pdf_img, pdf_text = self._parse_pdf(doc) 41 | 42 | if pdf_img is not None: 43 | images = self._extract_image(pdf_img) 44 | doc.chunks.extend( 45 | [Document(tensor=img, mime_type='image/*') for img in images] 46 | ) 47 | if pdf_text is not None: 48 | texts = self._extract_text(pdf_text) 49 | doc.chunks.extend( 50 | [Document(text=t, mime_type='text/plain') for t in texts] 51 | ) 52 | 53 | def _parse_pdf(self, doc: Document): 54 | pdf_img = None 55 | pdf_text = None 56 | try: 57 | # when loading from URI, we should prioritize blob 58 | # order is important. check test `tests/unit/test_exec.py::test_order_blob_uri` 59 | if doc.blob: 60 | pdf_img = fitz.open(stream=doc.blob, filetype='pdf') 61 | pdf_text = pdfplumber.open(io.BytesIO(doc.blob)) 62 | elif doc.uri: 63 | pdf_img = fitz.open(doc.uri) 64 | pdf_text = pdfplumber.open(doc.uri) 65 | except Exception as ex: 66 | self.logger.error(f'Failed to open due to: {ex}') 67 | return pdf_img, pdf_text 68 | 69 | def _extract_text(self, pdf_text) -> List[str]: 70 | # Extract text 71 | with pdf_text: 72 | texts = [] 73 | count = len(pdf_text.pages) 74 | for i in range(count): 75 | page = pdf_text.pages[i] 76 | texts.append(page.extract_text(x_tolerance=1, y_tolerance=1)) 77 | return texts 78 | 79 | def _extract_image(self, pdf_img) -> List['np.ndarray']: 80 | with pdf_img: 81 | images = [] 82 | for page in range(len(pdf_img)): 83 | for img in pdf_img.get_page_images(page): 84 | xref = img[0] 85 | pix = fitz.Pixmap(pdf_img, xref) 86 | # read data from buffer and reshape the array into 3-d format 87 | np_arr = ( 88 | np.frombuffer(pix.samples, dtype=np.uint8) 89 | .reshape(pix.h, pix.w, pix.n) 90 | .astype('float32') 91 | ) 92 | if pix.n - pix.alpha < 4: # if gray or RGB 93 | if pix.n == 1: # convert gray to rgb 94 | images.append(np.concatenate((np_arr,) * 3, -1)) 95 | elif pix.n == 4: # remove transparency layer 96 | images.append(np_arr[..., :3]) 97 | else: 98 | images.append(np_arr) 99 | else: # if CMYK: 100 | pix = fitz.Pixmap(fitz.csRGB, pix) # Convert to RGB 101 | np_arr = ( 102 | np.frombuffer(pix.samples, dtype=np.uint8) 103 | .reshape(pix.h, pix.w, pix.n) 104 | .astype('float32') 105 | ) 106 | images.append(np_arr) 107 | return images 108 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyMuPDF==1.21.1 2 | pdfplumber==0.8.0 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from jina import Document, DocumentArray 5 | 6 | 7 | @pytest.fixture() 8 | def test_dir() -> str: 9 | return os.path.dirname(os.path.abspath(__file__)) 10 | 11 | 12 | @pytest.fixture 13 | def expected_text(): 14 | expected_text = ( 15 | "A cat poem\nI love cats, I love every kind of cat,\nI just wanna hug all of them, but I can't," 16 | "\nI'm thinking about cats again\nI think about how cute they are\nAnd their whiskers and their " 17 | "nose" 18 | ) 19 | return expected_text 20 | 21 | 22 | @pytest.fixture 23 | def input_pdf(test_dir: str): 24 | path_img_text = os.path.join(test_dir, 'data/cats_are_awesome.pdf') 25 | path_text = os.path.join(test_dir, 'data/cats_are_awesome_text.pdf') 26 | path_img = os.path.join(test_dir, 'data/cats_are_awesome_img.pdf') 27 | 28 | with open(path_text, 'rb') as pdf: 29 | input_bytes_text = pdf.read() 30 | 31 | with open(path_img, 'rb') as pdf: 32 | input_bytes_image = pdf.read() 33 | 34 | with open(path_img_text, 'rb') as pdf: 35 | input_bytes_images_text = pdf.read() 36 | 37 | return { 38 | 'img_text': [(path_img_text, None), (None, input_bytes_images_text)], 39 | 'text': [(path_text, None), (None, input_bytes_text)], 40 | 'img': [(path_img, None), (None, input_bytes_image)], 41 | } 42 | 43 | 44 | @pytest.fixture() 45 | def doc_generator_img_text(input_pdf): 46 | doc_arrays = [] 47 | for uri, buffer in input_pdf['img_text']: 48 | if uri: 49 | docs = DocumentArray([Document(uri=uri, mime_type='application/pdf')]) 50 | else: 51 | docs = DocumentArray([Document(blob=buffer, mime_type='application/pdf')]) 52 | doc_arrays.append(docs) 53 | return doc_arrays 54 | 55 | 56 | @pytest.fixture() 57 | def doc_generator_text(input_pdf): 58 | # import epdb; epdb.serve() 59 | doc_arrays = [] 60 | for uri, buffer in input_pdf['text']: 61 | if uri: 62 | docs = DocumentArray([Document(uri=uri, mime_type='application/pdf')]) 63 | else: 64 | docs = DocumentArray([Document(blob=buffer, mime_type='application/pdf')]) 65 | doc_arrays.append(docs) 66 | return doc_arrays 67 | 68 | 69 | @pytest.fixture() 70 | def doc_generator_img(input_pdf): 71 | doc_array = [] 72 | for uri, buffer in input_pdf['img']: 73 | if uri: 74 | doc = DocumentArray([Document(uri=uri, mime_type='application/pdf')]) 75 | else: 76 | doc = DocumentArray([Document(blob=buffer, mime_type='application/pdf')]) 77 | doc_array.append(doc) 78 | return doc_array 79 | -------------------------------------------------------------------------------- /tests/data/cats_are_awesome.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/data/cats_are_awesome.pdf -------------------------------------------------------------------------------- /tests/data/cats_are_awesome_img.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/data/cats_are_awesome_img.pdf -------------------------------------------------------------------------------- /tests/data/cats_are_awesome_text.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/data/cats_are_awesome_text.pdf -------------------------------------------------------------------------------- /tests/data/test_img_0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/data/test_img_0.jpg -------------------------------------------------------------------------------- /tests/data/test_img_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/data/test_img_1.jpg -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/integration/__init__.py -------------------------------------------------------------------------------- /tests/integration/test_exec.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (c) 2020-2021 Jina AI Limited. All rights reserved." 2 | __license__ = "Apache-2.0" 3 | 4 | import os 5 | 6 | from docarray import DocumentArray 7 | from jina import Flow 8 | from pdf_segmenter import PDFSegmenter 9 | from PIL import Image 10 | 11 | 12 | def test_flow(test_dir, doc_generator_img_text, expected_text): 13 | flow = Flow().add(uses=PDFSegmenter) 14 | doc_arrays = doc_generator_img_text 15 | for docs in doc_arrays: 16 | with flow: 17 | results = flow.post(on='/test', inputs=docs) 18 | 19 | assert len(results) == 1 20 | chunks = results[0].chunks 21 | assert len(chunks) == 3 22 | for idx, c in enumerate(chunks[:2]): 23 | with Image.open( 24 | os.path.join(test_dir, f'data/test_img_{idx}.jpg') 25 | ) as img: 26 | tensor = chunks[idx].tensor 27 | assert chunks[idx].mime_type == 'image/*' 28 | assert tensor.shape[1], tensor.shape[0] == img.size 29 | if idx == 0: 30 | assert tensor.shape == (660, 1024, 3) 31 | if idx == 1: 32 | assert tensor.shape == (626, 1191, 3) 33 | 34 | # Check text 35 | assert chunks[2].text == expected_text 36 | assert chunks[2].mime_type == 'text/plain' 37 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/test_exec.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "Copyright (c) 2020-2021 Jina AI Limited. All rights reserved." 2 | __license__ = "Apache-2.0" 3 | 4 | import os 5 | 6 | import pytest 7 | from jina import Document, DocumentArray, Executor 8 | from pdf_segmenter import PDFSegmenter 9 | from PIL import Image 10 | 11 | 12 | @pytest.fixture() 13 | def executor(): 14 | return PDFSegmenter() 15 | 16 | 17 | @pytest.fixture() 18 | def executor_from_config(): 19 | return Executor.load_config('config.yml') 20 | 21 | 22 | def test_empty_docs(executor): 23 | da = DocumentArray() 24 | executor.craft(da) 25 | assert len(da) == 0 26 | 27 | 28 | def test_none_input(executor): 29 | with pytest.raises(TypeError): 30 | executor.craft(None) 31 | 32 | 33 | def test_io_images_and_text( 34 | executor_from_config, test_dir, doc_generator_img_text, expected_text 35 | ): 36 | doc_array = doc_generator_img_text 37 | assert len(doc_array) > 0 38 | for doc in doc_array: 39 | executor_from_config.craft(doc) 40 | chunks = doc[0].chunks 41 | assert len(chunks) == 3 42 | # Check images 43 | for idx, c in enumerate(chunks[:2]): 44 | with Image.open(os.path.join(test_dir, f'data/test_img_{idx}.jpg')) as img: 45 | tensor = chunks[idx].tensor 46 | assert chunks[idx].mime_type == 'image/*' 47 | assert tensor.shape[1], tensor.shape[0] == img.size 48 | if idx == 0: 49 | assert tensor.shape == (660, 1024, 3) 50 | if idx == 1: 51 | assert tensor.shape == (626, 1191, 3) 52 | 53 | # Check text 54 | assert chunks[2].text == expected_text 55 | assert chunks[2].mime_type == 'text/plain' 56 | 57 | 58 | def test_io_text(executor_from_config, doc_generator_text, expected_text): 59 | doc_arrays = doc_generator_text 60 | assert len(doc_arrays) > 0 61 | for docs in doc_arrays: 62 | executor_from_config.craft(docs) 63 | chunks = docs[0].chunks 64 | assert len(chunks) == 1 65 | # Check test 66 | assert chunks[0].text == expected_text 67 | assert chunks[0].mime_type == 'text/plain' 68 | 69 | 70 | def test_io_img(executor_from_config, test_dir, doc_generator_img): 71 | doc_arrays = doc_generator_img 72 | assert len(doc_arrays) > 0 73 | for docs in doc_arrays: 74 | executor_from_config.craft(docs) 75 | chunks = docs[0].chunks 76 | assert len(chunks) == 3 77 | # Check images 78 | for idx, c in enumerate(chunks[:2]): 79 | with Image.open(os.path.join(test_dir, f'data/test_img_{idx}.jpg')) as img: 80 | tensor = chunks[idx].tensor 81 | assert chunks[idx].mime_type == 'image/*' 82 | assert tensor.shape[1], tensor.shape[0] == img.size 83 | if idx == 0: 84 | assert tensor.shape == (660, 1024, 3) 85 | if idx == 1: 86 | assert tensor.shape == (626, 1191, 3) 87 | 88 | 89 | def test_order_blob_uri(executor_from_config): 90 | pdf = 'tests/data/cats_are_awesome.pdf' 91 | doc = Document(uri=pdf) 92 | doc.load_uri_to_blob() 93 | docs = DocumentArray(doc) 94 | 95 | # this is why the order is important in `_parse_pdf` method in segmenter 96 | executor_from_config.craft(docs) 97 | 98 | assert len(docs[0].chunks) > 0 99 | --------------------------------------------------------------------------------