├── .github
    └── workflows
    │   ├── cd.yml
    │   └── ci.yml
├── .gitignore
├── README.md
├── config.yml
├── manifest.yml
├── pdf_segmenter.py
├── requirements.txt
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── data
        ├── cats_are_awesome.pdf
        ├── cats_are_awesome_img.pdf
        ├── cats_are_awesome_text.pdf
        ├── test_img_0.jpg
        └── test_img_1.jpg
    ├── integration
        ├── __init__.py
        └── test_exec.py
    ├── requirements.txt
    └── unit
        ├── __init__.py
        └── test_exec.py


/.github/workflows/cd.yml:
--------------------------------------------------------------------------------
 1 | name: CD
 2 | 
 3 | on:
 4 |   push:  # every push to the branches listed below
 5 |     branches:
 6 |       - main
 7 |   release:  # a release being created
 8 |     types:
 9 |       - created
10 |   workflow_dispatch:  # manual runs
11 |   # pull_request:
12 |   # uncomment the above to test CD in a PR
13 | 
14 | jobs:
15 |   call-external:
16 |     uses: jina-ai/workflows-executors/.github/workflows/cd.yml@master  # reusable workflow from another repository, referenced at ref `master`
17 |     with:
18 |       event_name: ${{ github.event_name }}  # name of the event that triggered this run
19 |     secrets:
20 |       jinahub_token: ${{ secrets.JINAHUB_TOKEN }}  # read from this repository's secrets
21 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | 
3 | on: [pull_request]  # run for pull request events
4 | 
5 | jobs:
6 |   call-external:
7 |     uses: jina-ai/workflows-executors/.github/workflows/ci.yml@master  # reusable workflow from another repository, referenced at ref `master`


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Initially taken from Github's Python gitignore file
  2 | 
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | 
 53 | # Translations
 54 | *.mo
 55 | *.pot
 56 | 
 57 | # Django stuff:
 58 | *.log
 59 | local_settings.py
 60 | db.sqlite3
 61 | 
 62 | # Flask stuff:
 63 | instance/
 64 | .webassets-cache
 65 | 
 66 | # Scrapy stuff:
 67 | .scrapy
 68 | 
 69 | # Sphinx documentation
 70 | docs/_build/
 71 | docs/api/
 72 | 
 73 | # PyBuilder
 74 | target/
 75 | 
 76 | # Jupyter Notebook
 77 | .ipynb_checkpoints
 78 | 
 79 | # IPython
 80 | profile_default/
 81 | ipython_config.py
 82 | 
 83 | # pyenv
 84 | docs/.python-version
 85 | 
 86 | # celery beat schedule file
 87 | celerybeat-schedule
 88 | 
 89 | # SageMath parsed files
 90 | *.sage.py
 91 | 
 92 | # Environments
 93 | .venv
 94 | env/
 95 | venv/
 96 | ENV/
 97 | env.bak/
 98 | venv.bak/
 99 | 
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 | 
104 | # Rope project settings
105 | .ropeproject
106 | 
107 | # mkdocs documentation
108 | /site
109 | .python-version
110 | 
111 | # mypy
112 | .mypy_cache/
113 | .dmypy.json
114 | dmypy.json
115 | 
116 | # Pyre type checker
117 | .pyre/
118 | .idea/
119 | toy*.py
120 | .DS_Store
121 | post/
122 | toy*.ipynb
123 | *.c
124 | .nes_cache
125 | toy*.yml
126 | *.tmp
127 | 
128 | shell/jina-wizard.sh
129 | /junit/
130 | /tests/junit/
131 | /docs/chapters/proto/docs.md
132 | 
133 | **/.jina/
134 | 
135 | # Local model downloads
136 | .cache
137 | assets
138 | benchmark.txt
139 | dump.txt
140 | import.txt
141 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ✨ PDFSegmenter
 2 | 
 3 | PDFSegmenter is an Executor used for extracting images and text as chunks from PDF data. It stores each image and the text of each page as separate chunks, with their respective mime types. It uses the [PyMuPDF](https://github.com/pymupdf/PyMuPDF) library to extract images and the [pdfplumber](https://github.com/jsvine/pdfplumber) library to extract text.
 4 | 
 5 | ## Loading data
 6 | 
 7 | The `PDFSegmenter` reads the PDF from the `Document`'s `.blob` attribute when it is set, and otherwise from the file that the `Document`'s `.uri` points to. The `.blob` can be loaded from a PDF file like so:
 8 | 
 9 | ```python
10 | from docarray import DocumentArray, Document
11 | from jina import Flow
12 | 
13 | doc = DocumentArray([Document(uri='cats_are_awesome.pdf')]) # adjust to your own pdf
14 | doc[0].load_uri_to_blob()
15 | print(doc[0])
16 | 
17 | f = Flow().add(
18 |     uses='jinahub+docker://PDFSegmenter',
19 | )
20 | with f:
21 |     resp = f.post(on='/craft', inputs=doc)
22 |     print(f'{[c.mime_type for c in resp[0].chunks]}')
23 | ```
24 | 
25 | 
26 | ```
27 | >> <Document ('id', 'blob', 'mime_type', 'uri') at 9d77e00f759bf8523e86abf452ac28a0> # notice `.blob` field is set
28 | >> ['image/*', 'image/*', 'text/plain'] # we get both images and text from a PDF
29 | ```
30 | 


--------------------------------------------------------------------------------
/config.yml:
--------------------------------------------------------------------------------
1 | jtype: PDFSegmenter  # name of the class defined in pdf_segmenter.py
2 | metas:
3 |   py_modules:  # presumably the Python files loaded to make the class available; confirm in the Jina docs
4 |     - pdf_segmenter.py
5 | 


--------------------------------------------------------------------------------
/manifest.yml:
--------------------------------------------------------------------------------
1 | manifest_version: 1
2 | name: PDFSegmenter
3 | description: Executor to extract text and images from PDFs
4 | url: https://github.com/jina-ai/executor-pdfsegmenter
5 | keywords: [pdf, text, image]
6 | 


--------------------------------------------------------------------------------
/pdf_segmenter.py:
--------------------------------------------------------------------------------
  1 | __copyright__ = "Copyright (c) 2020-2023 Jina AI Limited. All rights reserved."
  2 | __license__ = "Apache-2.0"
  3 | 
  4 | import io
  5 | from typing import List
  6 | 
  7 | import fitz
  8 | import numpy as np
  9 | import pdfplumber
 10 | from jina import Document, DocumentArray, Executor, requests
 11 | from jina.logging.logger import JinaLogger
 12 | 
 13 | 
 14 | class PDFSegmenter(Executor):
 15 |     def __init__(
 16 |         self,
 17 |         *args,
 18 |         **kwargs,
 19 |     ):
 20 |         """
 21 |         :class:`PDFSegmenter` Extracts data (text and images) from PDF files.
 22 |         Stores images (`mime_type`=image/*) on chunk level ('c') and text segments (`mime_type`=text/plain)
 23 |         on chunk level ('c') in the root ('r') Document.
 24 |         """
 25 |         super().__init__(*args, **kwargs)
 26 |         self.logger = JinaLogger(context=self.__class__.__name__)
 27 | 
 28 |     @requests
 29 |     def craft(self, docs: DocumentArray, **kwargs):
 30 |         """
 31 |         Read PDF files. Extracts data from them.
 32 |         Checks if the input is a string of the filename,
 33 |         or if it's the file in bytes.
 34 |         It will then extract the data from the file, creating a list for images,
 35 |         and text.
 36 | 
 37 |         :param docs: Array of Documents.
 38 |         """
 39 |         for doc in docs:
 40 |             pdf_img, pdf_text = self._parse_pdf(doc)
 41 | 
 42 |             if pdf_img is not None:
 43 |                 images = self._extract_image(pdf_img)
 44 |                 doc.chunks.extend(
 45 |                     [Document(tensor=img, mime_type='image/*') for img in images]
 46 |                 )
 47 |             if pdf_text is not None:
 48 |                 texts = self._extract_text(pdf_text)
 49 |                 doc.chunks.extend(
 50 |                     [Document(text=t, mime_type='text/plain') for t in texts]
 51 |                 )
 52 | 
 53 |     def _parse_pdf(self, doc: Document):
 54 |         pdf_img = None
 55 |         pdf_text = None
 56 |         try:
 57 |             # when loading from URI, we should prioritize blob
 58 |             # order is important. check test `tests/unit/test_exec.py::test_order_blob_uri`
 59 |             if doc.blob:
 60 |                 pdf_img = fitz.open(stream=doc.blob, filetype='pdf')
 61 |                 pdf_text = pdfplumber.open(io.BytesIO(doc.blob))
 62 |             elif doc.uri:
 63 |                 pdf_img = fitz.open(doc.uri)
 64 |                 pdf_text = pdfplumber.open(doc.uri)
 65 |         except Exception as ex:
 66 |             self.logger.error(f'Failed to open due to: {ex}')
 67 |         return pdf_img, pdf_text
 68 | 
 69 |     def _extract_text(self, pdf_text) -> List[str]:
 70 |         # Extract text
 71 |         with pdf_text:
 72 |             texts = []
 73 |             count = len(pdf_text.pages)
 74 |             for i in range(count):
 75 |                 page = pdf_text.pages[i]
 76 |                 texts.append(page.extract_text(x_tolerance=1, y_tolerance=1))
 77 |             return texts
 78 | 
 79 |     def _extract_image(self, pdf_img) -> List['np.ndarray']:
 80 |         with pdf_img:
 81 |             images = []
 82 |             for page in range(len(pdf_img)):
 83 |                 for img in pdf_img.get_page_images(page):
 84 |                     xref = img[0]
 85 |                     pix = fitz.Pixmap(pdf_img, xref)
 86 |                     # read data from buffer and reshape the array into 3-d format
 87 |                     np_arr = (
 88 |                         np.frombuffer(pix.samples, dtype=np.uint8)
 89 |                         .reshape(pix.h, pix.w, pix.n)
 90 |                         .astype('float32')
 91 |                     )
 92 |                     if pix.n - pix.alpha < 4:  # if gray or RGB
 93 |                         if pix.n == 1:  # convert gray to rgb
 94 |                             images.append(np.concatenate((np_arr,) * 3, -1))
 95 |                         elif pix.n == 4:  # remove transparency layer
 96 |                             images.append(np_arr[..., :3])
 97 |                         else:
 98 |                             images.append(np_arr)
 99 |                     else:  # if CMYK:
100 |                         pix = fitz.Pixmap(fitz.csRGB, pix)  # Convert to RGB
101 |                         np_arr = (
102 |                             np.frombuffer(pix.samples, dtype=np.uint8)
103 |                             .reshape(pix.h, pix.w, pix.n)
104 |                             .astype('float32')
105 |                         )
106 |                         images.append(np_arr)
107 |         return images
108 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | PyMuPDF==1.21.1
2 | pdfplumber==0.8.0
3 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/__init__.py


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import pytest
 4 | from jina import Document, DocumentArray
 5 | 
 6 | 
 7 | @pytest.fixture()
 8 | def test_dir() -> str:
 9 |     return os.path.dirname(os.path.abspath(__file__))
10 | 
11 | 
12 | @pytest.fixture
13 | def expected_text():
14 |     expected_text = (
15 |         "A cat poem\nI love cats, I love every kind of cat,\nI just wanna hug all of them, but I can't,"
16 |         "\nI'm thinking about cats again\nI think about how cute they are\nAnd their whiskers and their "
17 |         "nose"
18 |     )
19 |     return expected_text
20 | 
21 | 
22 | @pytest.fixture
23 | def input_pdf(test_dir: str):
24 |     path_img_text = os.path.join(test_dir, 'data/cats_are_awesome.pdf')
25 |     path_text = os.path.join(test_dir, 'data/cats_are_awesome_text.pdf')
26 |     path_img = os.path.join(test_dir, 'data/cats_are_awesome_img.pdf')
27 | 
28 |     with open(path_text, 'rb') as pdf:
29 |         input_bytes_text = pdf.read()
30 | 
31 |     with open(path_img, 'rb') as pdf:
32 |         input_bytes_image = pdf.read()
33 | 
34 |     with open(path_img_text, 'rb') as pdf:
35 |         input_bytes_images_text = pdf.read()
36 | 
37 |     return {
38 |         'img_text': [(path_img_text, None), (None, input_bytes_images_text)],
39 |         'text': [(path_text, None), (None, input_bytes_text)],
40 |         'img': [(path_img, None), (None, input_bytes_image)],
41 |     }
42 | 
43 | 
44 | @pytest.fixture()
45 | def doc_generator_img_text(input_pdf):
46 |     doc_arrays = []
47 |     for uri, buffer in input_pdf['img_text']:
48 |         if uri:
49 |             docs = DocumentArray([Document(uri=uri, mime_type='application/pdf')])
50 |         else:
51 |             docs = DocumentArray([Document(blob=buffer, mime_type='application/pdf')])
52 |         doc_arrays.append(docs)
53 |     return doc_arrays
54 | 
55 | 
56 | @pytest.fixture()
57 | def doc_generator_text(input_pdf):
58 |     # import epdb; epdb.serve()
59 |     doc_arrays = []
60 |     for uri, buffer in input_pdf['text']:
61 |         if uri:
62 |             docs = DocumentArray([Document(uri=uri, mime_type='application/pdf')])
63 |         else:
64 |             docs = DocumentArray([Document(blob=buffer, mime_type='application/pdf')])
65 |         doc_arrays.append(docs)
66 |     return doc_arrays
67 | 
68 | 
69 | @pytest.fixture()
70 | def doc_generator_img(input_pdf):
71 |     doc_array = []
72 |     for uri, buffer in input_pdf['img']:
73 |         if uri:
74 |             doc = DocumentArray([Document(uri=uri, mime_type='application/pdf')])
75 |         else:
76 |             doc = DocumentArray([Document(blob=buffer, mime_type='application/pdf')])
77 |         doc_array.append(doc)
78 |     return doc_array
79 | 


--------------------------------------------------------------------------------
/tests/data/cats_are_awesome.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/data/cats_are_awesome.pdf


--------------------------------------------------------------------------------
/tests/data/cats_are_awesome_img.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/data/cats_are_awesome_img.pdf


--------------------------------------------------------------------------------
/tests/data/cats_are_awesome_text.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/data/cats_are_awesome_text.pdf


--------------------------------------------------------------------------------
/tests/data/test_img_0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/data/test_img_0.jpg


--------------------------------------------------------------------------------
/tests/data/test_img_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/data/test_img_1.jpg


--------------------------------------------------------------------------------
/tests/integration/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/integration/__init__.py


--------------------------------------------------------------------------------
/tests/integration/test_exec.py:
--------------------------------------------------------------------------------
 1 | __copyright__ = "Copyright (c) 2020-2021 Jina AI Limited. All rights reserved."
 2 | __license__ = "Apache-2.0"
 3 | 
 4 | import os
 5 | 
 6 | from docarray import DocumentArray
 7 | from jina import Flow
 8 | from pdf_segmenter import PDFSegmenter
 9 | from PIL import Image
10 | 
11 | 
12 | def test_flow(test_dir, doc_generator_img_text, expected_text):
13 |     flow = Flow().add(uses=PDFSegmenter)
14 |     doc_arrays = doc_generator_img_text
15 |     for docs in doc_arrays:
16 |         with flow:
17 |             results = flow.post(on='/test', inputs=docs)
18 | 
19 |             assert len(results) == 1
20 |             chunks = results[0].chunks
21 |             assert len(chunks) == 3
22 |             for idx, c in enumerate(chunks[:2]):
23 |                 with Image.open(
24 |                     os.path.join(test_dir, f'data/test_img_{idx}.jpg')
25 |                 ) as img:
26 |                     tensor = chunks[idx].tensor
27 |                     assert chunks[idx].mime_type == 'image/*'
28 |                     assert tensor.shape[1], tensor.shape[0] == img.size
29 |                     if idx == 0:
30 |                         assert tensor.shape == (660, 1024, 3)
31 |                     if idx == 1:
32 |                         assert tensor.shape == (626, 1191, 3)
33 | 
34 |                 # Check text
35 |                 assert chunks[2].text == expected_text
36 |                 assert chunks[2].mime_type == 'text/plain'
37 | 


--------------------------------------------------------------------------------
/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | 


--------------------------------------------------------------------------------
/tests/unit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jina-ai/executor-pdfsegmenter/08a2fde6438eb3dcc766db389e12948b2a534fbe/tests/unit/__init__.py


--------------------------------------------------------------------------------
/tests/unit/test_exec.py:
--------------------------------------------------------------------------------
 1 | __copyright__ = "Copyright (c) 2020-2021 Jina AI Limited. All rights reserved."
 2 | __license__ = "Apache-2.0"
 3 | 
 4 | import os
 5 | 
 6 | import pytest
 7 | from jina import Document, DocumentArray, Executor
 8 | from pdf_segmenter import PDFSegmenter
 9 | from PIL import Image
10 | 
11 | 
12 | @pytest.fixture()
13 | def executor():
14 |     return PDFSegmenter()  # built directly from the class, without arguments
15 | 
16 | 
17 | @pytest.fixture()
18 | def executor_from_config():
19 |     return Executor.load_config('config.yml')  # relative path; presumably resolved from the working directory (TODO confirm)
20 | 
21 | 
22 | def test_empty_docs(executor):
23 |     da = DocumentArray()
24 |     executor.craft(da)
25 |     assert len(da) == 0
26 | 
27 | 
28 | def test_none_input(executor):
29 |     with pytest.raises(TypeError):  # presumably raised when `craft` iterates over `docs`, which is None here
30 |         executor.craft(None)
31 | 
32 | 
33 | def test_io_images_and_text(
34 |     executor_from_config, test_dir, doc_generator_img_text, expected_text
35 | ):
36 |     doc_array = doc_generator_img_text
37 |     assert len(doc_array) > 0
38 |     for doc in doc_array:
39 |         executor_from_config.craft(doc)
40 |         chunks = doc[0].chunks
41 |         assert len(chunks) == 3
42 |         # Check images
43 |         for idx, c in enumerate(chunks[:2]):
44 |             with Image.open(os.path.join(test_dir, f'data/test_img_{idx}.jpg')) as img:
45 |                 tensor = chunks[idx].tensor
46 |                 assert chunks[idx].mime_type == 'image/*'
47 |                 assert tensor.shape[1], tensor.shape[0] == img.size
48 |                 if idx == 0:
49 |                     assert tensor.shape == (660, 1024, 3)
50 |                 if idx == 1:
51 |                     assert tensor.shape == (626, 1191, 3)
52 | 
53 |             # Check text
54 |             assert chunks[2].text == expected_text
55 |             assert chunks[2].mime_type == 'text/plain'
56 | 
57 | 
58 | def test_io_text(executor_from_config, doc_generator_text, expected_text):
59 |     doc_arrays = doc_generator_text
60 |     assert len(doc_arrays) > 0
61 |     for docs in doc_arrays:
62 |         executor_from_config.craft(docs)
63 |         chunks = docs[0].chunks
64 |         assert len(chunks) == 1
65 |         # Check test
66 |         assert chunks[0].text == expected_text
67 |         assert chunks[0].mime_type == 'text/plain'
68 | 
69 | 
70 | def test_io_img(executor_from_config, test_dir, doc_generator_img):
71 |     doc_arrays = doc_generator_img
72 |     assert len(doc_arrays) > 0
73 |     for docs in doc_arrays:
74 |         executor_from_config.craft(docs)
75 |         chunks = docs[0].chunks
76 |         assert len(chunks) == 3
77 |         # Check images
78 |         for idx, c in enumerate(chunks[:2]):
79 |             with Image.open(os.path.join(test_dir, f'data/test_img_{idx}.jpg')) as img:
80 |                 tensor = chunks[idx].tensor
81 |                 assert chunks[idx].mime_type == 'image/*'
82 |                 assert tensor.shape[1], tensor.shape[0] == img.size
83 |                 if idx == 0:
84 |                     assert tensor.shape == (660, 1024, 3)
85 |                 if idx == 1:
86 |                     assert tensor.shape == (626, 1191, 3)
87 | 
88 | 
89 | def test_order_blob_uri(executor_from_config):
90 |     pdf = 'tests/data/cats_are_awesome.pdf'
91 |     doc = Document(uri=pdf)
92 |     doc.load_uri_to_blob()
93 |     docs = DocumentArray(doc)
94 | 
95 |     # this is why the order is important in `_parse_pdf` method in segmenter
96 |     executor_from_config.craft(docs)
97 | 
98 |     assert len(docs[0].chunks) > 0
99 | 


--------------------------------------------------------------------------------