├── .github
    └── workflows
    │   └── python-publish.yml
├── LICENSE
├── README.md
├── bill7.png
├── doc_transformers
    ├── __init__.py
    └── parser.py
├── output.png
├── result.csv
├── setup.cfg
└── setup.py


/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | name: Upload Python Package
 5 | 
 6 | on:
 7 |   release:
 8 |     types: [created]
 9 | 
10 | jobs:
11 |   deploy:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: '3.x'
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install setuptools wheel twine
25 |     - name: Build and publish
26 |       env:
27 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |       run: |
30 |         python setup.py sdist
31 |         twine upload dist/*
32 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Vishnu Nandakumar
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Doc Transformers
 2 | Document processing using transformers. This is still in the development phase and currently supports only the extraction of form data, i.e. key-value pairs.
 3 | 
 4 | ```bash
 5 | pip install -q doc-transformers
 6 | ```
 7 | 
 8 | ## Pre-requisites
 9 | 
10 | Please install the following separately:
11 | ```
12 | pip install pip --upgrade
13 | pip install -q git+https://github.com/huggingface/transformers.git
14 | 
15 | pip install pyyaml==5.1
16 | 
17 | # workaround: install old version of pytorch since detectron2 hasn't released packages for pytorch 1.9 (issue: https://github.com/facebookresearch/detectron2/issues/3158)
18 | pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
19 | 
20 | # install detectron2 that matches pytorch 1.8
21 | # See https://detectron2.readthedocs.io/tutorials/install.html for instructions
22 | pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html
23 | ```
24 | 
25 | ## Implementation
26 | 
27 | ```python
28 | # import the parser module
29 | from doc_transformers import parser
30 | 
31 | # loads the image and labels
32 | image = parser.load_image(input_path_image)
33 | labels = parser.load_tags()
34 | 
35 | # loads the model
36 | feature_extractor, processor, model = parser.load_models()
37 | 
38 | # returns the labelled words (title / key / value) and their bounding boxes
39 | key_pairs, bboxes = parser.process_image(image, feature_extractor, processor, model, labels)
40 | ```
41 | 
42 | ## Results
43 | 
44 | **Input & Output**
45 | 
46 | <p float="left">
47 | <img src="/bill7.png" width="350" height="600">
48 | <img src="/output.png" width="350" height="600">
49 | </p>
50 | 
51 | **Table**
52 | 
53 | - After saving to csv the result looks like the following
54 | 
55 | | LABEL | TEXT                               |
56 | | ----- | ---------------------------------- |
57 | | title | CREDIT CARD VOUCHER ANY RESTAURANT |
58 | | title | ANYWHERE                           |
59 | | key   | DATE:                              |
60 | | value | 02/02/2014                         |
61 | | key   | TIME:                              |
62 | | value | 11:11                              |
63 | | key   | CARD                               |
64 | | key   | TYPE:                              |
65 | | value | MC                                 |
66 | | key   | ACCT:                              |
67 | | value | XXXX XXXX XXXX                     |
68 | | value | 1111                               |
69 | | key   | TRANS                              |
70 | | key   | KEY:                               |
71 | | value | HYU8789798234                      |
72 | | key   | AUTH                               |
73 | | key   | CODE:                              |
74 | | value | 12345                              |
75 | | key   | EXP                                |
76 | | key   | DATE:                              |
77 | | value | XX/XX                              |
78 | | key   | CHECK:                             |
79 | | value | 1111                               |
80 | | key   | TABLE:                             |
81 | | value | 11/11                              |
82 | | key   | SERVER:                            |
83 | | value | 34                                 |
84 | | value | MONIKA                             |
85 | | key   | Subtotal:                          |
86 | | value | $1969                              |
87 | | value | .69                                |
88 | | key   | Gratuity: Total:                   |
89 | 
90 | ## Code credits
91 | 
92 | [@HuggingFace](https://huggingface.co/)
93 | 
94 | - Please note that this is still in the development phase and will be improved in the near future.
95 | 


--------------------------------------------------------------------------------
/bill7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vishnunkumar/doc_transformers/ea52976d5232caff4ce13758309b0bfac57664c0/bill7.png


--------------------------------------------------------------------------------
/doc_transformers/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/doc_transformers/parser.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import pytesseract
  4 | import torch
  5 | from itertools import groupby
  6 | from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification, LayoutLMv2FeatureExtractor
  7 | from datasets import load_dataset
  8 | from PIL import Image, ImageDraw, ImageFont
  9 | 
 10 | def load_tags():
 11 | 
 12 |   datasets = load_dataset("nielsr/funsd")
 13 |   labels = datasets['train'].features['ner_tags'].feature.names
 14 |   
 15 |   return labels
 16 | 
 17 | def load_models():
 18 |   
 19 |   feature_extractor = LayoutLMv2FeatureExtractor("microsoft/layoutlmv3-base", apply_ocr=True)
 20 |   processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
 21 |   model = LayoutLMv3ForTokenClassification.from_pretrained("nielsr/layoutlmv3-finetuned-funsd")
 22 |   
 23 |   return feature_extractor, processor, model
 24 | 
 25 | def load_image(path):
 26 |   
 27 |   image = Image.open(path).convert("RGB")
 28 |   return image
 29 | 
 30 | def unnormalize_box(bbox, width, height):
 31 |  
 32 |   return [
 33 |          width * (bbox[0] / 1000),
 34 |          height * (bbox[1] / 1000),
 35 |          width * (bbox[2] / 1000),
 36 |          height * (bbox[3] / 1000),
 37 |      ]
 38 | 
 39 | def iob_to_label(label):
 40 |   
 41 |   label = label[2:]
 42 |   if not label:
 43 |     return 'other'
 44 |   return label
 45 | 
 46 | 
 47 | def process_image(image, feature_extractor, processor, model, labels):
 48 |   
 49 |   id2label = {v: k for v, k in enumerate(labels)}
 50 |   label2id = {k: v for v, k in enumerate(labels)}
 51 |   width, height = image.size
 52 | 
 53 |   encods = feature_extractor(image, return_tensors="pt")
 54 |   encoding = processor(image, truncation=True, return_offsets_mapping=True, return_tensors="pt")
 55 |   offset_mapping = encoding.pop("offset_mapping")
 56 | 
 57 |   outputs = model(**encoding)
 58 |   predictions = outputs.logits.argmax(-1).squeeze().tolist()
 59 |   token_boxes = encoding.bbox.squeeze().tolist()
 60 | 
 61 |   is_subword = np.array(offset_mapping.squeeze().tolist())[:, 0] != 0
 62 |   true_predictions = [id2label[pred] for idx, pred in enumerate(predictions) if not is_subword[idx]]
 63 |   true_boxes = [unnormalize_box(box, width, height) for idx, box in enumerate(token_boxes) if not is_subword[idx]]
 64 | 
 65 |   words = encods.words
 66 |   n = len(true_predictions) - len(words[0])
 67 |   k = int(n/2)
 68 |   true_predictions = true_predictions[k:-k]
 69 |   true_boxes = true_boxes[k:-k]
 70 |   
 71 |   l_words = []
 72 |   preds = []
 73 |   bboxes = []
 74 |   key_pairs = []
 75 | 
 76 |   for i in range(0, len(words[0])):
 77 |     json_dict = {}
 78 |     if true_predictions[i] not in ["O"]:
 79 |       if true_predictions[i] in ["B-HEADER", "I-HEADER"]:
 80 |         json_dict["label"] = "TITLE"
 81 |       elif true_predictions[i] in ["B-QUESTION", "I-QUESTION"]:
 82 |         json_dict["label"] = "KEY"
 83 |       else:
 84 |         json_dict["label"] = "VALUE"
 85 |       json_dict["value"] = words[0][i]
 86 |       key_pairs.append(json_dict)
 87 |       bboxes.append(true_boxes[i])
 88 |   
 89 |   return key_pairs, bboxes
 90 |   
 91 | def visualize_image(image, key_pairs, bboxes):
 92 | 
 93 |   draw = ImageDraw.Draw(image)
 94 |   font = ImageFont.load_default()
 95 |   label2color = {'KEY':'blue', 'VALUE':'green', 'TITLE':'orange'}
 96 |   
 97 |   for kp, box in enumerate(zip(key_pairs, bboxes)):
 98 |     draw.rectangle(box, outline=label2color[kp['label']])
 99 |     draw.text((box[0] + 10, box[1] - 10), text=kp['label'], fill=label2color[predicted_label], font=font)
100 | 
101 |   return image
102 | 


--------------------------------------------------------------------------------
/output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vishnunkumar/doc_transformers/ea52976d5232caff4ce13758309b0bfac57664c0/output.png


--------------------------------------------------------------------------------
/result.csv:
--------------------------------------------------------------------------------
 1 | LABEL,TEXT
 2 | title,CREDIT CARD VOUCHER ANY RESTAURANT
 3 | title,ANYWHERE
 4 | key,DATE:
 5 | value,02/02/2014
 6 | key,TIME:
 7 | value,11:11
 8 | key,CARD
 9 | key,TYPE:
10 | value,MC
11 | key,ACCT:
12 | value,XXXX XXXX XXXX
13 | value,1111
14 | key,TRANS
15 | key,KEY:
16 | value,HYU8789798234
17 | key,AUTH
18 | key,CODE:
19 | value,12345
20 | key,EXP
21 | key,DATE:
22 | value,XX/XX
23 | key,CHECK:
24 | value,1111
25 | key,TABLE:
26 | value,11/11
27 | key,SERVER:
28 | value,34
29 | value,MONIKA
30 | key,Subtotal:
31 | value,$1969
32 | value,.69
33 | key,Gratuity: Total:
34 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
# Inside of setup.cfg
# setuptools deprecates dash-separated option names; use the underscore form.
[metadata]
description_file = README.md
4 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open("README.md", "r") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | requirements = [            
 7 |   'transformers',
 8 |   'datasets',
 9 |   'pytesseract',
10 |   'pandas',
11 |   'numpy'
12 | ]
13 | 
14 | 
15 | setuptools.setup(
16 |     name="doc_transformers",
17 |     version="1.0.2",
18 |     author="Vishnu Nandakumar",
19 |     author_email="nkumarvishnu25@gmail.com",
20 |     description="Deep learning for document processing",
21 |     long_description=long_description,
22 |     long_description_content_type="text/markdown",
23 |     url = 'https://github.com/Vishnunkumar/doc_transformers/',
24 |     packages=[
25 |         'doc_transformers',
26 |     ],
27 |     package_dir={'doc_transformers': 'doc_transformers'},
28 |     package_data={
29 |         'doc_transformers': ['doc_transformers/*.py']
30 |     },
31 |     install_requires=requirements,
32 |     license="MIT license",
33 |     zip_safe=False,
34 |     keywords='doc_transformers',
35 |     classifiers=(
36 |     'Development Status :: 3 - Alpha',      # Chose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package
37 |     'Intended Audience :: Developers',      # Define that your audience are developers
38 |     'Topic :: Software Development :: Build Tools',
39 |     'License :: OSI Approved :: MIT License',   # Again, pick a license
40 |     'Programming Language :: Python :: 3',      #Specify which pyhton versions that you want to support
41 |     'Programming Language :: Python :: 3.4',
42 |     'Programming Language :: Python :: 3.5',
43 |     'Programming Language :: Python :: 3.6',
44 |     ),
45 | )
46 | 


--------------------------------------------------------------------------------