├── .gitignore
├── README.md
├── bin
    └── performOcr.py
├── gocr.env
├── install.sh
├── requirements.txt
├── samples
    ├── pdf
    │   └── shakespeares-2-pages.pdf
    └── sample.json
└── wget-install.sh


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | gocr.env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | /out/
162 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # GOKRa PDF (Google-OCR-a)
 2 | 
 3 | This is a file processor for PDFs. It is intended to be run as a cron job in order to batch process PDF files through
 4 | Google's Document AI and overlay the OCR text on top of the original. The PDF page images are re-encoded, so some image
 5 | quality loss may occur compared to the original.
 6 | 
 7 | This uses the synchronous API of Google's Document AI, so each PDF is limited to at most 15 pages.
 8 | 
 9 | # Potential Improvements
10 | * There is no attempt to match font style. Google's API does not return font information.
11 | * Text alignment is not perfect. It is good enough for my usage, but could be improved with some investment.
12 | * If the input PDF already has text on a page, copy the source page directly. 
13 | * Google seems to rotate the PDF image. This may have side effects. There is no handling for this one way or another
14 | * Create a watchdog to ingest once file is added to a directory
15 | 
16 | This was a weekend project. I did not think about long-term, but if you want to submit a 
17 | pull request to make this tool better, I will consider them on a case by case basis. 
18 | 
19 | # Installation
20 | ```
21 | cd wherever you want this
22 | git clone <this repo>
23 | add ./bin folder to your path
24 | ``` 
25 | 
26 | Create a new virtual environment, install the requirements into it, and create the settings file:
27 | ```
28 | python -m venv venv
29 | source venv/bin/activate
30 | pip install -r requirements.txt
31 | touch ~/.gocr.env
32 | vi ~/.gocr.env
33 | ```
34 | 
35 | Paste the contents and fill in with meaningful values for your cloud setup:
36 | ```
37 | GOOGLE_APPLICATION_CREDENTIALS="/path/to/my/key.json"
38 | PROJECT_ID = "MY_GOOGLE_PROJECT_ID"
39 | PROCESSOR_ID = "MY_DOCUMENTAI_PROCESSOR_ID"  # Create processor in Cloud Console
40 | LOCATION = "us"  # Format is 'us' or 'eu'
41 | ```
42 | 


--------------------------------------------------------------------------------
/bin/performOcr.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import io
  3 | import os
  4 | import sys
  5 | from io import BytesIO
  6 | from pathlib import Path
  7 | 
  8 | import fitz
  9 | import magic
 10 | from PIL import Image
 11 | from dotenv import load_dotenv
 12 | from google.api_core.client_options import ClientOptions
 13 | from google.cloud import documentai
 14 | 
 15 | load_dotenv(dotenv_path=os.path.join(Path.home(), ".gocr.env"))  # take environment variables from gocr.env.
 16 | 
 17 | # ARG 0 is the current script
 18 | # ARG 1 will be source directory
 19 | # ARG 2 will be destionation directory
 20 | SOURCE_DIR = os.path.join(os.getcwd(), sys.argv[1])
 21 | DEST_DIR = os.path.join(os.getcwd(), sys.argv[2])
 22 | print("USING SOURCE: " + SOURCE_DIR)
 23 | print("USING DEST: " + DEST_DIR)
 24 | 
 25 | # GOOGLE CLOUD Settings
 26 | PROJECT_ID = os.environ['PROJECT_ID']
 27 | LOCATION = os.environ['LOCATION']  # Format is 'us' or 'eu'
 28 | PROCESSOR_ID = os.environ['PROCESSOR_ID']  # Create processor in Cloud Console
 29 | 
 30 | # Instantiates a client
 31 | docai_client = documentai.DocumentProcessorServiceClient(
 32 |     client_options=ClientOptions(api_endpoint=f"{LOCATION}-documentai.googleapis.com")
 33 | )
 34 | 
 35 | 
 36 | def perform_ocr(input_file):
 37 |     print("Performing OCR on: " + input_file)
 38 |     source_mime_type = magic.from_file(input_file, mime=True)
 39 |     name = docai_client.processor_path(PROJECT_ID, LOCATION, PROCESSOR_ID)
 40 |     with open(input_file, "rb") as file:
 41 |         file_content = file.read()
 42 |         raw_document = documentai.RawDocument(content=file_content, mime_type=source_mime_type)
 43 |         request = documentai.ProcessRequest(name=name, raw_document=raw_document)
 44 |         response = docai_client.process_document(request=request)
 45 |         return response.document
 46 | 
 47 | 
 48 | def calculate_line_height(rectHeight, text):
 49 |     totallines = text.count('\n')
 50 |     lineheight = rectHeight/totallines
 51 |     return lineheight
 52 | 
 53 | 
 54 | def calculate_font_point_size(area_height_pixels, text):
 55 |     line_height = calculate_line_height(area_height_pixels, text)
 56 |     fontpointheight = line_height * 0.80
 57 |     return fontpointheight
 58 | 
 59 | 
 60 | def process_ocr_result(outputDocument, ocr_result):
 61 |     img_xref = 0
 62 |     ocrText = ocrResult.text
 63 |     pageNum = 0
 64 |     for ocrPage in ocrResult.pages:
 65 |         width = ocrPage.dimension.width
 66 |         height = ocrPage.dimension.height
 67 |         rect = fitz.Rect(0, 0, width, height)
 68 |         pdfPage = outputDocument.new_page(-1, width, height)
 69 | 
 70 |         # Re-encode the OCR Page image as a more compressed image for writing
 71 |         largeImageBytes = Image.open(BytesIO(ocrPage.image.content))
 72 |         compressedImageBytes = io.BytesIO()
 73 |         largeImageBytes.save(compressedImageBytes, format="JPEG", optimize=True, compress_level=9, quality=60)
 74 |         Image.open(BytesIO(compressedImageBytes.getvalue()))
 75 | 
 76 |         pdfPage.insert_image(rect, stream=compressedImageBytes)
 77 |         for block in ocrPage.blocks:
 78 |             for textSegment in block.layout.text_anchor.text_segments:
 79 |                 startIndexInt = int(textSegment.start_index)
 80 |                 endIndexInt = int(textSegment.end_index)
 81 |                 textString = ocrText[startIndexInt:endIndexInt:1]
 82 | 
 83 |                 verticesArray = block.layout.bounding_poly.vertices
 84 |                 upperLeft = fitz.Point(verticesArray[0].x, verticesArray[0].y)
 85 |                 upperRight = fitz.Point(verticesArray[1].x, verticesArray[1].y)
 86 |                 lowerRight = fitz.Point(verticesArray[2].x, verticesArray[2].y)
 87 |                 lowerLeft = fitz.Point(verticesArray[3].x, verticesArray[3].y)
 88 | 
 89 |                 rect = fitz.Rect(upperLeft, lowerRight)
 90 |                 fontSize = calculate_font_point_size(rect.height, textString)
 91 |                 lineHeightPixels = calculate_line_height(rect.height, textString)
 92 | 
 93 |                 textMetric = "S:" + str(fontSize) + " N:" + str(textString.count('\n')) + " H:" + str(lineHeightPixels)
 94 |                 # For Debugging. Draw Rectangles on OCR blocks and calculated text offsets
 95 |                 # pdfPage.draw_rect(rect=rect, color=getColor("RED"))
 96 |                 # pdfPage.insert_text((rect.x0, rect.y0+lineHeightPixels), textMetric, fontname="courier",
 97 |                 # fontsize=fontSize)
 98 |                 descentOffset = fitz.Font("courier").descender * lineHeightPixels
 99 |                 ascentOffset = fitz.Font("courier").ascender * lineHeightPixels
100 |                 pdfPage.insert_text((rect.x0, rect.y0+lineHeightPixels+descentOffset), textString, fontname="courier", fontsize=fontSize,
101 |                                     fill_opacity=0, stroke_opacity=0)
102 |                 pageNum += 1
103 |     return outputDocument
104 | 
105 | 
# START Main Script Section
# Process every file in SOURCE_DIR: OCR it, write the result under the same
# name in DEST_DIR, then delete the source file. Files whose output already
# exists are left untouched.
os.makedirs(DEST_DIR, exist_ok=True)  # saving fails if the folder is missing
for filename in os.listdir(SOURCE_DIR):
    filepath = os.path.join(SOURCE_DIR, filename)
    outputFileName = os.path.join(DEST_DIR, filename)
    if not os.path.isfile(filepath):
        # Sub-directories and other non-files cannot be sent to the OCR service.
        print(filepath + " is not a file. Skipping.")
        continue
    if os.path.exists(outputFileName):
        print(outputFileName + " exists. Skipping.")
        continue
    ocrResult = perform_ocr(filepath)
    outputDocument = process_ocr_result(fitz.open(), ocrResult)
    try:
        outputDocument.ez_save(outputFileName)
    finally:
        # Release the in-memory PDF even when saving fails.
        outputDocument.close()
    # Only reached after a successful save, so the source is never lost.
    os.remove(filepath)
# END Main Script Section


print("Document processing complete.")
121 | 


--------------------------------------------------------------------------------
/gocr.env:
--------------------------------------------------------------------------------
1 | #THIS IS A SAMPLE
2 | GOOGLE_APPLICATION_CREDENTIALS="/path/to/my/key.json"
3 | PROJECT_ID = "MY_GOOGLE_PROJECT_ID"
4 | PROCESSOR_ID = "MY_DOCUMENTAI_PROCESSOR_ID"  # Create processor in Cloud Console
5 | LOCATION = "us"  # Format is 'us' or 'eu'
6 | 


--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | #This script is intended to be run by first doing a wget on this file and executing it locally
 4 | #it will clone the git repo and copy over a sample environment file to your home directory
 5 | 
 6 | sudo apt install python3.10-venv python3 virtualenv pip git
 7 | git clone git@github.com:javanator/gocr.git
 8 | 
 9 | pushd gocr
10 | python3 -m venv venv && source venv/bin/activate && pip install -r requirements.txt
11 | if ! [ -f $HOME/.gocr.env ]; then
12 |   cp gocr.env ~/.gocr.env
13 |   echo "No ENV file found. Sample Copied"
14 |   echo "Edit $HOME/.gocr.env prior to use"
15 | fi
16 | popd
17 | 
18 | echo "Installation complete"
19 | echo "manually source $HOME/gocr/venv/activate prior to use"
20 | echo "manually Add $HOME/gocr/bin to PATH"
21 | 
22 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | google-cloud-core~=2.4.1
 2 | google-cloud-documentai~=2.21.1
 3 | google-cloud-storage~=2.14.0
 4 | google-cloud-documentai-toolbox~=0.12.0a0
 5 | simplejson~=3.19.2
 6 | python-dotenv~=1.0.1
 7 | hocr-tools~=1.1.1
 8 | Pillow~=10.2.0
 9 | pathlib
10 | PyMuPDF~=1.23.19
11 | reportlab~=4.0.9
12 | python-magic~=0.4.27
13 | virtualenv~=20.25.0


--------------------------------------------------------------------------------
/samples/pdf/shakespeares-2-pages.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/javanator/gocr/e109395c9f18bd13614ecd9ec90ed028b0503b4a/samples/pdf/shakespeares-2-pages.pdf


--------------------------------------------------------------------------------
/wget-install.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/javanator/gocr/e109395c9f18bd13614ecd9ec90ed028b0503b4a/wget-install.sh


--------------------------------------------------------------------------------