├── docs
    ├── usage.rst
    ├── authors.rst
    ├── history.rst
    ├── readme.rst
    ├── contributing.rst
    ├── modules.rst
    ├── Makefile
    ├── make.bat
    ├── google_drive_ocr.rst
    ├── installation.rst
    ├── index.rst
    └── conf.py
├── tests
    ├── __init__.py
    └── test_google_drive_ocr.py
├── requirements.txt
├── HISTORY.rst
├── requirements_dev.txt
├── AUTHORS.rst
├── MANIFEST.in
├── .editorconfig
├── .github
    ├── ISSUE_TEMPLATE.md
    ├── ISSUE_TEMPLATE
    │   ├── QUESTION.md
    │   ├── BUG_REPORT.md
    │   ├── DOCUMENTATION.md
    │   └── FEATURE_REQUEST.md
    └── PULL_REQUEST_TEMPLATE.md
├── google_drive_ocr
    ├── __init__.py
    ├── utils.py
    ├── errors.py
    ├── cli.py
    └── application.py
├── setup.cfg
├── tox.ini
├── .travis.yml
├── LICENSE
├── .gitignore
├── setup.py
├── Makefile
├── USAGE.rst
├── CONTRIBUTING.rst
└── README.rst


/docs/usage.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../USAGE.rst


--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../AUTHORS.rst
2 | 


--------------------------------------------------------------------------------
/docs/history.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../HISTORY.rst
2 | 


--------------------------------------------------------------------------------
/docs/readme.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../README.rst
2 | 


--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../CONTRIBUTING.rst
2 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Unit test package for google_drive_ocr."""
2 | 


--------------------------------------------------------------------------------
/docs/modules.rst:
--------------------------------------------------------------------------------
1 | google_drive_ocr
2 | ================
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    google_drive_ocr
8 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | google_auth_oauthlib>=0.4.1
2 | google_api_python_client>=2.9.0
3 | tqdm>=4.60.0
4 | natsort>=7.0.1
5 | pdf2image>=1.15.1
6 | ConfigArgParse>=1.4.1
7 | 


--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
 1 | History
 2 | =======
 3 | 
 4 | 0.2.0 (2021-06-29)
 5 | ------------------
 6 | 
 7 | * PDF file support
 8 | 
 9 | 0.1.0 (2021-06-14)
10 | ------------------
11 | 
12 | * First release on PyPI.
13 | 


--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
 1 | pip==19.2.3
 2 | bump2version==0.5.11
 3 | wheel==0.33.6
 4 | watchdog==0.9.0
 5 | flake8==3.7.8
 6 | tox==3.14.0
 7 | coverage==4.5.4
 8 | Sphinx==1.8.5
 9 | twine==1.14.0
10 | 
11 | pytest==6.2.4
12 | 


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
 1 | =======
 2 | Credits
 3 | =======
 4 | 
 5 | Development Lead
 6 | ----------------
 7 | 
 8 | * Hrishikesh Terdalkar <hrishikeshrt@linuxmail.org>
 9 | 
10 | Contributors
11 | ------------
12 | 
13 | None yet. Why not be the first?
14 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include AUTHORS.rst
 2 | include CONTRIBUTING.rst
 3 | include HISTORY.rst
 4 | include LICENSE
 5 | include README.rst
 6 | 
 7 | recursive-include tests *
 8 | recursive-exclude * __pycache__
 9 | recursive-exclude * *.py[co]
10 | 
11 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
12 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | # http://editorconfig.org
 2 | 
 3 | root = true
 4 | 
 5 | [*]
 6 | indent_style = space
 7 | indent_size = 4
 8 | trim_trailing_whitespace = true
 9 | insert_final_newline = true
10 | charset = utf-8
11 | end_of_line = lf
12 | 
13 | [*.bat]
14 | indent_style = tab
15 | end_of_line = crlf
16 | 
17 | [LICENSE]
18 | insert_final_newline = false
19 | 
20 | [Makefile]
21 | indent_style = tab
22 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | * Google OCR (Drive API v3) version:
 2 | * Python version:
 3 | * Operating System:
 4 | 
 5 | ### Description
 6 | 
 7 | Describe what you were trying to get done.
 8 | Tell us what happened, what went wrong, and what you expected to happen.
 9 | 
10 | ### What I Did
11 | 
12 | ```
13 | Paste the command(s) you ran and the output.
14 | If there was a crash, please include the traceback here.
15 | ```
16 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/QUESTION.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Question
 3 | about: Use this template to ask a specific question
 4 | title: "[QUESTION]"
 5 | labels: enhancement
 6 | assignees: ''
 7 | ---
 8 | 
 9 | * google-drive-ocr version:
10 | * Python version:
11 | * Operating System:
12 | 
13 | ### Question
14 | 
15 | Ask your question in a clear and concise manner.
16 | Mention if you have made any attempts to find the answer.
17 | 
18 | 


--------------------------------------------------------------------------------
/tests/test_google_drive_ocr.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """Tests for `google_drive_ocr` package."""
 4 | 
 5 | import pytest
 6 | 
 7 | # from google_drive_ocr.application import GoogleOCRApplication
 8 | 
 9 | 
10 | @pytest.fixture
11 | def response():
12 |     """Sample pytest fixture.
13 | 
14 |     See more at: http://doc.pytest.org/en/latest/fixture.html
15 |     """
16 |     pass
17 | 
18 | 
19 | def test_create_application():
20 |     pass
21 | 


--------------------------------------------------------------------------------
/google_drive_ocr/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """Google OCR (Drive API v3)."""
 4 | 
 5 | ###############################################################################
 6 | 
 7 | __author__ = """Hrishikesh Terdalkar"""
 8 | __email__ = 'hrishikeshrt@linuxmail.org'
 9 | __version__ = '0.2.6'
10 | 
11 | ###############################################################################
12 | 
13 | from .application import GoogleOCRApplication  # noqa
14 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/BUG_REPORT.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug Report
 3 | about: Use this template for reporting a bug
 4 | title: "BUG: [INFORMATIVE TITLE]"
 5 | labels: bug
 6 | assignees: ''
 7 | ---
 8 | 
 9 | * google-drive-ocr version:
10 | * Python version:
11 | * Operating System:
12 | 
13 | ### Description
14 | 
15 | Describe what you were trying to get done.
16 | Tell us what happened, what went wrong, and what you expected to happen.
17 | 
18 | ### What I Did
19 | 
20 | ```
21 | Paste the command(s) you ran and the output.
22 | If there was a crash, please include the traceback here.
23 | ```
24 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/DOCUMENTATION.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Documentation
 3 | about: Use this template for suggesting or requesting a change or an addition to the documentation
 4 | title: "[INFORMATIVE TITLE]"
 5 | labels: documentation
 6 | assignees: ''
 7 | ---
 8 | 
 9 | * google-drive-ocr version:
10 | * Python version:
11 | * Operating System:
12 | 
13 | ### Description
14 | 
15 | Mention the module, class or specific location that you think requires a change or an addition in the documentation.
16 | 
17 | ### Documentation
18 | 
19 | If you can, please provide a sample documentation text.
20 | 
21 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 0.2.6
 3 | commit = True
 4 | tag = True
 5 | message = build(release): bump version {current_version} → {new_version}
 6 | tag_message = build(release): bump version {current_version} → {new_version}
 7 | 
 8 | [bumpversion:file:setup.py]
 9 | search = version='{current_version}'
10 | replace = version='{new_version}'
11 | 
12 | [bumpversion:file:google_drive_ocr/__init__.py]
13 | search = __version__ = '{current_version}'
14 | replace = __version__ = '{new_version}'
15 | 
16 | [bdist_wheel]
17 | universal = 1
18 | 
19 | [flake8]
20 | exclude = docs
21 | 
22 | [tool:pytest]
23 | collect_ignore = ['setup.py']
24 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py36, py37, py38, flake8
 3 | 
 4 | [travis]
 5 | python =
 6 |     3.8: py38
 7 |     3.7: py37
 8 |     3.6: py36
 9 | 
10 | [testenv:flake8]
11 | basepython = python
12 | deps = flake8
13 | commands = flake8 google_drive_ocr tests
14 | 
15 | [testenv]
16 | setenv =
17 |     PYTHONPATH = {toxinidir}
18 | deps =
19 |     -r{toxinidir}/requirements_dev.txt
20 | ; If you want to make tox run the tests with the same versions, create a
21 | ; requirements.txt with the pinned versions and uncomment the following line:
22 | ;     -r{toxinidir}/requirements.txt
23 | commands =
24 |     pip install -U pip
25 |     pytest --basetemp={envtmpdir}
26 | 
27 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = python -msphinx
 7 | SPHINXPROJ    = google_drive_ocr
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/FEATURE_REQUEST.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature Request
 3 | about: Use this template for requesting new features
 4 | title: "[FEATURE NAME]"
 5 | labels: enhancement
 6 | assignees: ''
 7 | ---
 8 | 
 9 | * google-drive-ocr version:
10 | * Python version:
11 | * Operating System:
12 | 
13 | 
14 | ### Problem
15 | 
16 | If your feature request is related to a problem, please describe the problem in a clear and concise manner. 
17 | 
18 | ### Feature Description
19 | 
20 | Describe in detail the feature or functionality that you want added.
21 | You may also describe any alternative solutions or features you've considered.
22 | 
23 | ### Reasons
24 | 
25 | Make a case for why this feature would improve the quality of the project.
26 | 
27 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # Config file for automatic testing at travis-ci.com
 2 | 
 3 | language: python
 4 | python:
 5 |   - 3.8
 6 |   - 3.7
 7 |   - 3.6
 8 | 
 9 | # Command to install dependencies, e.g. pip install -r requirements.txt
10 | install: pip install -U tox-travis
11 | 
12 | # Command to run tests, e.g. python setup.py test
13 | script: tox
14 | 
15 | # Assuming you have installed the travis-ci CLI tool, after you
16 | # create the Github repo and add it to Travis, run the
17 | # following command to finish PyPI deployment setup:
18 | # $ travis encrypt --add deploy.password
19 | deploy:
20 |   provider: pypi
21 |   distributions: sdist bdist_wheel
22 |   user: hrishikeshrt
23 |   password:
24 |     secure: PLEASE_REPLACE_ME
25 |   on:
26 |     tags: true
27 |     repo: hrishikeshrt/google_drive_ocr
28 |     python: 3.8
29 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=python -msphinx
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=google_drive_ocr
13 | 
14 | if "%1" == "" goto help
15 | 
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | 	echo.
19 | 	echo.The Sphinx module was not found. Make sure you have Sphinx installed,
20 | 	echo.then set the SPHINXBUILD environment variable to point to the full
21 | 	echo.path of the 'sphinx-build' executable. Alternatively you may add the
22 | 	echo.Sphinx directory to PATH.
23 | 	echo.
24 | 	echo.If you don't have Sphinx installed, grab it from
25 | 	echo.http://sphinx-doc.org/
26 | 	exit /b 1
27 | )
28 | 
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 | 
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 | 
35 | :end
36 | popd
37 | 


--------------------------------------------------------------------------------
/docs/google_drive_ocr.rst:
--------------------------------------------------------------------------------
 1 | google\_drive\_ocr package
 2 | ==========================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | google\_drive\_ocr.application module
 8 | -------------------------------------
 9 | 
10 | .. automodule:: google_drive_ocr.application
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | google\_drive\_ocr.cli module
16 | -----------------------------
17 | 
18 | .. automodule:: google_drive_ocr.cli
19 |    :members:
20 |    :undoc-members:
21 |    :show-inheritance:
22 | 
23 | google\_drive\_ocr.errors module
24 | --------------------------------
25 | 
26 | .. automodule:: google_drive_ocr.errors
27 |    :members:
28 |    :undoc-members:
29 |    :show-inheritance:
30 | 
31 | google\_drive\_ocr.utils module
32 | -------------------------------
33 | 
34 | .. automodule:: google_drive_ocr.utils
35 |    :members:
36 |    :undoc-members:
37 |    :show-inheritance:
38 | 
39 | Module contents
40 | ---------------
41 | 
42 | .. automodule:: google_drive_ocr
43 |    :members:
44 |    :undoc-members:
45 |    :show-inheritance:
46 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | <!--- Provide a general summary of your changes in the Title above -->
 2 | 
 3 | ## Description
 4 | <!--- Describe your changes in detail -->
 5 | 
 6 | ## Motivation and Context
 7 | <!--- Why is this change required? What problem does it solve? -->
 8 | <!--- If it fixes an open issue, please link to the issue here. -->
 9 | 
10 | ## How has this been tested?
11 | <!--- Please describe in detail how you tested your changes. -->
12 | <!--- Include details of your testing environment, tests ran to see how -->
13 | <!--- your change affects other areas of the code, etc. -->
14 | 
15 | ## Screenshots (if appropriate):
16 | 
17 | ## Types of changes
18 | <!--- What types of changes does your code introduce? Put an `x` in all the boxes that apply: -->
19 | - [ ] Bug fix (non-breaking change which fixes an issue)
20 | - [ ] New feature (non-breaking change which adds functionality)
21 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
22 | 
23 | ## Checklist:
24 | <!--- Go over all the following points, and put an `x` in all the boxes that apply. -->
25 | <!--- If you're unsure about any of these, don't hesitate to ask. We're here to help! -->
26 | - [ ] My code follows the code style of this project.
27 | - [ ] My change requires a change to the documentation.
28 | - [ ] I have updated the documentation accordingly.
29 | 
30 | 


--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
 1 | .. highlight:: shell
 2 | 
 3 | ============
 4 | Installation
 5 | ============
 6 | 
 7 | 
 8 | Stable release
 9 | --------------
10 | 
11 | To install Google OCR (Drive API v3), run this command in your terminal:
12 | 
13 | .. code-block:: console
14 | 
15 |     $ pip install google_drive_ocr
16 | 
17 | This is the preferred method to install Google OCR (Drive API v3), as it will always install the most recent stable release.
18 | 
19 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide
20 | you through the process.
21 | 
22 | .. _pip: https://pip.pypa.io
23 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
24 | 
25 | 
26 | From sources
27 | ------------
28 | 
29 | The sources for Google OCR (Drive API v3) can be downloaded from the `Github repo`_.
30 | 
31 | You can either clone the public repository:
32 | 
33 | .. code-block:: console
34 | 
35 |     $ git clone https://github.com/hrishikeshrt/google_drive_ocr
36 | 
37 | Or download the `tarball`_:
38 | 
39 | .. code-block:: console
40 | 
41 |     $ curl -OJL https://github.com/hrishikeshrt/google_drive_ocr/tarball/master
42 | 
43 | Once you have a copy of the source, you can install it with:
44 | 
45 | .. code-block:: console
46 | 
47 |     $ python setup.py install
48 | 
49 | 
50 | .. _Github repo: https://github.com/hrishikeshrt/google_drive_ocr
51 | .. _tarball: https://github.com/hrishikeshrt/google_drive_ocr/tarball/master
52 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | GNU GENERAL PUBLIC LICENSE
 2 |                       Version 3, 29 June 2007
 3 | 
 4 |     Perform OCR using Google's Drive API v3
 5 |     Copyright (C) 2021  Hrishikesh Terdalkar
 6 | 
 7 |     This program is free software: you can redistribute it and/or modify
 8 |     it under the terms of the GNU General Public License as published by
 9 |     the Free Software Foundation, either version 3 of the License, or
10 |     (at your option) any later version.
11 | 
12 |     This program is distributed in the hope that it will be useful,
13 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
14 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 |     GNU General Public License for more details.
16 | 
17 |     You should have received a copy of the GNU General Public License
18 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 | 
20 | Also add information on how to contact you by electronic and paper mail.
21 | 
22 |   You should also get your employer (if you work as a programmer) or school,
23 | if any, to sign a "copyright disclaimer" for the program, if necessary.
24 | For more information on this, and how to apply and follow the GNU GPL, see
25 | <http://www.gnu.org/licenses/>.
26 | 
27 |   The GNU General Public License does not permit incorporating your program
28 | into proprietary programs.  If your program is a subroutine library, you
29 | may consider it more useful to permit linking proprietary applications with
30 | the library.  If this is what you want to do, use the GNU Lesser General
31 | Public License instead of this License.  But first, please read
32 | <http://www.gnu.org/philosophy/why-not-lgpl.html>.
33 | 
34 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | 
 58 | # Flask stuff:
 59 | instance/
 60 | .webassets-cache
 61 | 
 62 | # Scrapy stuff:
 63 | .scrapy
 64 | 
 65 | # Sphinx documentation
 66 | docs/_build/
 67 | 
 68 | # PyBuilder
 69 | target/
 70 | 
 71 | # Jupyter Notebook
 72 | .ipynb_checkpoints
 73 | 
 74 | # pyenv
 75 | .python-version
 76 | 
 77 | # celery beat schedule file
 78 | celerybeat-schedule
 79 | 
 80 | # SageMath parsed files
 81 | *.sage.py
 82 | 
 83 | # dotenv
 84 | .env
 85 | 
 86 | # virtualenv
 87 | .venv
 88 | venv/
 89 | ENV/
 90 | 
 91 | # Spyder project settings
 92 | .spyderproject
 93 | .spyproject
 94 | 
 95 | # Rope project settings
 96 | .ropeproject
 97 | 
 98 | # mkdocs documentation
 99 | /site
100 | 
101 | # mypy
102 | .mypy_cache/
103 | 
104 | # IDE settings
105 | .vscode/


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | Welcome to Google OCR (Drive API v3)'s documentation!
 2 | =====================================================
 3 | 
 4 | .. image:: https://img.shields.io/pypi/v/google_drive_ocr?color=success
 5 |         :target: https://pypi.python.org/pypi/google_drive_ocr
 6 | 
 7 | .. image:: https://readthedocs.org/projects/google-drive-ocr/badge/?version=latest
 8 |         :target: https://google-drive-ocr.readthedocs.io/en/latest/?version=latest
 9 |         :alt: Documentation Status
10 | 
11 | .. image:: https://img.shields.io/pypi/pyversions/google_drive_ocr
12 |         :target: https://pypi.python.org/pypi/google_drive_ocr
13 |         :alt: Python Version Support
14 | 
15 | .. image:: https://img.shields.io/github/issues/hrishikeshrt/google_drive_ocr
16 |         :target: https://github.com/hrishikeshrt/google_drive_ocr/issues
17 |         :alt: GitHub Issues
18 | 
19 | .. image:: https://img.shields.io/github/followers/hrishikeshrt?style=social
20 |         :target: https://github.com/hrishikeshrt
21 |         :alt: GitHub Followers
22 | 
23 | .. image:: https://img.shields.io/twitter/follow/hrishikeshrt?style=social
24 |         :target: https://twitter.com/hrishikeshrt
25 |         :alt: Twitter Followers
26 | 
27 | 
28 | Perform OCR using Google's Drive API v3
29 | 
30 | 
31 | * Free software: GNU General Public License v3
32 | * Documentation: https://google-drive-ocr.readthedocs.io.
33 | 
34 | Features
35 | ========
36 | 
37 | * Perform OCR using Google's Drive API v3
38 | * Class :code:`GoogleOCRApplication()` for use in projects
39 | * Highly configurable CLI
40 | * Run OCR on a single image file
41 | * Run OCR on multiple image files
42 | * Run OCR on all images in directory
43 | * Use multiple workers (:code:`multiprocessing`)
44 | * Work on a PDF document directly
45 | 
46 | 
47 | .. toctree::
48 |    :maxdepth: 2
49 |    :caption: Contents:
50 | 
51 |    readme
52 |    installation
53 |    usage
54 |    modules
55 |    contributing
56 |    authors
57 |    history
58 | 
59 | Indices and tables
60 | ==================
61 | * :ref:`genindex`
62 | * :ref:`modindex`
63 | * :ref:`search`
64 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """The setup script."""
 4 | 
 5 | from setuptools import setup, find_packages
 6 | 
 7 | with open('README.rst') as readme_file:
 8 |     readme = readme_file.read()
 9 | 
10 | with open('HISTORY.rst') as history_file:
11 |     history = history_file.read()
12 | 
13 | requirements = [
14 |     'google_api_python_client>=2.9.0',
15 |     'google_auth_oauthlib>=0.4.1',
16 |     'tqdm>=4.60.0',
17 |     'natsort>=7.0.1',
18 |     'pdf2image>=1.15.1',
19 |     'ConfigArgParse>=1.4.1'
20 | ]
21 | 
22 | test_requirements = ['pytest>=3', ]
23 | 
24 | setup(
25 |     author="Hrishikesh Terdalkar",
26 |     author_email='hrishikeshrt@linuxmail.org',
27 |     python_requires='>=3.6',
28 |     classifiers=[
29 |         'Development Status :: 4 - Beta',
30 |         'Intended Audience :: Developers',
31 |         'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
32 |         'Natural Language :: English',
33 |         'Topic :: Software Development :: Libraries :: Python Modules',
34 |         'Topic :: Software Development :: User Interfaces',
35 |         'Topic :: Utilities',
36 |         'Programming Language :: Python :: 3',
37 |         'Programming Language :: Python :: 3.6',
38 |         'Programming Language :: Python :: 3.7',
39 |         'Programming Language :: Python :: 3.8',
40 |         'Programming Language :: Python :: 3.9'
41 |     ],
42 |     description="Perform OCR using Google's Drive API v3",
43 |     entry_points={
44 |         'console_scripts': [
45 |             'google-ocr=google_drive_ocr.cli:main',
46 |         ],
47 |     },
48 |     install_requires=requirements,
49 |     license="GNU General Public License v3",
50 |     long_description=readme + '\n\n' + history,
51 |     include_package_data=True,
52 |     keywords='google_drive_ocr',
53 |     name='google_drive_ocr',
54 |     packages=find_packages(include=['google_drive_ocr', 'google_drive_ocr.*']),
55 |     test_suite='tests',
56 |     tests_require=test_requirements,
57 |     url='https://github.com/hrishikeshrt/google_drive_ocr',
58 |     version='0.2.6',
59 |     zip_safe=False,
60 | )
61 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: clean clean-test clean-pyc clean-build docs help
 2 | .DEFAULT_GOAL := help
 3 | 
 4 | define BROWSER_PYSCRIPT
 5 | import os, webbrowser, sys
 6 | 
 7 | from urllib.request import pathname2url
 8 | 
 9 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
10 | endef
11 | export BROWSER_PYSCRIPT
12 | 
13 | define PRINT_HELP_PYSCRIPT
14 | import re, sys
15 | 
16 | for line in sys.stdin:
17 | 	match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
18 | 	if match:
19 | 		target, help = match.groups()
20 | 		print("%-20s %s" % (target, help))
21 | endef
22 | export PRINT_HELP_PYSCRIPT
23 | 
24 | BROWSER := python -c "$$BROWSER_PYSCRIPT"
25 | 
26 | help:
27 | 	@python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
28 | 
29 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
30 | 
31 | clean-build: ## remove build artifacts
32 | 	rm -fr build/
33 | 	rm -fr dist/
34 | 	rm -fr .eggs/
35 | 	find . -name '*.egg-info' -exec rm -fr {} +
36 | 	find . -name '*.egg' -exec rm -f {} +
37 | 
38 | clean-pyc: ## remove Python file artifacts
39 | 	find . -name '*.pyc' -exec rm -f {} +
40 | 	find . -name '*.pyo' -exec rm -f {} +
41 | 	find . -name '*~' -exec rm -f {} +
42 | 	find . -name '__pycache__' -exec rm -fr {} +
43 | 
44 | clean-test: ## remove test and coverage artifacts
45 | 	rm -fr .tox/
46 | 	rm -f .coverage
47 | 	rm -fr htmlcov/
48 | 	rm -fr .pytest_cache
49 | 
50 | lint: ## check style with flake8
51 | 	flake8 google_drive_ocr tests
52 | 
53 | test: ## run tests quickly with the default Python
54 | 	pytest
55 | 
56 | test-all: ## run tests on every Python version with tox
57 | 	tox
58 | 
59 | coverage: ## check code coverage quickly with the default Python
60 | 	coverage run --source google_drive_ocr -m pytest
61 | 	coverage report -m
62 | 	coverage html
63 | 	$(BROWSER) htmlcov/index.html
64 | 
65 | docs: ## generate Sphinx HTML documentation, including API docs
66 | 	rm -f docs/google_drive_ocr.rst
67 | 	rm -f docs/modules.rst
68 | 	sphinx-apidoc -o docs/ google_drive_ocr
69 | 	$(MAKE) -C docs clean
70 | 	$(MAKE) -C docs html
71 | 	$(BROWSER) docs/_build/html/index.html
72 | 
73 | servedocs: docs ## compile the docs watching for changes
74 | 	watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
75 | 
76 | release: dist ## package and upload a release
77 | 	twine upload dist/*
78 | 
79 | dist: clean ## builds source and wheel package
80 | 	python setup.py sdist
81 | 	python setup.py bdist_wheel
82 | 	ls -l dist
83 | 
84 | install: clean ## install the package to the active Python's site-packages
85 | 	python setup.py install
86 | 


--------------------------------------------------------------------------------
/USAGE.rst:
--------------------------------------------------------------------------------
  1 | Usage
  2 | =====
  3 | 
  4 | Using in a Project
  5 | ------------------
  6 | 
  7 | Create a :code:`GoogleOCRApplication` application instance:
  8 | 
  9 | .. code-block:: python
 10 | 
 11 |     from google_drive_ocr import GoogleOCRApplication
 12 | 
 13 |     app = GoogleOCRApplication('client_secret.json')
 14 | 
 15 | Perform OCR on a single image:
 16 | 
 17 | .. code-block:: python
 18 | 
 19 |     app.perform_ocr('image.png')
 20 | 
 21 | 
 22 | Perform OCR on multiple images:
 23 | 
 24 | .. code-block:: python
 25 | 
 26 |     app.perform_ocr_batch(['image_1.png', 'image_2.png', 'image_3.png'])
 27 | 
 28 | Perform OCR on multiple images using multiple workers (:code:`multiprocessing`):
 29 | 
 30 | .. code-block:: python
 31 | 
 32 |     app.perform_ocr_batch(['image_1.png', 'image_3.png', 'image_2.png'], workers=2)
 33 | 
 34 | 
 35 | Using Command Line Interface
 36 | ----------------------------
 37 | 
 38 | Typical usage with several options:
 39 | 
 40 | .. code-block:: console
 41 | 
 42 |     google-ocr --client-secret client_secret.json \
 43 |     --upload-folder-id <google-drive-folder-id>  \
 44 |     --image-dir images/ --extension .jpg \
 45 |     --workers 4 --no-keep
 46 | 
 47 | Show help message with the full set of options:
 48 | 
 49 | .. code-block:: console
 50 | 
 51 |     google-ocr --help
 52 | 
 53 | Configuration
 54 | ^^^^^^^^^^^^^
 55 | 
 56 | The default location for configuration is :code:`~/.gdo.cfg`.
 57 | If configuration is written to this location with a set of options,
 58 | we don't have to specify those options again on the subsequent runs.
 59 | 
 60 | Save configuration and exit:
 61 | 
 62 | .. code-block:: console
 63 | 
 64 |     google-ocr --client-secret client_secret.json --write-config ~/.gdo.cfg
 65 | 
 66 | 
 67 | Read configuration from a custom location (if it was written to a custom location):
 68 | 
 69 | .. code-block:: console
 70 | 
 71 |     google-ocr --config ~/.my_config_file ..
 72 | 
 73 | Performing OCR
 74 | ^^^^^^^^^^^^^^
 75 | 
 76 | **Note**: It is assumed that the :code:`client-secret` option is saved in the configuration file.
 77 | 
 78 | Single image file:
 79 | 
 80 | .. code-block:: console
 81 | 
 82 |     google-ocr -i image.png
 83 | 
 84 | Multiple image files:
 85 | 
 86 | .. code-block:: console
 87 | 
 88 |     google-ocr -b image_1.png image_2.png image_3.png
 89 | 
 90 | All image files from a directory with a specific extension:
 91 | 
 92 | .. code-block:: console
 93 | 
 94 |     google-ocr --image-dir images/ --extension .png
 95 | 
 96 | Multiple workers (:code:`multiprocessing`):
 97 | 
 98 | .. code-block:: console
 99 | 
100 |     google-ocr -b image_1.png image_2.png image_3.png --workers 2
101 | 
102 | PDF files:
103 | 
104 | .. code-block:: console
105 | 
106 |     google-ocr --pdf document.pdf --pages 1-3 5 7-10 13
107 | 
108 | 
109 | 
110 | **Note**:
111 | You must set up a Google application and download the :code:`client_secret.json` file before using :code:`google_drive_ocr`.
112 | 
113 | Setup Instructions
114 | ==================
115 | 
116 | Create a project on Google Cloud Platform
117 | 
118 | **Wizard**: https://console.developers.google.com/start/api?id=drive
119 | 
120 | **Instructions**:
121 | 
122 |     * https://cloud.google.com/genomics/downloading-credentials-for-api-access
123 |     * Select application type as "Installed Application"
124 |     * Create credentials OAuth consent screen --> OAuth client ID
125 |     * Save :code:`client_secret.json`
126 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
  1 | .. highlight:: shell
  2 | 
  3 | ============
  4 | Contributing
  5 | ============
  6 | 
  7 | Contributions are welcome, and they are greatly appreciated! Every little bit
  8 | helps, and credit will always be given.
  9 | 
 10 | You can contribute in many ways:
 11 | 
 12 | Types of Contributions
 13 | ----------------------
 14 | 
 15 | Report Bugs
 16 | ~~~~~~~~~~~
 17 | 
 18 | Report bugs at https://github.com/hrishikeshrt/google_drive_ocr/issues.
 19 | 
 20 | If you are reporting a bug, please include:
 21 | 
 22 | * Your operating system name and version.
 23 | * Any details about your local setup that might be helpful in troubleshooting.
 24 | * Detailed steps to reproduce the bug.
 25 | 
 26 | Fix Bugs
 27 | ~~~~~~~~
 28 | 
 29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
 30 | wanted" is open to whoever wants to implement it.
 31 | 
 32 | Implement Features
 33 | ~~~~~~~~~~~~~~~~~~
 34 | 
 35 | Look through the GitHub issues for features. Anything tagged with "enhancement"
 36 | and "help wanted" is open to whoever wants to implement it.
 37 | 
 38 | Write Documentation
 39 | ~~~~~~~~~~~~~~~~~~~
 40 | 
 41 | Google OCR (Drive API v3) could always use more documentation, whether as part of the
 42 | official Google OCR (Drive API v3) docs, in docstrings, or even on the web in blog posts,
 43 | articles, and such.
 44 | 
 45 | Submit Feedback
 46 | ~~~~~~~~~~~~~~~
 47 | 
 48 | The best way to send feedback is to file an issue at https://github.com/hrishikeshrt/google_drive_ocr/issues.
 49 | 
 50 | If you are proposing a feature:
 51 | 
 52 | * Explain in detail how it would work.
 53 | * Keep the scope as narrow as possible, to make it easier to implement.
 54 | * Remember that this is a volunteer-driven project, and that contributions
 55 |   are welcome :)
 56 | 
 57 | Get Started!
 58 | ------------
 59 | 
 60 | Ready to contribute? Here's how to set up `google_drive_ocr` for local development.
 61 | 
 62 | 1. Fork the `google_drive_ocr` repo on GitHub.
 63 | 2. Clone your fork locally::
 64 | 
 65 |     $ git clone git@github.com:your_name_here/google_drive_ocr.git
 66 | 
 67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
 68 | 
 69 |     $ mkvirtualenv google_drive_ocr
 70 |     $ cd google_drive_ocr/
 71 |     $ python setup.py develop
 72 | 
 73 | 4. Create a branch for local development::
 74 | 
 75 |     $ git checkout -b name-of-your-bugfix-or-feature
 76 | 
 77 |    Now you can make your changes locally.
 78 | 
 79 | 5. When you're done making changes, check that your changes pass flake8 and the
 80 |    tests, including testing other Python versions with tox::
 81 | 
 82 |     $ flake8 google_drive_ocr tests
 83 |     $ python setup.py test or pytest
 84 |     $ tox
 85 | 
 86 |    To get flake8 and tox, just pip install them into your virtualenv.
 87 | 
 88 | 6. Commit your changes and push your branch to GitHub::
 89 | 
 90 |     $ git add .
 91 |     $ git commit -m "Your detailed description of your changes."
 92 |     $ git push origin name-of-your-bugfix-or-feature
 93 | 
 94 | 7. Submit a pull request through the GitHub website.
 95 | 
 96 | Pull Request Guidelines
 97 | -----------------------
 98 | 
 99 | Before you submit a pull request, check that it meets these guidelines:
100 | 
101 | 1. The pull request should include tests.
102 | 2. If the pull request adds functionality, the docs should be updated. Put
103 |    your new functionality into a function with a docstring, and add the
104 |    feature to the list in README.rst.
105 | 3. The pull request should work for Python 3.5, 3.6, 3.7 and 3.8, and for PyPy. Check
106 |    https://travis-ci.com/hrishikeshrt/google_drive_ocr/pull_requests
107 |    and make sure that the tests pass for all supported Python versions.
108 | 
109 | Tips
110 | ----
111 | 
112 | To run a subset of tests::
113 | 
114 |     $ pytest tests/test_google_drive_ocr.py
115 | 
116 | 
117 | Deploying
118 | ---------
119 | 
120 | A reminder for the maintainers on how to deploy.
121 | Make sure all your changes are committed (including an entry in HISTORY.rst).
122 | Then run::
123 | 
124 |     $ bump2version patch # possible: major / minor / patch
125 |     $ git push
126 |     $ git push --tags
127 | 
128 | Travis will then deploy to PyPI if tests pass.
129 | 


--------------------------------------------------------------------------------
/google_drive_ocr/utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Utility Functions
  5 | """
  6 | 
  7 | import os
  8 | import logging
  9 | from collections.abc import Iterable
 10 | from typing import Generator, Iterator, List, Set, Tuple
 11 | 
 12 | from pdf2image import convert_from_path
 13 | from pdf2image.generators import threadsafe
 14 | 
 15 | ###############################################################################
 16 | 
 17 | LOGGER = logging.getLogger(__name__)
 18 | 
 19 | ###############################################################################
 20 | 
 21 | 
 22 | def get_files(topdir: str, extn: str) -> Generator[str, None, None]:
 23 |     """
 24 |     Search :code:`topdir` recursively for all files with extension :code:`extn`
 25 | 
 26 |     extension is checked with :code:`str.endswith()`, instead of the supposedly
 27 |     better :code:`os.path.splitext()`, in order to facilitate the search with
 28 |     multiple dots in the :code:`extn`
 29 | 
 30 |     i.e.
 31 |     :code:`>>> get_files(topdir, ".xyz.txt")`
 32 |     wouldn't have worked as expected if :code:`splitext()` was used.
 33 | 
 34 |     Parameters
 35 |     ----------
 36 |     topdir : str
 37 |         Path of the directory to search files in
 38 |     extn : str
 39 |         Extension to look for
 40 | 
 41 |     Returns
 42 |     -------
 43 |     Generator[str, None, None]
 44 |         Matching file paths
 45 |     """
 46 |     return (
 47 |         os.path.join(dirpath, name)
 48 |         for dirpath, dirnames, files in os.walk(topdir)
 49 |         for name in files
 50 |         if name.lower().endswith(extn.lower())
 51 |     )
 52 | 
 53 | 
 54 | ###############################################################################
 55 | # PDF Utils
 56 | 
 57 | 
 58 | def list_to_range(list_of_int: List[int]) -> List[Tuple[int, int]]:
 59 |     """Convert a list of integers into a list of ranges
 60 | 
 61 |     A range is tuple (start, end)
 62 | 
 63 |     Parameters
 64 |     ----------
 65 |     list_of_int : List[int]
 66 |         List of integers
 67 | 
 68 |     Returns
 69 |     -------
 70 |     List[Tuple[int, int]]
 71 |         List of ranges
 72 |     """
 73 |     ranges = []
 74 |     start, end = None, None
 75 |     last = None
 76 |     for current in sorted(set(list_of_int)):
 77 |         if current == int(current):
 78 |             current = int(current)
 79 |         else:
 80 |             continue
 81 |         if last is None:
 82 |             start = current
 83 |             last = current
 84 |         else:
 85 |             if current != last + 1:
 86 |                 end = last
 87 |                 ranges.append((start, end))
 88 |                 start = current
 89 |             last = current
 90 |     ranges.append((start, last))
 91 |     return ranges
 92 | 
 93 | 
# Static Name Generator
@threadsafe
def static_generator(prefix):
    """Endless generator that yields the same :code:`prefix` every time.

    :code:`extract_pages()` passes an instance of this generator as the
    :code:`output_file` argument of :code:`pdf2image.convert_from_path()`.

    NOTE(review): :code:`threadsafe` comes from :code:`pdf2image.generators`;
    presumably it guards the generator for use from several threads --
    confirm against the pdf2image documentation.
    """
    while True:
        yield prefix
 99 | 
100 | 
def extract_pages(
    pdf_path: str,
    pages: List[int] = None
) -> Set[str]:
    """Extract pages from a PDF file as image files

    Pages are saved in the same directory as the PDF file,
    with the suffix :code:`.page-[number].jpg`

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file
    pages : List[int], optional
        Page numbers to extract (any iterable of integers is accepted).
        Consecutive page numbers are grouped into ranges with
        :code:`list_to_range()` and each range is converted in one call.
        If None, all pages will be extracted.
        The default is None.

    Returns
    -------
    Set[str]
        Set of paths to extracted pages
    """
    pdf_path = os.path.realpath(pdf_path)
    output_path = os.path.dirname(pdf_path)
    output_name, _ = os.path.splitext(os.path.basename(pdf_path))

    if isinstance(pages, Iterable):
        # Materialise first: generators and other one-shot iterables have no
        # len() and could not be traversed a second time by list_to_range()
        pages = list(pages)
        LOGGER.info(f"Extracting {len(pages)} pages from '{pdf_path}' ..")
        ranges = list_to_range(pages)
    else:
        LOGGER.info(f"Extracting all pages from '{pdf_path}' ..")
        # (None, None) leaves first_page / last_page unset, i.e. whole file
        ranges = [(None, None)]

    paths = set()
    for _start, _end in ranges:
        _paths = convert_from_path(
            pdf_path=pdf_path,
            output_folder=output_path,
            first_page=_start,
            last_page=_end,
            fmt="jpeg",
            jpegopt={"quality": 100, "progressive": True, "optimize": True},
            # Same base name for every page of every range
            output_file=static_generator(f"{output_name}.page"),
            # Ask for file paths instead of loaded image objects
            paths_only=True,
        )
        paths.update(_paths)
        if _start is not None and _end is not None:
            LOGGER.info(f"Extracted {len(_paths)} pages: {_start} to {_end}.")
        else:
            LOGGER.info(f"Extracted {len(_paths)} pages.")
    return paths
153 | 
154 | 
155 | ###############################################################################
156 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | =========================
  2 | Google OCR (Drive API v3)
  3 | =========================
  4 | 
  5 | 
  6 | .. image:: https://img.shields.io/pypi/v/google_drive_ocr?color=success
  7 |         :target: https://pypi.python.org/pypi/google_drive_ocr
  8 | 
  9 | .. image:: https://readthedocs.org/projects/google-drive-ocr/badge/?version=latest
 10 |         :target: https://google-drive-ocr.readthedocs.io/en/latest/?version=latest
 11 |         :alt: Documentation Status
 12 | 
 13 | .. image:: https://img.shields.io/pypi/pyversions/google_drive_ocr
 14 |         :target: https://pypi.python.org/pypi/google_drive_ocr
 15 |         :alt: Python Version Support
 16 | 
 17 | .. image:: https://img.shields.io/github/issues/hrishikeshrt/google_drive_ocr
 18 |         :target: https://github.com/hrishikeshrt/google_drive_ocr/issues
 19 |         :alt: GitHub Issues
 20 | 
 21 | .. image:: https://img.shields.io/github/followers/hrishikeshrt?style=social
 22 |         :target: https://github.com/hrishikeshrt
 23 |         :alt: GitHub Followers
 24 | 
 25 | .. image:: https://img.shields.io/twitter/follow/hrishikeshrt?style=social
 26 |         :target: https://twitter.com/hrishikeshrt
 27 |         :alt: Twitter Followers
 28 | 
 29 | 
 30 | Perform OCR using Google's Drive API v3
 31 | 
 32 | 
 33 | * Free software: GNU General Public License v3
 34 | * Documentation: https://google-drive-ocr.readthedocs.io.
 35 | 
 36 | Features
 37 | ========
 38 | 
 39 | * Perform OCR using Google's Drive API v3
 40 | * Class :code:`GoogleOCRApplication()` for use in projects
 41 | * Highly configurable CLI
 42 | * Run OCR on a single image file
 43 | * Run OCR on multiple image files
 44 | * Run OCR on all images in directory
 45 | * Use multiple workers (:code:`multiprocessing`)
 46 | * Work on a PDF document directly
 47 | 
 48 | Usage
 49 | =====
 50 | 
 51 | Using in a Project
 52 | ------------------
 53 | 
 54 | Create a :code:`GoogleOCRApplication` application instance:
 55 | 
 56 | .. code-block:: python
 57 | 
 58 |     from google_drive_ocr import GoogleOCRApplication
 59 | 
 60 |     app = GoogleOCRApplication('client_secret.json')
 61 | 
 62 | Perform OCR on a single image:
 63 | 
 64 | .. code-block:: python
 65 | 
 66 |     app.perform_ocr('image.png')
 67 | 
 68 | 
 69 | Perform OCR on multiple images:
 70 | 
 71 | .. code-block:: python
 72 | 
 73 |     app.perform_ocr_batch(['image_1.png', 'image_2.png', 'image_3.png'])
 74 | 
 75 | Perform OCR on multiple images using multiple workers (:code:`multiprocessing`):
 76 | 
 77 | .. code-block:: python
 78 | 
 79 |     app.perform_ocr_batch(['image_1.png', 'image_3.png', 'image_2.png'], workers=2)
 80 | 
 81 | 
 82 | Using Command Line Interface
 83 | ----------------------------
 84 | 
 85 | Typical usage with several options:
 86 | 
 87 | .. code-block:: console
 88 | 
 89 |     google-ocr --client-secret client_secret.json \
 90 |     --upload-folder-id <google-drive-folder-id>  \
 91 |     --image-dir images/ --extension .jpg \
 92 |     --workers 4 --no-keep
 93 | 
 94 | Show help message with the full set of options:
 95 | 
 96 | .. code-block:: console
 97 | 
 98 |     google-ocr --help
 99 | 
100 | Configuration
101 | ^^^^^^^^^^^^^
102 | 
103 | The default location for configuration is :code:`~/.gdo.cfg`.
104 | If configuration is written to this location with a set of options,
105 | we don't have to specify those options again on the subsequent runs.
106 | 
107 | Save configuration and exit:
108 | 
109 | .. code-block:: console
110 | 
111 |     google-ocr --client-secret client_secret.json --write-config ~/.gdo.cfg
112 | 
113 | 
114 | Read configuration from a custom location (if it was written to a custom location):
115 | 
116 | .. code-block:: console
117 | 
118 |     google-ocr --config ~/.my_config_file ..
119 | 
120 | Performing OCR
121 | ^^^^^^^^^^^^^^
122 | 
123 | **Note**: It is assumed that the :code:`client-secret` option is saved in the configuration file.
124 | 
125 | Single image file:
126 | 
127 | .. code-block:: console
128 | 
129 |     google-ocr -i image.png
130 | 
131 | Multiple image files:
132 | 
133 | .. code-block:: console
134 | 
135 |     google-ocr -b image_1.png image_2.png image_3.png
136 | 
137 | All image files from a directory with a specific extension:
138 | 
139 | .. code-block:: console
140 | 
141 |     google-ocr --image-dir images/ --extension .png
142 | 
143 | Multiple workers (:code:`multiprocessing`):
144 | 
145 | .. code-block:: console
146 | 
147 |     google-ocr -b image_1.png image_2.png image_3.png --workers 2
148 | 
149 | PDF files:
150 | 
151 | .. code-block:: console
152 | 
153 |     google-ocr --pdf document.pdf --pages 1-3 5 7-10 13
154 | 
155 | 
156 | **Note**:
157 | You must set up a Google application and download the :code:`client_secret.json` file before using :code:`google_drive_ocr`.
158 | 
159 | Setup Instructions
160 | ==================
161 | 
162 | Create a project on Google Cloud Platform
163 | 
164 | **Wizard**: https://console.developers.google.com/start/api?id=drive
165 | 
166 | **Instructions**:
167 | 
168 |     * https://cloud.google.com/genomics/downloading-credentials-for-api-access
169 |     * Select application type as "Installed Application"
170 |     * Create credentials OAuth consent screen --> OAuth client ID
171 |     * Save :code:`client_secret.json`
172 | 


--------------------------------------------------------------------------------
/google_drive_ocr/errors.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | HTTP Errors
  5 | ===========
  6 | 
  7 | List of HTTP errors that can be fixed in most cases by trying again.
  8 | 
  9 | Provides a :code:`@retry` decorator, which applies exponential backoff
 10 | to a function.
 11 | """
 12 | 
 13 | import math
 14 | import time
 15 | import random
 16 | import logging
 17 | import functools
 18 | from typing import Any, Callable
 19 | 
 20 | from googleapiclient.errors import HttpError
 21 | 
 22 | ###############################################################################
 23 | 
 24 | logger = logging.getLogger(__name__)
 25 | 
 26 | ###############################################################################
 27 | # https://developers.google.com/drive/api/v3/handle-errors
 28 | 
# HTTP status codes that are worth retrying, each mapped to the error
# descriptions associated with that status.
# Within this module only the keys are consulted (`retry()` checks whether
# the response status is a key); the description strings are informational.
# Commented-out entries are statuses that are deliberately not retried.
RETRY_ERRORS = {
    # 400: ["Bad request", "Invalid sharing request"],
    # 401: ["Invalid credentials"],
    403: ["Usage limit exceeded", "Daily limit exceeded",
          "Number of items in folder", "User rate limit exceeded",
          "Rate limit exceeded", "Sharing rate limit exceeded",
          "The user has not granted the app access to the file",
          "The user does not have sufficient permissions for the file",
          "App cannot be used within the authenticated user's domain"],
    404: ["File not found"],
    429: ["Too many requests"],
    500: ["Backend error"],
    502: ["Bad Gateway"],
    503: ["Service Unavailable"],
    504: ["Gateway Timeout"]
}
 45 | 
 46 | ###############################################################################
 47 | 
 48 | 
 49 | def retry(
 50 |     attempts: int = 4,
 51 |     delay: int = 1,
 52 |     backoff: int = 2,
 53 |     hook: Callable[[int, Exception, int], Any] = None
 54 | ) -> Callable:
 55 |     """
 56 |     Decorator to Retry with Exponential Backoff (on Exception)
 57 | 
 58 |     A function that raises an exception on failure, when decorated with this
 59 |     decorator, will retry till it returns True or number of attempts runs out.
 60 | 
 61 |     The decorator will call the function up to :code:`attempts` times if it
 62 |     raises an exception.
 63 | 
 64 |     By default it catches instances of the Exception class and subclasses.
 65 |     This will recover after all but the most fatal errors. You may specify a
 66 |     custom tuple of exception classes with the :code:`exceptions` argument;
 67 |     the function will only be retried if it raises one of the specified
 68 |     exceptions.
 69 | 
 70 |     Additionally you may specify a hook function which will be called prior
 71 |     to retrying with the number of remaining tries and the exception instance;
 72 |     This is primarily intended to give the opportunity to log the failure.
 73 |     Hook is not called after failure if no retries remain.
 74 | 
 75 |     Parameters
 76 |     ----------
 77 |     attempts : int, optional
 78 |         Number of attempts in case of failure.
 79 |         The default is 4.
 80 |     delay : int, optional
 81 |         Intinitial delay in seconds
 82 |         The default is 1.
 83 |     backoff : int, optional
 84 |         Backoff multiplication factor
 85 |         The default is 2.
 86 |     hook : Callable[[int, Exception, int], Any], optional
 87 |         Function with the parameters `(tries_remaining, exception, delay)`
 88 |         The default is None.
 89 | 
 90 |     Returns
 91 |     -------
 92 |     Callable
 93 |         Decorator function
 94 | 
 95 |     Raises
 96 |     ------
 97 |     ValueError
 98 |         If the :code:`backoff` multiplication factor is less than 1.
 99 |     ValueError
100 |         If the number of :code:`attempts` is less than 0.
101 |     ValueError
102 |         If the initial :code:`delay` is less than or equal to 0.
103 |     """
104 | 
105 |     if backoff <= 1:
106 |         raise ValueError("Backoff must be greater than 1")
107 |     attempts = math.floor(attempts)
108 |     if attempts < 0:
109 |         raise ValueError("Attempts must be 0 or greater")
110 |     if delay <= 0:
111 |         raise ValueError("Delay must be greater than 0")
112 | 
113 |     def decorator(func):
114 |         # ------------------------------------------------------------------- #
115 |         @functools.wraps(func)
116 |         def wrapper(*args, **kwargs):
117 |             _delay = delay
118 |             for tries_remaining in range(attempts, -1, -1):
119 |                 try:
120 |                     return func(*args, **kwargs)
121 |                 except HttpError as error:
122 |                     if error.resp.status in RETRY_ERRORS:
123 |                         if tries_remaining > 0:
124 |                             if hook is not None:
125 |                                 hook(tries_remaining, error, _delay)
126 |                             logger.warning(
127 |                                 f"{error.resp.status}: {error.resp.reason}"
128 |                             )
129 |                             logger.info(f"Retrying in {_delay} seconds ..")
130 |                             time.sleep(_delay + random.random())
131 |                             _delay *= backoff
132 |                         else:
133 |                             logger.error("Failed number of attempts exceeded.")
134 |                     else:
135 |                         logger.error(
136 |                             f"{error.resp.status}: {error.resp.reason}"
137 |                         )
138 |                         raise
139 |                 else:
140 |                     break
141 |         # ------------------------------------------------------------------- #
142 |         return wrapper
143 |     return decorator
144 | 
145 | ###############################################################################
146 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #
  3 | # google_drive_ocr documentation build configuration file, created by
  4 | # sphinx-quickstart on Fri Jun  9 13:47:02 2017.
  5 | #
  6 | # This file is execfile()d with the current directory set to its
  7 | # containing dir.
  8 | #
  9 | # Note that not all possible configuration values are present in this
 10 | # autogenerated file.
 11 | #
 12 | # All configuration values have a default; values that are commented out
 13 | # serve to show the default.
 14 | 
 15 | # If extensions (or modules to document with autodoc) are in another
 16 | # directory, add these directories to sys.path here. If the directory is
 17 | # relative to the documentation root, use os.path.abspath to make it
 18 | # absolute, like shown here.
 19 | #
 20 | import os
 21 | import sys
 22 | sys.path.insert(0, os.path.abspath('..'))
 23 | 
 24 | import google_drive_ocr
 25 | 
 26 | # -- General configuration ---------------------------------------------
 27 | 
 28 | # If your documentation needs a minimal Sphinx version, state it here.
 29 | #
 30 | # needs_sphinx = '1.0'
 31 | 
 32 | # Add any Sphinx extension module names here, as strings. They can be
 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 34 | extensions = [
 35 |     'sphinx.ext.autodoc',
 36 |     'sphinx.ext.napoleon',
 37 |     'sphinx.ext.viewcode',
 38 |     'sphinx_rtd_theme'
 39 | ]
 40 | 
 41 | # Autodoc settings
 42 | autoclass_content = 'both'
 43 | autodoc_member_order = 'bysource'
 44 | 
 45 | # Napoleon settings
 46 | napoleon_numpy_docstring = True
 47 | napoleon_include_init_with_doc = True
 48 | napoleon_include_private_with_doc = True
 49 | napoleon_include_special_with_doc = True
 50 | napoleon_use_param = True
 51 | napoleon_use_rtype = True
 52 | 
 53 | # Add any paths that contain templates here, relative to this directory.
 54 | templates_path = ['_templates']
 55 | 
 56 | # The suffix(es) of source filenames.
 57 | # You can specify multiple suffix as a list of string:
 58 | #
 59 | # source_suffix = ['.rst', '.md']
 60 | source_suffix = '.rst'
 61 | 
 62 | # The master toctree document.
 63 | master_doc = 'index'
 64 | 
 65 | # General information about the project.
 66 | project = 'Google OCR (Drive API v3)'
 67 | copyright = "2022, Hrishikesh Terdalkar"
 68 | author = "Hrishikesh Terdalkar"
 69 | 
 70 | # The version info for the project you're documenting, acts as replacement
 71 | # for |version| and |release|, also used in various other places throughout
 72 | # the built documents.
 73 | #
 74 | # The short X.Y version.
 75 | version = google_drive_ocr.__version__
 76 | # The full version, including alpha/beta/rc tags.
 77 | release = google_drive_ocr.__version__
 78 | 
 79 | # The language for content autogenerated by Sphinx. Refer to documentation
 80 | # for a list of supported languages.
 81 | #
 82 | # This is also used if you do content translation via gettext catalogs.
 83 | # Usually you set "language" from the command line for these cases.
 84 | language = None
 85 | 
 86 | # List of patterns, relative to source directory, that match files and
 87 | # directories to ignore when looking for source files.
 88 | # These patterns also affect html_static_path and html_extra_path
 89 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 90 | 
 91 | # The name of the Pygments (syntax highlighting) style to use.
 92 | pygments_style = 'sphinx'
 93 | 
 94 | # If true, `todo` and `todoList` produce output, else they produce nothing.
 95 | todo_include_todos = False
 96 | 
 97 | 
 98 | # -- Options for HTML output -------------------------------------------
 99 | 
100 | # The theme to use for HTML and HTML Help pages.  See the documentation for
101 | # a list of builtin themes.
102 | #
103 | # html_theme = 'alabaster'
104 | html_theme = 'sphinx_rtd_theme'
105 | 
106 | # Theme options are theme-specific and customize the look and feel of a
107 | # theme further.  For a list of options available for each theme, see the
108 | # documentation.
109 | #
110 | # html_theme_options = {}
111 | 
112 | # Add any paths that contain custom static files (such as style sheets) here,
113 | # relative to this directory. They are copied after the builtin static files,
114 | # so a file named "default.css" will overwrite the builtin "default.css".
115 | html_static_path = ['_static']
116 | 
117 | 
118 | # -- Options for HTMLHelp output ---------------------------------------
119 | 
120 | # Output file base name for HTML help builder.
121 | htmlhelp_basename = 'google_drive_ocrdoc'
122 | 
123 | 
124 | # -- Options for LaTeX output ------------------------------------------
125 | 
126 | latex_elements = {
127 |     # The paper size ('letterpaper' or 'a4paper').
128 |     #
129 |     # 'papersize': 'letterpaper',
130 | 
131 |     # The font size ('10pt', '11pt' or '12pt').
132 |     #
133 |     # 'pointsize': '10pt',
134 | 
135 |     # Additional stuff for the LaTeX preamble.
136 |     #
137 |     # 'preamble': '',
138 | 
139 |     # Latex figure (float) alignment
140 |     #
141 |     # 'figure_align': 'htbp',
142 | }
143 | 
144 | # Grouping the document tree into LaTeX files. List of tuples
145 | # (source start file, target name, title, author, documentclass
146 | # [howto, manual, or own class]).
147 | latex_documents = [
148 |     (master_doc, 'google_drive_ocr.tex',
149 |      'Google OCR (Drive API v3) Documentation',
150 |      'Hrishikesh Terdalkar', 'manual'),
151 | ]
152 | 
153 | 
154 | # -- Options for manual page output ------------------------------------
155 | 
156 | # One entry per manual page. List of tuples
157 | # (source start file, name, description, authors, manual section).
158 | man_pages = [
159 |     (master_doc, 'google_drive_ocr',
160 |      'Google OCR (Drive API v3) Documentation',
161 |      [author], 1)
162 | ]
163 | 
164 | 
165 | # -- Options for Texinfo output ----------------------------------------
166 | 
167 | # Grouping the document tree into Texinfo files. List of tuples
168 | # (source start file, target name, title, author,
169 | #  dir menu entry, description, category)
170 | texinfo_documents = [
171 |     (master_doc, 'google_drive_ocr',
172 |      'Google OCR (Drive API v3) Documentation',
173 |      author,
174 |      'google_drive_ocr',
175 |      "Perform OCR using Google's Drive API v3",
176 |      'Miscellaneous'),
177 | ]
178 | 
179 | 
180 | 
181 | 


--------------------------------------------------------------------------------
/google_drive_ocr/cli.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Console script for Google OCR (Drive API v3)
  5 | """
  6 | 
  7 | ###############################################################################
  8 | 
  9 | import re
 10 | import sys
 11 | import time
 12 | import logging
 13 | 
 14 | import configargparse
 15 | 
 16 | ###############################################################################
 17 | 
 18 | from . import __version__
 19 | from .application import GoogleOCRApplication
 20 | from .utils import get_files, extract_pages
 21 | 
 22 | ###############################################################################
 23 | 
 24 | ROOT_LOGGER = logging.getLogger()
 25 | ROOT_LOGGER.hasHandlers() or ROOT_LOGGER.addHandler(logging.StreamHandler())
 26 | 
 27 | ###############################################################################
 28 | 
 29 | 
 30 | def main():
 31 |     # ----------------------------------------------------------------------- #
 32 |     # Default Config
 33 | 
 34 |     class Config:
 35 |         image = None
 36 |         batch = None
 37 |         image_dir = None
 38 |         output = None
 39 |         extension = ".png"
 40 |         suffix = ".google.txt"
 41 |         pdf = None
 42 | 
 43 |         pages = None
 44 |         client_secret = None
 45 |         upload_folder_id = None
 46 | 
 47 |         workers = 1
 48 |         no_keep = False
 49 |         verbose = False
 50 |         debug = False
 51 | 
 52 |     # ----------------------------------------------------------------------- #
 53 | 
 54 |     p = configargparse.ArgumentParser(
 55 |         default_config_files=["~/.gdo.cfg"],
 56 |         auto_env_var_prefix="GDO_",
 57 |         description="Google OCR using Drive API v3",
 58 |         args_for_setting_config_path=["-c", "--config"],
 59 |         config_arg_help_message="Read configuration from file",
 60 |         args_for_writing_out_config_file=["-w", "--write-config"],
 61 |         write_out_config_file_arg_help_message="Write configuration file"
 62 |     )
 63 |     p.add_argument("--client-secret", required=True,
 64 |                    help="Path to client secret file")
 65 |     p.add_argument("-i", "--image", help="Path to a single image file")
 66 |     p.add_argument("-b", "--batch", nargs="+", help="Paths image files")
 67 |     p.add_argument("-d", "--image-dir", help="Path to image directory")
 68 |     p.add_argument("-o", "--output",
 69 |                    help="Path to output file (only valid with `-i`)")
 70 |     p.add_argument("-x", "--extension",
 71 |                    help="Extension to look in image directory")
 72 |     p.add_argument("-s", "--suffix", help="Suffix for the output files")
 73 |     p.add_argument("--pdf", help="Path to PDF file")
 74 |     p.add_argument("--pages", nargs="*",
 75 |                    help="Pages from PDF to extract and OCR")
 76 |     p.add_argument("--upload-folder-id",
 77 |                    help="Google Drive folder id to upload files to")
 78 |     p.add_argument("--workers", type=int,
 79 |                    help="Number of workers (multiprocessing)")
 80 |     p.add_argument("--no-keep", action="store_true",
 81 |                    help="Delete file from Google Drive after OCR is performed")
 82 |     p.add_argument("--verbose", action="store_true", help="Verbose output")
 83 |     p.add_argument("--debug", action="store_true", help="Debug mode")
 84 |     p.add_argument("--version", action="version",
 85 |                    version=f"%(prog)s {__version__}")
 86 | 
 87 |     p.parse_args(namespace=Config)
 88 | 
 89 |     # ----------------------------------------------------------------------- #
 90 | 
 91 |     disable_tqdm = True
 92 |     if Config.debug:
 93 |         ROOT_LOGGER.setLevel(logging.DEBUG)
 94 |     elif Config.verbose:
 95 |         ROOT_LOGGER.setLevel(logging.INFO)
 96 |     else:
 97 |         disable_tqdm = False
 98 | 
 99 |     # ----------------------------------------------------------------------- #
100 | 
101 |     ROOT_LOGGER.debug(Config.__dict__)
102 | 
103 |     # ----------------------------------------------------------------------- #
104 | 
105 |     if (
106 |         Config.image is None
107 |         and Config.batch is None
108 |         and Config.image_dir is None
109 |         and Config.pdf is None
110 |     ):
111 |         p.print_help()
112 |         return 1
113 | 
114 |     # ----------------------------------------------------------------------- #
115 |     # Create Application Instance
116 | 
117 |     app = GoogleOCRApplication(
118 |         client_secret=Config.client_secret,
119 |         upload_folder_id=Config.upload_folder_id,
120 |         temporary_upload=Config.no_keep,
121 |         ocr_suffix=Config.suffix
122 |     )
123 | 
124 |     # ----------------------------------------------------------------------- #
125 |     # Single image file
126 | 
127 |     if Config.image is not None:
128 |         t_start = time.perf_counter()
129 |         output_path = (
130 |             app.get_output_path(Config.image)
131 |             if Config.output is None
132 |             else Config.output
133 |         )
134 | 
135 |         status = app.perform_ocr(Config.image, output_path=output_path)
136 |         t_finish = time.perf_counter()
137 |         print(f"{status.value} ({t_finish-t_start:.4f} seconds)")
138 | 
139 |         with open(output_path, "r", encoding="utf-8") as f:
140 |             print(f.read())
141 |         return 0
142 | 
143 |     # ----------------------------------------------------------------------- #
144 |     # Multiple images
145 | 
146 |     image_files = []
147 |     # Multiple images on command line
148 |     if Config.batch is not None:
149 |         image_files = Config.batch
150 | 
151 |     # Find images from a directory
152 |     if Config.image_dir is not None:
153 |         image_files = get_files(Config.image_dir, Config.extension)
154 | 
155 |     # Extract pages from a PDF file
156 |     if Config.pdf is not None:
157 |         if Config.pages is not None:
158 |             pages = []
159 |             for page in Config.pages:
160 |                 if page.isdigit():
161 |                     pages.append(int(page))
162 | 
163 |                 m = re.match(r"^(\d+)-(\d+)$", page)
164 |                 if m:
165 |                     pages.extend(range(int(m.group(1)), int(m.group(2)) + 1))
166 |         else:
167 |             pages = None
168 |         image_files = extract_pages(Config.pdf, pages=pages)
169 | 
170 |     if image_files:
171 |         app.perform_ocr_batch(
172 |             image_files,
173 |             workers=Config.workers,
174 |             disable_tqdm=disable_tqdm
175 |         )
176 |         return 0
177 | 
178 |     return 1
179 | 
180 | ###############################################################################
181 | 
182 | 
if __name__ == "__main__":
    # Use the return value of main() as the process exit status.
    exit_status = main()  # pragma: no cover
    sys.exit(exit_status)  # pragma: no cover
185 | 


--------------------------------------------------------------------------------
/google_drive_ocr/application.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | Google OCR Application
  5 | ======================
  6 | 
  7 | Create a project on Google Cloud Platform
  8 | -----------------------------------------
  9 | 
 10 | Wizard: https://console.developers.google.com/start/api?id=drive
 11 | 
 12 | **Instructions**:
 13 | 
 14 | * https://cloud.google.com/genomics/downloading-credentials-for-api-access
 15 | * Select application type as "Installed Application"
 16 | * Create credentials OAuth consent screen --> OAuth client ID
 17 | * Save client_secret.json
 18 | 
 19 | References
 20 | ----------
 21 | 
 22 | * https://developers.google.com/api-client-library/python/start/get_started
 23 | * https://developers.google.com/drive/v3/reference/
 24 | * https://developers.google.com/drive/v3/web/quickstart/python
 25 | """
 26 | 
 27 | ###############################################################################
 28 | 
 29 | 
 30 | import io
 31 | import os
 32 | import time
 33 | import enum
 34 | import logging
 35 | import mimetypes
 36 | import multiprocessing as mp
 37 | from dataclasses import dataclass, field
 38 | 
 39 | from tqdm import tqdm
 40 | from tqdm.contrib.logging import logging_redirect_tqdm
 41 | from natsort import natsorted
 42 | 
 43 | from googleapiclient.discovery import build
 44 | from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload
 45 | 
 46 | from google_auth_oauthlib.flow import InstalledAppFlow
 47 | from google.auth.transport.requests import Request
 48 | from google.oauth2.credentials import Credentials
 49 | 
 50 | ###############################################################################
 51 | 
 52 | from .errors import retry
 53 | 
 54 | ###############################################################################
 55 | 
 56 | LOGGER = logging.getLogger(__name__)
 57 | 
 58 | ###############################################################################
 59 | 
 60 | SCOPES = ["https://www.googleapis.com/auth/drive"]
 61 | 
 62 | ###############################################################################
 63 | 
 64 | 
 65 | class Status(enum.Enum):
 66 |     SUCCESS = "Done!"
 67 |     ALREADY = "Already done!"
 68 |     ERROR = "Something went wrong!"
 69 | 
 70 | ###############################################################################
 71 | 
 72 | 
 73 | @dataclass
 74 | class GoogleOCRApplication:
 75 |     """
 76 |     Google OCR Application
 77 | 
 78 |     Perform OCR using Google-Drive API v3
 79 |     """
 80 |     client_secret: str
 81 |     upload_folder_id: str = field(default=None)
 82 |     ocr_suffix: str = field(default=".google.txt")
 83 |     temporary_upload: bool = field(default=False)
 84 | 
 85 |     credentials_path: str = field(default=None, repr=False)
 86 |     scopes: str = field(default=None)
 87 | 
 88 |     def __post_init__(self):
 89 |         if self.scopes is None:
 90 |             self.scopes = SCOPES
 91 | 
 92 |         if self.credentials_path is None:
 93 |             self.credentials_path = os.path.join(
 94 |                 os.path.expanduser("~"), ".credentials", "token.json"
 95 |             )
 96 |         if self.upload_folder_id is None:
 97 |             self.upload_folder_id = "root"
 98 |         creds = self.get_credentials()
 99 |         self.drive_service = build("drive", "v3", credentials=creds)
100 | 
101 |     def get_output_path(self, img_path: str) -> str:
102 |         """Get the output path
103 | 
104 |         Output path is constructed by replacing the extension
105 |         in :code:`img_path` with :code:`ocr_suffix`
106 | 
107 |         Parameters
108 |         ----------
109 |         img_path : str
110 |             Path to the input image file
111 | 
112 |         Returns
113 |         -------
114 |         str
115 |             Output path
116 |         """
117 |         _img_path, _ = os.path.splitext(img_path)
118 |         return f"{_img_path}{self.ocr_suffix}"
119 | 
120 |     def get_credentials(self) -> Credentials:
121 |         """Get valid user credentials
122 | 
123 |         If no (valid) credentials are available,
124 |         * Log the user in
125 |         * Store the credentials for future use
126 | 
127 |         Returns
128 |         -------
129 |         Credentials or None
130 |             Valid user credentials
131 |         """
132 |         if os.path.isfile(self.credentials_path):
133 |             creds = Credentials.from_authorized_user_file(
134 |                 self.credentials_path, self.scopes
135 |             )
136 |         else:
137 |             credential_dir = os.path.dirname(self.credentials_path)
138 |             os.makedirs(credential_dir, exist_ok=True)
139 |             creds = None
140 | 
141 |         # If there are no (valid) credentials available, let the user log in.
142 |         if not creds or not creds.valid:
143 |             if creds and creds.expired and creds.refresh_token:
144 |                 creds.refresh(Request())
145 |             else:
146 |                 flow = InstalledAppFlow.from_client_secrets_file(
147 |                     client_secrets_file=self.client_secret,
148 |                     scopes=self.scopes
149 |                 )
150 |                 creds = flow.run_local_server(port=0)
151 |             # Save the credentials for the next run
152 |             LOGGER.info(f"Storing credentials to {self.credentials_path}")
153 |             with open(self.credentials_path, "w") as token:
154 |                 token.write(creds.to_json())
155 | 
156 |         return creds
157 | 
158 |     # ----------------------------------------------------------------------- #
159 |     # Drive Actions
160 | 
161 |     @retry()
162 |     def upload_image_as_document(self, img_path: str) -> str:
163 |         """Upload an image file as a Google Document
164 | 
165 |         Parameters
166 |         ----------
167 |         img_path : str
168 |             Path to the image file
169 | 
170 |         Returns
171 |         -------
172 |         str
173 |             ID of the uploaded Google document
174 |         """
175 |         img_filename = os.path.basename(img_path)
176 |         mimetype, _encoding = mimetypes.guess_type(img_path)
177 | 
178 |         if mimetype is None:
179 |             LOGGER.warning("MIME type of the image could not be inferred.")
180 |             mimetype = "image/png"
181 | 
182 |         file_metadata = {
183 |             "name": img_filename,
184 |             "mimeType": "application/vnd.google-apps.document",
185 |             "parents": [self.upload_folder_id],
186 |         }
187 | 
188 |         media = MediaFileUpload(img_path, mimetype=mimetype)
189 |         file = self.drive_service.files().create(
190 |             body=file_metadata, media_body=media, fields="id, name"
191 |         ).execute()
192 |         file_id = file.get("id")
193 |         file_name = file.get("name")
194 |         LOGGER.info(f"File uploaded: '{file_name}' (id: '{file_id}')")
195 |         return file_id
196 | 
197 |     @retry()
198 |     def download_document_as_text(self, file_id: str, output_path: str):
199 |         """Download a Google Document as text
200 | 
201 |         Parameters
202 |         ----------
203 |         file_id : str
204 |             ID of the Google document
205 |         output_path : str
206 |             Path to where the document should be downloaded
207 |         """
208 |         request = self.drive_service.files().export_media(
209 |             fileId=file_id, mimeType="text/plain"
210 |         )
211 |         fh = io.FileIO(output_path, "wb")
212 |         downloader = MediaIoBaseDownload(fh, request)
213 |         done = False
214 |         while done is False:
215 |             status, done = downloader.next_chunk()
216 |         LOGGER.info(f"Document downloaded: '{output_path}'.")
217 | 
218 |     @retry()
219 |     def delete_file(self, file_id: str):
220 |         """Delete a file from Google Drive
221 | 
222 |         Parameters
223 |         ----------
224 |         file_id : str
225 |             ID of the file on Google Drive to be deleted
226 |         """
227 |         self.drive_service.files().delete(fileId=file_id).execute()
228 |         LOGGER.info(f"File '{file_id}' deleted from Google Drive.")
229 | 
230 |     def perform_ocr(self, img_path: str, output_path: str = None) -> Status:
231 |         """
232 |         Perform OCR on a single image
233 | 
234 |         * Upload the image to Google Drive as google-document
235 |         * [Google adds OCR layer to the image]
236 |         * Download the google-document as plain text
237 | 
238 |         Parameters
239 |         ----------
240 |         img_path: str or Path
241 |             Path to the image file
242 |         output_path: str or Path, optional
243 |             Path where the OCR text should be stored
244 |             If None, a new file will be created beside the image
245 |             The default is None.
246 | 
247 |         Returns
248 |         -------
249 |         status: Status
250 |             Status of the OCR operation
251 |         """
252 |         if output_path is None:
253 |             output_path = self.get_output_path(img_path)
254 | 
255 |         if os.path.isfile(output_path):
256 |             return Status.ALREADY
257 | 
258 |         try:
259 |             file_id = self.upload_image_as_document(img_path)
260 |             if file_id:
261 |                 self.download_document_as_text(file_id, output_path)
262 | 
263 |                 if self.temporary_upload:
264 |                     self.delete_file(file_id)
265 |             else:
266 |                 LOGGER.error(f"Could not upload '{img_path}'.")
267 |                 return Status.ERROR
268 |         except Exception:
269 |             LOGGER.exception("An error occurred while performing OCR.")
270 |             return Status.ERROR
271 | 
272 |         return Status.SUCCESS
273 | 
274 |     def _worker_ocr_batch(self, worker_arguments: dict) -> float:
275 |         """Worker to perform OCR on multiple files
276 | 
277 |         Parameters
278 |         ----------
279 |         worker_arguments : dict
280 |             Arguments for the worker
281 | 
282 |         Returns
283 |         -------
284 |         float
285 |             Time taken in seconds
286 |         """
287 |         process = mp.current_process()
288 |         worker_id = worker_arguments["worker_id"]
289 |         image_files = worker_arguments["image_files"]
290 |         disable_tqdm = worker_arguments.get("disable_tqdm")
291 |         LOGGER.info(f"Process started. (PID: {process.pid})")
292 |         t_start = time.perf_counter()
293 |         with logging_redirect_tqdm():
294 |             for image_file in tqdm(
295 |                 natsorted(image_files),
296 |                 desc=f"(PID:{process.pid})",
297 |                 position=worker_id,
298 |                 disable=disable_tqdm
299 |             ):
300 |                 status = self.perform_ocr(image_file)
301 |                 if status == Status.ERROR:
302 |                     LOGGER.info(f"{status.value} ('{image_file}')")
303 | 
304 |         t_finish = time.perf_counter()
305 |         t_total = (t_finish - t_start)
306 |         LOGGER.info(f"Process complete. (PID: {process.pid})")
307 |         return t_total
308 | 
309 |     def perform_ocr_batch(
310 |         self,
311 |         image_files: list,
312 |         workers: int = 1,
313 |         disable_tqdm: bool = None
314 |     ):
315 |         """Perform OCR on multiple files
316 | 
317 |         Parameters
318 |         ----------
319 |         image_files : list
320 |             List of paths to image files
321 |         workers : int, optional
322 |             Number of workers
323 |             The default is 1.
324 |         disable_tqdm : bool, optional
325 |             If True, the progress bars from :code:`tqdm` will be disabled.
326 |             The default is None.
327 |         """
328 |         image_files = natsorted(image_files)
329 |         file_count = len(image_files)
330 | 
331 |         t_start = time.perf_counter()
332 | 
333 |         workload, extra = divmod(file_count, workers)
334 |         if workers > 1:
335 |             print(f"Total {file_count} files "
336 |                   f"distributed among {workers} workers.")
337 |             print(f"Workload: {workload}-{workload + 1} per worker")
338 | 
339 |         worker_arguments = []
340 |         _start = 0
341 |         for idx in range(workers):
342 |             _workload = workload + (idx < extra)
343 |             worker_arguments.append({
344 |                 "worker_id": idx,
345 |                 "image_files": image_files[_start:_start+_workload],
346 |                 "disable_tqdm": disable_tqdm
347 |             })
348 |             _start = _start + _workload
349 | 
350 |         # ------------------------------------------------------------------- #
351 | 
352 |         mp.freeze_support()
353 |         tqdm.set_lock(mp.RLock())
354 |         with mp.Pool(
355 |             workers,
356 |             initializer=tqdm.set_lock,
357 |             initargs=(tqdm.get_lock(),)
358 |         ) as p:
359 |             t_workers = p.map(self._worker_ocr_batch, worker_arguments)
360 | 
361 |         # ------------------------------------------------------------------- #
362 | 
363 |         t_final = time.perf_counter()
364 |         t_total = t_final - t_start
365 |         tqdm.write(f"Total Time Taken: {t_total:.2f} seconds")
366 |         if workers > 1:
367 |             tqdm.write(f"Time Saved: {sum(t_workers) - t_total:.2f} seconds")
368 | 
369 | ###############################################################################
370 | 


--------------------------------------------------------------------------------