├── comicsocr
    ├── src
    │   ├── __init__.py
    │   ├── config.py
    │   ├── reader.py
    │   ├── tokenizer.py
    │   └── api.py
    ├── __main__.py
    └── __init__.py
├── setup.py
├── LICENSE
├── .gitignore
└── README.md


/comicsocr/src/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/comicsocr/__main__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
'''
Package entry point: hands control to the command-line runner of comicsocr.
'''
from comicsocr import run_main

run_main()
8 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open('README.md', 'r') as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setuptools.setup(
 7 |     name='comicsocr-largecats',  # Replace with your own username
 8 |     version='0.0.0',
 9 |     author='largecats',
10 |     author_email='linfanxiaolinda@outlook.com',
11 |     description=
12 |     'A tool for extracting script from comic pages using OCR engine Tesseract.',
13 |     long_description=long_description,
14 |     long_description_content_type='text/markdown',
15 |     url='https://github.com/largecats/comics-ocr',
16 |     packages=setuptools.find_packages(),
17 |     classifiers=[
18 |         'Programming Language :: Python',
19 |         'Programming Language :: Python :: 2',
20 |         'Programming Language :: Python :: 2.7',
21 |         'Programming Language :: Python :: 3',
22 |         'Programming Language :: Python :: 3.6',
23 |         'License :: OSI Approved :: MIT License',
24 |         'Operating System :: OS Independent',
25 |     ],
26 |     entry_points={
27 |         'console_scripts': ['comicsocr=comicsocr:run_main'],
28 |     })
29 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019-present largecats
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | .vscode/
  2 | .config/
  3 | 
  4 | # Byte-compiled / optimized / DLL files
  5 | __pycache__/
  6 | *.py[cod]
  7 | *$py.class
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | build/
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib/
 21 | lib64/
 22 | parts/
 23 | sdist/
 24 | var/
 25 | wheels/
 26 | *.egg-info/
 27 | .installed.cfg
 28 | *.egg
 29 | MANIFEST
 30 | 
 31 | # PyInstaller
 32 | #  Usually these files are written by a python script from a template
 33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 34 | *.manifest
 35 | *.spec
 36 | 
 37 | # Installer logs
 38 | pip-log.txt
 39 | pip-delete-this-directory.txt
 40 | 
 41 | # Unit test / coverage reports
 42 | htmlcov/
 43 | .tox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | 
 53 | # Translations
 54 | *.mo
 55 | *.pot
 56 | 
 57 | # Django stuff:
 58 | *.log
 59 | local_settings.py
 60 | db.sqlite3
 61 | 
 62 | # Flask stuff:
 63 | instance/
 64 | .webassets-cache
 65 | 
 66 | # Scrapy stuff:
 67 | .scrapy
 68 | 
 69 | # Sphinx documentation
 70 | docs/_build/
 71 | 
 72 | # PyBuilder
 73 | target/
 74 | 
 75 | # Jupyter Notebook
 76 | .ipynb_checkpoints
 77 | 
 78 | # pyenv
 79 | .python-version
 80 | 
 81 | # celery beat schedule file
 82 | celerybeat-schedule
 83 | 
 84 | # SageMath parsed files
 85 | *.sage.py
 86 | 
 87 | # Environments
 88 | .env
 89 | .venv
 90 | env/
 91 | venv/
 92 | ENV/
 93 | env.bak/
 94 | venv.bak/
 95 | 
 96 | # Spyder project settings
 97 | .spyderproject
 98 | .spyproject
 99 | 
100 | # Rope project settings
101 | .ropeproject
102 | 
103 | # mkdocs documentation
104 | /site
105 | 
106 | # mypy
107 | .mypy_cache/
108 | 


--------------------------------------------------------------------------------
/comicsocr/src/config.py:
--------------------------------------------------------------------------------
 1 | DEFAULT_CONFIG_SECTION = 'comicsocr'
 2 | 
 3 | 
 4 | class Config:
 5 |     '''
 6 |     Class for configurations.
 7 |     '''
 8 |     SIMPLE = 'simple'
 9 |     COMPLEX = 'complex'
10 | 
11 |     def __init__(self,
12 |                  speechBubbleSize={
13 |                      'width': [60,
14 |                                500],
15 |                      'height': [25,
16 |                                 500]
17 |                  },
18 |                  show=False,
19 |                  showWindowSize={'height': 768},
20 |                  charsAllowed=' -QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm,.?!1234567890"":;\'',
21 |                  method=None):
22 |         '''
23 |         Parameters
24 |         speechBubbleSize: dict
25 |             Height and width ranges for the speech bubbles.
26 |             Default to {'width': [60, 500],'height': [25, 500]}.
27 |         charsAllowed: string
28 |             Legitimate characters when reading from image.
29 |             Default to ' -QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm,.?!1234567890"":;\''.
30 |         method: string
31 |             Config.SIMPLE - recognizes only rectangular bubbles.
32 |             Config.COMPLEX - recognizes more complex bubble shapes.
33 |             Default to Config.SIMPLE.
34 |         show: boolean
35 |             If True, will show the image being processed with recognized contours.
36 |             Note: May not be available in Python's interactive terminal and may require special handling to show on Unix systems.
37 |             Default to False.
38 |         showWindowSize: dict
39 |             Size of the window when displaying the image being processed. 
40 |             E.g., {'height': 768} means scale the image to height 768 with the same aspect ratio. 
41 |             Default to {'height': 768}.
42 |         '''
43 |         self.speechBubbleSize = speechBubbleSize
44 |         self.charsAllowed = charsAllowed
45 |         self.method = method or Config.SIMPLE
46 |         self.show = show
47 |         self.showWindowSize = showWindowSize
48 | 


--------------------------------------------------------------------------------
/comicsocr/src/reader.py:
--------------------------------------------------------------------------------
 1 | import pytesseract
 2 | import cv2
 3 | import numpy as np
 4 | from matplotlib import pyplot as plt
 5 | import logging
 6 | import sys
 7 | 
 8 | from comicsocr.src.config import Config
 9 | from comicsocr.src.tokenizer import Tokenizer
10 | 
logger = logging.getLogger(__name__)
log_formatter = '[%(asctime)s] %(levelname)s [%(filename)s:%(lineno)s:%(funcName)s] %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_formatter)


class Reader:
    '''
    Optical character reader.
    '''
    def __init__(self, config=None):
        '''
        Parameters
        config: Config() object or None
            Configurations shared with the tokenizer. A fresh Config() is created when None,
            so that no single default instance is shared between readers.
        '''
        if config is None:
            config = Config()
        self.config = config
        self.tokenizer = Tokenizer(config=config)

    def denoise(self, image, n):
        '''
        Denoise the given image with n iterations.

        Parameters
        image: image array accepted by cv2.fastNlMeansDenoisingColored
        n: int
            Number of denoising passes. With n <= 0 the image is returned unchanged.

        Return: the denoised image.
        '''
        for _ in range(n):
            image = cv2.fastNlMeansDenoisingColored(image)

        return image

    def read(self, imagePath):
        '''
        Apply the ocr engine to the given image and return the extracted scripts where illegitimate characters are filtered out.

        Parameters
        imagePath: string
            Path to the comic page image.

        Return: list
            Strings of comic script extracted from the image.
        '''
        tokens = self.tokenizer.tokenize(imagePath=imagePath)
        charsAllowed = self.config.charsAllowed
        # NOTE(review): with a 1x1 kernel the dilate/erode passes below look like
        # identity operations - confirm before removing them.
        kernel = np.ones((1, 1), np.uint8)
        scripts = []
        for token in tokens:
            # enlarge
            token = cv2.resize(token, (0, 0), fx=2, fy=2)
            # denoise
            token = self.denoise(image=token, n=2)
            token = cv2.dilate(token, kernel, iterations=50)
            token = cv2.erode(token, kernel, iterations=50)
            # turn gray
            tokenGray = cv2.cvtColor(token, cv2.COLOR_BGR2GRAY)
            # Gaussian filter
            tokenGrayBlur = cv2.GaussianBlur(tokenGray, (5, 5), 0)
            # edge detection
            tokenGrayBlurLaplacian = cv2.Laplacian(tokenGrayBlur, cv2.CV_64F)
            # adjust contrast and brightness
            tokenGrayBlurLaplacian = np.uint8(np.clip((10 * tokenGrayBlurLaplacian + 10), 0, 255))
            script = pytesseract.image_to_string(tokenGrayBlurLaplacian, lang='eng')
            if len(script) == 0 or script.isspace():
                continue
            # remove illegitimate characters in a single pass (line breaks are not
            # in the default charsAllowed, so they are dropped as well)
            script = ''.join(char for char in script if char in charsAllowed)
            logger.info(repr(script))
            scripts.append(script)
        return scripts
75 | 


--------------------------------------------------------------------------------
/comicsocr/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import print_function  # for print() in Python 2
 3 | import os
 4 | import sys
 5 | import argparse
 6 | import configparser
 7 | import logging
 8 | import codecs
 9 | import json
10 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
11 | 
12 | import comicsocr.src.api as api
13 | 
14 | 
def main(argv):
    '''
    Main function that extracts comic scripts for the paths given on the command line.

    Each path is dispatched by kind: directories are walked, image files are read
    directly and archive files are extracted first. Paths that are none of these
    are skipped with a warning.

    Parameters
    argv: list
        List of arguments in sys.argv, including the first argument which is the script itself
        (get_arguments() drops it before parsing).
    '''
    args = get_arguments(argv)
    # Only forward config when one was given, so that the api defaults apply otherwise.
    options = {'outputPath': args['output_path']}
    if args['config']:
        options['config'] = args['config']
    # args['paths'] is None when --paths is omitted; treat that as "nothing to do".
    paths = args['paths'] or []
    if not paths:
        logging.getLogger(__name__).warning('No input paths given, nothing to read.')
    for path in paths:
        extension = os.path.splitext(path)[1]
        if os.path.isdir(path):
            api.read_from_directory(directory=path, **options)
        elif extension in api.IMAGE_EXTENSIONS:
            api.read_from_file(imagePath=path, **options)
        elif extension in api.ARCHIVE_EXTENSIONS:
            api.read_from_archive_file(path=path, **options)
        else:
            logging.getLogger(__name__).warning('Skipping unsupported path: %s', path)
45 | 
46 | 
def get_arguments(argv):
    '''
    Get arguments passed via command-line in dictionary.

    Parameters
    argv: list
        List of arguments in sys.argv, including the first argument which is the script itself.

    Returns: dict
        A dictionary with keys 'paths' (list of strings), 'output_path' (string or None)
        and 'config' (string or None).

    Raises: SystemExit
        Raised by argparse when the arguments are invalid, e.g. when --paths is missing.
    '''
    parser = argparse.ArgumentParser(description='Tool to extract scripts from comic pages.')

    # Without any path there is nothing to read, so let argparse report the
    # problem instead of failing later on a missing value.
    parser.add_argument('--paths',
                        type=str,
                        nargs='+',
                        required=True,
                        help='Paths to comic image files, archive files or directories containing comic image files. '
                        'Supported file formats (Windows and Unix): .jpg, .png, .bmp, .tiff. '
                        'Supported archive file formats (Unix only): .rar, .cbr, .zip.')

    parser.add_argument('--output-path',
                        type=str,
                        help='Path to write the comic scripts to. Recommended format is .csv.')

    parser.add_argument('--config', type=str, default=None, help="Configurations.")

    return vars(parser.parse_args(argv[1:]))
78 | 
79 | 
def run_main():
    '''
    Console-script hook (see entry_points in setup.py): runs main() on the process arguments.
    '''
    commandLine = sys.argv
    main(commandLine)


if __name__ == '__main__':
    run_main()
89 | 


--------------------------------------------------------------------------------
/comicsocr/src/tokenizer.py:
--------------------------------------------------------------------------------
 1 | import cv2
 2 | import numpy as np
 3 | from matplotlib import pyplot as plt
 4 | import os
 5 | import csv
 6 | import imutils
 7 | 
 8 | from comicsocr.src.config import Config
 9 | 
10 | 
class Tokenizer:
    '''
    Class for finding comic speech bubbles.
    '''
    def __init__(self, config=None):
        '''
        Parameters
        config: Config() object or None
            Configurations. The tokenizer reads speechBubbleSize, method, show and
            showWindowSize from it. A fresh Config() is created when None.
        '''
        self.config = Config() if config is None else config

    def tokenize(self, imagePath):
        '''
        Find all speech bubbles in the given comic image file.

        If config.show is True, the detected contour rectangles are drawn on a copy
        of the page and displayed until a key is pressed.
        Note: May not be available in Python's interactive terminal and may require special handling to show on Unix systems.

        Parameters
        imagePath: string
            Path to the comic page image.

        Return: list
            Cropped speech bubbles (with possible false positives).

        Raises: IOError
            If the image cannot be read.
        '''
        image = cv2.imread(imagePath)  # read image
        if image is None:
            # cv2.imread signals failure by returning None instead of raising
            raise IOError('Cannot read image file: ' + str(imagePath))
        imageGray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # gray scale
        imageGrayBlur = cv2.GaussianBlur(imageGray, (3, 3), 0)  # filter noise
        if self.config.method == Config.SIMPLE:
            # recognizes only rectangular bubbles
            binary = cv2.threshold(imageGrayBlur, 235, 255, cv2.THRESH_BINARY)[1]
        else:
            # recognizes more complex bubble shapes
            imageGrayBlurCanny = cv2.Canny(imageGrayBlur, 50, 500)
            binary = cv2.threshold(imageGrayBlurCanny, 235, 255, cv2.THRESH_BINARY)[1]
        # find contours
        contourResult = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
        contours = contourResult[1] if imutils.is_cv3() else contourResult[0]
        widthRange = self.config.speechBubbleSize['width']
        heightRange = self.config.speechBubbleSize['height']
        # Rectangles are drawn on a separate copy: the crops below are views into
        # `image`, so drawing on it would put green borders into the OCR input.
        preview = image.copy() if self.config.show else None
        # get the list of cropped speech bubbles
        croppedImageList = []
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            # filter out speech bubble candidates with unreasonable size
            if widthRange[0] <= w <= widthRange[1] and heightRange[0] <= h <= heightRange[1]:
                if preview is not None:
                    # add the contour rectangle detected in green color to the preview
                    cv2.rectangle(preview, (x, y), (w + x, h + y), (0, 255, 0), 2)
                croppedImageList.append(image[y:y + h, x:x + w])
        if preview is not None:
            # view all contour rectangles that are detected
            preview = Tokenizer.resize(image=preview,
                                       width=self.config.showWindowSize.get('width'),
                                       height=self.config.showWindowSize.get('height'))
            cv2.imshow("window", preview)
            cv2.waitKey(0)
            cv2.destroyAllWindows()

        return croppedImageList

    @staticmethod
    def resize(image, width=None, height=None):
        '''
        Scale the image to the given width or height, keeping the aspect ratio.

        Parameters
        image: image array with a .shape attribute whose first two entries are (height, width)
        width: int or None
            Target width. Takes precedence when both width and height are given.
        height: int or None
            Target height, used only when width is None.

        Return: the resized image, or the image itself when neither width nor height is given.
        '''
        if width is None and height is None:
            return image

        (h, w) = image.shape[:2]  # current height and width
        # float() keeps the ratio fractional on Python 2, where int / int truncates
        if width is None:  # resize by height
            ratio = float(height) / h
            dim = (int(w * ratio), height)
        else:  # resize by width
            ratio = float(width) / w
            dim = (width, int(h * ratio))
        return cv2.resize(image, dim)
92 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # comics-ocr
  2 | Tool for extracting script from comic pages using OCR engine Tesseract. Inspired by motion comic [Rewind's last message](https://www.youtube.com/watch?v=1LBFR90f6rg) (or alternative link [here](https://www.bilibili.com/video/av2786047)). Useful for making something like [page 18~19 of The Transformers: More than Meets the Eye #16](https://www.transformers.kiev.ua/index.php?pageid=idw) (or alternative link in Chinese [here](http://www.tfg2.com/read.php?tid-45122.html)). 
  3 | 
  4 | Supports image file formats `.jpg`, `.png`, `.bmp`, `.tiff` on Windows and Unix systems. Supports archive file formats `.rar`, `.cbr`, `.zip` on Unix systems. The OCR engine Tesseract that is used is not trained, but can be if needed.
  5 | 
  6 | - [comics-ocr](#comics-ocr)
  7 | - [Prerequisites](#prerequisites)
  8 | - [Installation](#installation)
  9 | - [Compatibility](#compatibility)
 10 | - [Usage](#usage)
 11 |   - [Using as command-line tool](#using-as-command-line-tool)
 12 |   - [Using as Python library](#using-as-python-library)
 13 |   - [Configurations](#configurations)
 14 | 
 15 | # Prerequisites
 16 | * [Tesseract](https://github.com/tesseract-ocr/tessdoc/blob/master/Home.md)
 17 | * [patool](https://github.com/wummel/patool)
 18 | * [opencv-python](https://pypi.org/project/opencv-python/)
 19 | 
 20 | # Installation
 21 | 
 22 | ```
 23 | python setup.py install
 24 | ```
 25 | 
 26 | # Compatibility
 27 | Supports Python 2.7 and 3.6+.
 28 | 
 29 | 
 30 | # Usage
 31 | See [here](https://largecats.github.io/blog/2019/06/20/ocr-with-comics/) for a more detailed example (using a simplified version of the tool).
 32 | ## Using as command-line tool
 33 | ```
 34 | usage: comicsocr [-h] [--paths PATHS [PATHS ...]] [--output-path OUTPUT_PATH] [--config CONFIG]
 35 | 
 36 | Tool to extract scripts from comic pages.
 37 | 
 38 | optional arguments:
 39 |   -h, --help            show this help message and exit
 40 |   --paths PATHS [PATHS ...]
 41 |                         Paths to comic image files, archive files or directories containing comic image files. Supported file formats (Windows and Unix):
 42 |                         .jpg, .png, .bmp, .tiff. Supported archive file formats (Unix only): .rar, .cbr, .zip.
 43 |   --output-path OUTPUT_PATH
 44 |                         Path to write the comic scripts to.
 45 |   --config CONFIG       Configurations.
 46 | ```
 47 | E.g.,
 48 | ```
 49 | [2020-07-20 22:47:58,252] INFO [api.py:54:read_from_file] Reading from file: C:\Users\largecats\Fun\programming\personal-projects\comics-ocr\test\test.jpg
 50 | [2020-07-20 22:47:59,299] INFO [reader.py:72:read] 'a ela a'
 51 | [2020-07-20 22:48:02,704] INFO [reader.py:72:read] 'THE LAW GAYS THISSORT OF THING HAS TOBE DECLARED ON-SITE.FORMALITIES.'
 52 | [2020-07-20 22:48:04,556] INFO [reader.py:72:read] "I DON'T UNDERSTAND WHYWE HAVE TO BE HERE. CAN'TWE FUST... PUSH A BUTTONAND BE DONE VUITH IT?"
 53 | [2020-07-20 22:48:05,359] INFO [reader.py:72:read] 'MINING OUTPOST C-12.'
 54 | [2020-07-20 22:48:06,166] INFO [reader.py:72:read] 'LONG AGO. PEACETIME.'
 55 | [2020-07-20 22:48:07,025] INFO [reader.py:72:read] 'THE CYBERTRON SYSTEM.Zs'
 56 | [2020-07-20 22:48:10,287] INFO [reader.py:72:read] 'Pinto d 3 ABO adieSoa an eee'
 57 | [2020-07-20 22:48:10,288] INFO [api.py:74:write_to_file] Writing to: C:\Users\largecats\Fun\programming\personal-projects\comics-ocr\test\result.txt
 58 | ```
 59 | 
 60 | ## Using as Python library
 61 | Call `api.read_from_file`, `api.read_from_archive_file`, or `api.read_from_directory` to read from a single image file, a single archive file, or a directory containing image files or archive files of images.
 62 | 
 63 | E.g.,
 64 | ```
 65 | >>> from comicsocr import api
 66 | >>> api.read_from_file(imagePath=r'C:\Users\largecats\Fun\programming\personal-projects\comics-ocr\test\test.jpg')
 67 | [2020-07-20 23:15:35,071] INFO [api.py:54:read_from_file] Reading from file: C:\Users\largecats\Fun\programming\personal-projects\comics-ocr\test\test.jpg
 68 | [2020-07-20 23:15:36,128] INFO [reader.py:72:read] 'a ela a'
 69 | [2020-07-20 23:15:39,436] INFO [reader.py:72:read] 'THE LAW GAYS THISSORT OF THING HAS TOBE DECLARED ON-SITE.FORMALITIES.'
 70 | [2020-07-20 23:15:41,286] INFO [reader.py:72:read] "I DON'T UNDERSTAND WHYWE HAVE TO BE HERE. CAN'TWE FUST... PUSH A BUTTONAND BE DONE VUITH IT?"
 71 | [2020-07-20 23:15:42,058] INFO [reader.py:72:read] 'MINING OUTPOST C-12.'
 72 | [2020-07-20 23:15:42,867] INFO [reader.py:72:read] 'LONG AGO. PEACETIME.'
 73 | [2020-07-20 23:15:43,761] INFO [reader.py:72:read] 'THE CYBERTRON SYSTEM.Zs'
 74 | [2020-07-20 23:15:47,045] INFO [reader.py:72:read] 'Pinto d 3 ABO adieSoa an eee'
 75 | ['a ela a', 'THE LAW GAYS THISSORT OF THING HAS TOBE DECLARED ON-SITE.FORMALITIES.', "I DON'T UNDERSTAND WHYWE HAVE TO BE HERE. CAN'TWE FUST... PUSH A BUTTONAND BE DONE VUITH IT?", 'MINING OUTPOST C-12.', 'LONG AGO. PEACETIME.', 'THE CYBERTRON SYSTEM.Zs', 'Pinto d 3 ABO adieSoa an eee']
 76 | ```
 77 | 
 78 | ## Configurations
 79 | ```
 80 | Help on class Config in module comicsocr.src.config:
 81 | 
 82 | class Config(builtins.object)
 83 |  |  Class for configurations.
 84 |  |
 85 |  |  Methods defined here:
 86 |  |
 87 |  |  __init__(self, speechBubbleSize={'width': [60, 500], 'height': [25, 500]}, show=False, showWindowSize={'height': 768}, charsAllowed=' -QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm,.?!1234567890"":;\'', method=None)
 88 |  |      Parameters
 89 |  |      speechBubbleSize: dict
 90 |  |          Height and width ranges for the speech bubbles.
 91 |  |          Default to {'width': [60, 500],'height': [25, 500]}.
 92 |  |      charsAllowed: string
 93 |  |          Legitimate characters when reading from image.
 94 |  |          Default to ' -QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm,.?!1234567890"":;''.
 95 |  |      method: string
 96 |  |          Config.SIMPLE - recognizes only rectangular bubbles.
 97 |  |          Config.COMPLEX - recognizes more complex bubble shapes.
 98 |  |          Default to Config.SIMPLE.
 99 |  |      show: boolean
100 |  |          If True, will show the image being processed with recognized contours.
101 |  |          Note: This feature may require special handling on Unix systems.
102 |  |          Default to False.
103 |  |      showWindowSize: dict
104 |  |          Size of the window when displaying the image being processed.
105 |  |          E.g., {'height': 768} means scale the image to height 768 with the same aspect ratio.
106 |  |          Default to {'height': 768}.
107 | ```


--------------------------------------------------------------------------------
/comicsocr/src/api.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import os
  3 | import csv
  4 | import configparser
  5 | import ast
  6 | import patoolib
  7 | import logging
  8 | import subprocess
  9 | import ntpath
 10 | 
 11 | from comicsocr.src.config import Config, DEFAULT_CONFIG_SECTION
 12 | from comicsocr.src.tokenizer import Tokenizer
 13 | from comicsocr.src.reader import Reader
 14 | 
 15 | logger = logging.getLogger(__name__)
 16 | log_formatter = '[%(asctime)s] %(levelname)s [%(filename)s:%(lineno)s:%(funcName)s] %(message)s'
 17 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_formatter)
 18 | 
 19 | IMAGE_EXTENSIONS = ['.jpg', '.png', '.bmp', '.tiff']
 20 | ARCHIVE_EXTENSIONS = ['.rar', '.cbr', '.zip']
 21 | 
 22 | 
 23 | def read_from_file(imagePath, outputPath=None, config=Config()):
 24 |     '''
 25 |     Extract script from given comic image file.
 26 | 
 27 |     Parameters
 28 |     imagePath: string
 29 |         Path to the image file. 
 30 |         Supported formats: ['.jpg', '.png', '.bmp', '.tiff']
 31 |     outputPath: string or None
 32 |         If not None, will write the extracted script to this path.
 33 |         Else, will return the extracted script.
 34 |     config: comicsocr.src.config.Config() object
 35 |         Configurations.
 36 |     
 37 |     Return: None or list
 38 |         If outputPath is not None, return the list of extracted scripts.
 39 |     '''
 40 |     fileName, fileExten = os.path.splitext(imagePath)
 41 |     if fileExten not in IMAGE_EXTENSIONS:
 42 |         raise Exception('Unsupported image file format: ' + fileExten)
 43 |     if type(config) == type(Config()):  # config is a Config() object
 44 |         reader = Reader(config=config)
 45 |     else:  # create Config() object from config
 46 |         if type(config) == str:
 47 |             if config.startswith('{'):  # config is a dictionary in string
 48 |                 config = eval(config)
 49 |                 reader = Reader(config=_create_config_from_dict(configDict=config))
 50 |         elif type(config) == dict:  # config is a dictionary
 51 |             reader = Reader(config=_create_config_from_dict(configDict=config))
 52 |         else:
 53 |             raise Exception('Unsupported config type')
 54 |     logger.info('Reading from file: ' + imagePath)
 55 |     script = reader.read(imagePath=imagePath)
 56 |     if outputPath:
 57 |         write_to_file(imagePath=imagePath, script=script, outputPath=outputPath)
 58 |     else:
 59 |         return script
 60 | 
 61 | 
 62 | def write_to_file(imagePath, script, outputPath):
 63 |     '''
 64 |     Write file path and extracted comic script to given output path.
 65 | 
 66 |     Parameters
 67 |     imagePath: string
 68 |         Path to the image file. Recommended format is .csv.
 69 |     script: list
 70 |         List of extract scripts.
 71 |     outputPath: string
 72 |         Path to write to.
 73 |     '''
 74 |     logger.info('Writing to: ' + outputPath)
 75 |     with open(outputPath, 'a', encoding="utf-8", newline="") as f:
 76 |         writer = csv.writer(f)
 77 |         for line in script:
 78 |             newRow = [imagePath, line]
 79 |             writer.writerow(newRow)
 80 | 
 81 | 
 82 | def read_from_archive_file(path, outputPath=None, config=Config()):
 83 |     '''
 84 |     Extract script from all image files in given archive file.
 85 | 
 86 |     Paramters
 87 |     path: string
 88 |         Path to the archive file.
 89 |         Supported formats: ['.rar', '.cbr', '.zip']
 90 |     outputPath: string or None
 91 |         If not None, will write the extracted script to this path.
 92 |         Else, will return the extracted script.
 93 |     config: comicsocr.src.config.Config() object
 94 |         Configurations.
 95 |     
 96 |     Return: None or dict
 97 |         If outputPath is not None, return a dictionary of comic page paths and scripts extracted from each page.
 98 |     '''
 99 |     fileName, fileExten = os.path.splitext(path)
100 |     if fileExten not in ARCHIVE_EXTENSIONS:
101 |         raise Exception('Unsupported archive file format: ' + fileExten)
102 |     logger.info('Reading from archive file: ' + path)
103 |     patoolib.test_archive(path, verbosity=1)  # test integrity of archive
104 |     parentDir = os.path.dirname(path)
105 |     tempDir = os.path.join(parentDir, 'tmp')
106 |     logger.info('Extracting from ' + path + ' to ' + tempDir)
107 |     patoolib.list_archive(path)
108 |     subprocess.call('mkdir -p "%s"' % tempDir, shell=True)  # create temporary directory
109 |     patoolib.extract_archive(path, outdir=tempDir)  # extract archive files to temporary directory
110 |     results = read_from_directory(directory=tempDir, config=config)
111 |     logger.info('Removing temporary directory: ' + tempDir)
112 |     subprocess.call('rm -rf "%s"' % tempDir, shell=True)  # remove temporary directory
113 |     if outputPath:
114 |         for imageTempPath, script in results.items():
115 |             imagePath = path + '/' + _get_file_name(path=imageTempPath)
116 |             write_to_file(imagePath=imagePath, script=script, outputPath=outputPath)
117 |     else:
118 |         return results
119 | 
120 | 
def _get_file_name(path):
    '''
    Return the last component of the given path, ignoring a trailing separator.

    Both '/' and '\\' are treated as separators (ntpath rules).

    Parameters
    path: string
        Path to file.
    '''
    # from https://stackoverflow.com/questions/8384737/extract-file-name-from-path-no-matter-what-the-os-path-format
    directory, baseName = ntpath.split(path)
    if baseName:
        return baseName
    # path ended with a separator: the name is the last component of what precedes it
    return ntpath.basename(directory)
132 | 
133 | 
def read_from_directory(directory, outputPath=None, config=Config()):
    '''
    Read script from all image files or archive files of images in given directory recursively.

    Parameters
    directory: string
        Directory to read from.
    outputPath: string or None
        If not None, will write the extracted script to this path.
        Else, will return the extracted script.
    config: comicsocr.src.config.Config() object
        Configurations.

    Return: None or dict
        If outputPath is None, return a dictionary of comic page paths and scripts extracted from each page.
    '''
    logger.info('Reading from directory: ' + directory)
    results = {}
    for subDir, _dirs, files in os.walk(directory):
        for file in files:
            fileExten = os.path.splitext(file)[1]
            filePath = os.path.join(subDir, file)
            if fileExten in IMAGE_EXTENSIONS:
                # Collect the script here and write once at the end. Passing outputPath
                # down would make read_from_file return None and break the final write.
                results[filePath] = read_from_file(imagePath=filePath, config=config)
            elif fileExten in ARCHIVE_EXTENSIONS:
                # With an outputPath the archive reader writes its own rows and returns
                # None; without one it returns a dictionary to merge.
                archiveFileResults = read_from_archive_file(path=filePath, outputPath=outputPath, config=config)
                if archiveFileResults:
                    results.update(archiveFileResults)
    if outputPath:
        for imagePath, script in results.items():
            write_to_file(imagePath=imagePath, script=script, outputPath=outputPath)
    else:
        return results
167 | 
168 | 
def _create_config_from_dict(configDict, defaultConfigSection=DEFAULT_CONFIG_SECTION):
    '''
    Create Config() object from dictionary.

    Parameters
    configDict: dict
        A dictionary of configurations specified in key-value pairs, optionally
        already nested under the top-level section name.
    defaultConfigSection: string
        The top-level config section that needs to be added on top of the configDict before feeding to configParser.read_dict(), default to DEFAULT_CONFIG_SECTION.

    Return: comicsocr.src.config.Config() object
        The Config() object created from configDict.
    '''
    # Interpolation is disabled so that values containing '%' (e.g. in charsAllowed)
    # are taken literally instead of being rejected as invalid interpolation syntax.
    configParser = configparser.ConfigParser(interpolation=None)
    configParser.optionxform = str  # makes the parser case-sensitive
    if defaultConfigSection not in configDict:
        configDict = {defaultConfigSection: configDict}  # add top-level section
    configParser.read_dict(configDict)  # configParser assumes the existence of a top-level section
    args = _parse_args_in_correct_type(configParser, defaultConfigSection)
    return Config(**args)
190 | 
191 | 
def _parse_args_in_correct_type(configParser, defaultConfigSection=DEFAULT_CONFIG_SECTION):
    '''
    Parse parameters in config, converting the string values stored by configParser back to Python values.

    Values that are Python literals (booleans, numbers, lists, dictionaries, quoted
    strings) are converted with ast.literal_eval. Any other value, e.g. a plain
    string such as complex, is kept as the string it is.

    Parameters
    configParser: a configparser.ConfigParser() object
        Parser holding the configurations.
    defaultConfigSection: string
        The section of configParser to read the configurations from, default to DEFAULT_CONFIG_SECTION.

    Return: dict
        A dictionary of configuration key-value pairs where values are of correct type.
    '''
    args = {}
    section = configParser[defaultConfigSection]
    for key in section:
        rawValue = section[key]
        try:
            args[key] = ast.literal_eval(rawValue)
        except (ValueError, SyntaxError):
            # configParser stores str(value), so a string option arrives without
            # quotes and is not a valid literal; use it unchanged.
            args[key] = rawValue
    return args
209 | 


--------------------------------------------------------------------------------