├── comicsocr ├── src │ ├── __init__.py │ ├── config.py │ ├── reader.py │ ├── tokenizer.py │ └── api.py ├── __main__.py └── __init__.py ├── setup.py ├── LICENSE ├── .gitignore └── README.md /comicsocr/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /comicsocr/__main__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Entry point for command-line tool. 4 | ''' 5 | import comicsocr 6 | 7 | comicsocr.run_main() 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('README.md', 'r') as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name='comicsocr-largecats', # Replace with your own username 8 | version='0.0.0', 9 | author='largecats', 10 | author_email='linfanxiaolinda@outlook.com', 11 | description= 12 | 'A tool for extracting script from comic pages using OCR engine Tesseract.', 13 | long_description=long_description, 14 | long_description_content_type='text/markdown', 15 | url='https://github.com/largecats/comics-ocr', 16 | packages=setuptools.find_packages(), 17 | classifiers=[ 18 | 'Programming Language :: Python', 19 | 'Programming Language :: Python :: 2', 20 | 'Programming Language :: Python :: 2.7', 21 | 'Programming Language :: Python :: 3', 22 | 'Programming Language :: Python :: 3.6', 23 | 'License :: OSI Approved :: MIT License', 24 | 'Operating System :: OS Independent', 25 | ], 26 | entry_points={ 27 | 'console_scripts': ['comicsocr=comicsocr:run_main'], 28 | }) 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | 
Copyright (c) 2019-present largecats 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .config/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
class Config:
    '''
    Class for configurations shared by the tokenizer and the reader.

    Every constructor parameter is stored unchanged on the instance under the same name,
    except `method`, which falls back to Config.SIMPLE when it is None (or otherwise falsy).
    '''
    SIMPLE = 'simple'
    COMPLEX = 'complex'

    def __init__(self,
                 speechBubbleSize=None,
                 show=False,
                 showWindowSize=None,
                 charsAllowed=' -QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm,.?!1234567890"":;\'',
                 method=None):
        '''
        Parameters
        speechBubbleSize: dict or None
            Height and width ranges for the speech bubbles.
            Default to {'width': [60, 500],'height': [25, 500]}.
        show: boolean
            If True, will show the image being processed with recognized contours.
            Note: May not be available in Python's interactive terminal and may require special handling to show on Unix systems.
            Default to False.
        showWindowSize: dict or None
            Size of the window when displaying the image being processed.
            E.g., {'height': 768} means scale the image to height 768 with the same aspect ratio.
            Default to {'height': 768}.
        charsAllowed: string
            Legitimate characters when reading from image.
            Default to space, hyphen, ASCII letters, digits and the punctuation shown in the signature.
        method: string
            Config.SIMPLE - recognizes only rectangular bubbles.
            Config.COMPLEX - recognizes more complex bubble shapes.
            Default to Config.SIMPLE.
        '''
        # The dict defaults are built per call: a dict literal in the signature would be
        # created once and then shared (and mutated) by every Config instance.
        if speechBubbleSize is None:
            speechBubbleSize = {'width': [60, 500], 'height': [25, 500]}
        if showWindowSize is None:
            showWindowSize = {'height': 768}
        self.speechBubbleSize = speechBubbleSize
        self.charsAllowed = charsAllowed
        self.method = method or Config.SIMPLE
        self.show = show
        self.showWindowSize = showWindowSize
class Reader:
    '''
    Optical character reader.

    Crops candidate speech bubbles out of a comic page with a Tokenizer, runs Tesseract
    (through pytesseract) on each crop and filters the recognised text.
    '''
    def __init__(self, config=None):
        '''
        Parameters
        config: Config() object or None
            Configurations. If None, a fresh Config() with default values is created
            (a `Config()` default in the signature would be one instance shared by every Reader).
        '''
        self.config = Config() if config is None else config
        self.tokenizer = Tokenizer(config=self.config)

    def denoise(self, image, n):
        '''
        Denoise the given image with n iterations.

        Parameters
        image: image array
            Colour image as accepted by cv2.fastNlMeansDenoisingColored.
        n: int
            Number of denoising passes; the image is returned untouched when n is 0.

        Return: the denoised image.
        '''
        for _ in range(n):
            image = cv2.fastNlMeansDenoisingColored(image)

        return image

    def _preprocess(self, token):
        '''
        Turn one cropped speech bubble into the edge image that is handed to the OCR engine.
        '''
        # enlarge
        token = cv2.resize(token, (0, 0), fx=2, fy=2)
        # denoise
        token = self.denoise(image=token, n=2)
        # NOTE(review): with a 1x1 structuring element dilate/erode look like no-ops,
        # so these 50 + 50 iterations may be pure overhead - confirm before removing.
        kernel = np.ones((1, 1), np.uint8)
        token = cv2.dilate(token, kernel, iterations=50)
        token = cv2.erode(token, kernel, iterations=50)
        # turn gray
        tokenGray = cv2.cvtColor(token, cv2.COLOR_BGR2GRAY)
        # Gaussian filter
        tokenGrayBlur = cv2.GaussianBlur(tokenGray, (5, 5), 0)
        # edge detection
        tokenGrayBlurLaplacian = cv2.Laplacian(tokenGrayBlur, cv2.CV_64F)
        # adjust contrast and brightness, then clamp back into the 8-bit range
        return np.uint8(np.clip((10 * tokenGrayBlurLaplacian + 10), 0, 255))

    def _filter_script(self, script):
        '''
        Return `script` without the characters that are not in self.config.charsAllowed.

        Any character missing from charsAllowed is dropped, which includes line breaks
        unless they are listed there explicitly.
        '''
        allowed = self.config.charsAllowed
        # single pass instead of one str.replace() scan per illegitimate character
        return ''.join(char for char in script if char in allowed)

    def read(self, imagePath):
        '''
        Apply the ocr engine to the given image and return the extracted scripts where illegitimate characters are filtered out.

        Parameters
        imagePath: string
            Path to the comic page image.

        Return: list
            Strings of comic script extracted from the image, one per speech bubble candidate
            that produced any legitimate, non-whitespace text. Each kept string is also logged.
        '''
        scripts = []
        for token in self.tokenizer.tokenize(imagePath=imagePath):
            script = pytesseract.image_to_string(self._preprocess(token), lang='eng')
            if len(script) == 0 or script.isspace():
                continue
            script = self._filter_script(script)
            if not script.strip():
                # nothing legitimate survived the filter - do not emit an empty entry
                continue
            logger.info(repr(script))
            scripts.append(script)
        return scripts
def main(argv):
    '''
    Main function that enables extracting comic scripts from command-line.

    Each given path is dispatched to the matching api function: directories to
    api.read_from_directory, image files to api.read_from_file and archive files to
    api.read_from_archive_file. Paths that are none of these are skipped with a warning.
    The values returned by the api functions are not used here, so without an output
    path the extracted scripts are only visible through the log.

    Parameters
    argv: list
        List of arguments in sys.argv, including the first argument which is the script itself
        (get_arguments() drops it before parsing).
    '''
    args = get_arguments(argv)
    outputPath = args['output_path']
    # Forward `config` only when one was supplied so that the api functions fall back to their own default.
    optional = {'config': args['config']} if args['config'] else {}
    # `paths` is None when --paths was not given; treat that as "nothing to do" instead of crashing.
    for path in args['paths'] or []:
        extension = os.path.splitext(path)[1]
        if os.path.isdir(path):
            api.read_from_directory(directory=path, outputPath=outputPath, **optional)
        elif extension in api.IMAGE_EXTENSIONS:
            api.read_from_file(imagePath=path, outputPath=outputPath, **optional)
        elif extension in api.ARCHIVE_EXTENSIONS:
            api.read_from_archive_file(path=path, outputPath=outputPath, **optional)
        else:
            logging.getLogger(__name__).warning('Skipping unsupported path: %s', path)
def get_arguments(argv):
    '''
    Get arguments passed via command-line in dictionary.

    Parameters:
    argv: list
        List of arguments in sys.argv, including the first argument which is the script itself.

    Returns: dict
        A dictionary with the keys 'paths' (list of strings), 'output_path' (string or None)
        and 'config' (string or None).

    Raises: SystemExit
        Raised by argparse when the arguments are invalid, e.g. when --paths is missing.
    '''
    parser = argparse.ArgumentParser(description='Tool to extract scripts from comic pages.')

    # Required: without at least one path there is nothing to read, and failing here gives
    # a usage message instead of an error further down the line.
    parser.add_argument('--paths',
                        type=str,
                        nargs='+',
                        required=True,
                        help='''
        Paths to comic image files, archive files or directories containing comic image files.
        Supported file formats (Windows and Unix): .jpg, .png, .bmp, .tiff.
        Supported archive file formats (Unix only): .rar, .cbr, .zip.
        ''')

    parser.add_argument('--output-path',
                        type=str,
                        help='Path to write the comic scripts to. Recommended format is .csv.')

    parser.add_argument('--config', type=str, default=None, help="Configurations.")

    # argv[0] is the program name, which argparse must not see as an argument
    return vars(parser.parse_args(argv[1:]))


def run_main():
    '''
    Entry point for console_scripts in setup.py
    '''
    main(sys.argv)


if __name__ == '__main__':
    run_main()
class Tokenizer:
    '''
    Class for finding comic speech bubbles.
    '''
    def __init__(self, config=None):
        '''
        Parameters
        config: Config() object or None
            Configurations. The tokenizer reads `method`, `speechBubbleSize`, `show` and
            `showWindowSize` from it. If None, a fresh Config() with default values is created.
        '''
        self.config = Config() if config is None else config

    def tokenize(self, imagePath):
        '''
        Find all speech bubbles in the given comic image file.

        If self.config.show is True, the detected contour rectangles are displayed in a
        window and the call blocks until a key is pressed.

        Parameters
        imagePath: string
            Path to the comic page image.

        Return: list
            Cropped speech bubbles (with possible false positives).

        Raises: IOError
            If the image cannot be read from imagePath.
        '''
        image = cv2.imread(imagePath)  # read image
        if image is None:
            # cv2.imread signals failure by returning None rather than raising
            raise IOError('Unable to read image file: ' + imagePath)
        imageGray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # gray scale
        imageGrayBlur = cv2.GaussianBlur(imageGray, (3, 3), 0)  # filter noise
        if self.config.method == Config.SIMPLE:
            # recognizes only rectangular bubbles
            binary = cv2.threshold(imageGrayBlur, 235, 255, cv2.THRESH_BINARY)[1]
        else:
            # recognizes more complex bubble shapes
            imageGrayBlurCanny = cv2.Canny(imageGrayBlur, 50, 500)
            binary = cv2.threshold(imageGrayBlurCanny, 235, 255, cv2.THRESH_BINARY)[1]
        # find contours; the position of the contour list in the result depends on the OpenCV version
        contourResult = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
        contours = contourResult[1] if imutils.is_cv3() else contourResult[0]

        widthRange = self.config.speechBubbleSize['width']
        heightRange = self.config.speechBubbleSize['height']
        # Rectangles are drawn on a separate copy: the crops below are views into `image`,
        # so drawing on `image` itself would paint green borders into the returned bubbles.
        preview = image.copy() if self.config.show else None
        # get the list of cropped speech bubbles
        croppedImageList = []
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            # filter out speech bubble candidates with unreasonable size
            if not (widthRange[0] <= w <= widthRange[1] and heightRange[0] <= h <= heightRange[1]):
                continue
            if preview is not None:
                # add the contour rectangle detected in green color to the preview
                cv2.rectangle(preview, (x, y), (w + x, h + y), (0, 255, 0), 2)
            croppedImageList.append(image[y:y + h, x:x + w])
        if preview is not None:
            # view all contour rectangles that are detected
            preview = Tokenizer.resize(image=preview,
                                       width=self.config.showWindowSize.get('width'),
                                       height=self.config.showWindowSize.get('height'))
            cv2.imshow("window", preview)
            cv2.waitKey(0)
            cv2.destroyAllWindows()

        return croppedImageList

    @staticmethod
    def resize(image, width=None, height=None):
        '''
        Scale the image to the given width or height, keeping the aspect ratio.

        Parameters
        image: image array
            Image to scale; only image.shape[:2] (height, width) is inspected here.
        width: int or None
            Target width. Takes precedence over height when both are given.
        height: int or None
            Target height, used only when width is None.

        Return: the scaled image, or the input image itself when both width and height are None.
        '''
        (h, w) = image.shape[:2]  # current height and width

        if width is None and height is None:
            return image
        if width is None:  # resize by height
            ratio = height / float(h)  # float() keeps true division under Python 2 as well
            dim = (int(w * ratio), height)
        else:  # resize by width
            ratio = width / float(w)
            dim = (width, int(h * ratio))
        return cv2.resize(image, dim)
5 | 6 | - [comics-ocr](#comics-ocr) 7 | - [Prerequisites](#prerequisites) 8 | - [Installation](#installation) 9 | - [Compatibility](#compatibility) 10 | - [Usage](#usage) 11 | - [Using as command-line tool](#using-as-command-line-tool) 12 | - [Using as Python library](#using-as-python-library) 13 | - [Configurations](#configurations) 14 | 15 | # Prerequisites 16 | * [Tesseract](https://github.com/tesseract-ocr/tessdoc/blob/master/Home.md) 17 | * [patool](https://github.com/wummel/patool) 18 | * [opencv-python](https://pypi.org/project/opencv-python/) 19 | 20 | # Installation 21 | 22 | ``` 23 | python setup.py install 24 | ``` 25 | 26 | # Compatibility 27 | Supports Python 2.7 and 3.6+. 28 | 29 | 30 | # Usage 31 | See [here](https://largecats.github.io/blog/2019/06/20/ocr-with-comics/) for more detailed example (using a simplified version of the tool). 32 | ## Using as command-line tool 33 | ``` 34 | usage: comicsocr [-h] [--paths PATHS [PATHS ...]] [--output-path OUTPUT_PATH] [--config CONFIG] 35 | 36 | Tool to extract scripts from comic pages. 37 | 38 | optional arguments: 39 | -h, --help show this help message and exit 40 | --paths PATHS [PATHS ...] 41 | Paths to comic image files, archive files or directories containing comic image files. Supported file formats (Windows and Unix): 42 | .jpg, .png, .bmp, .tiff. Supported archive file formats (Unix only): .rar, .cbr, .zip. 43 | --output-path OUTPUT_PATH 44 | Path to write the comic scripts to. 45 | --config CONFIG Configurations. 46 | ``` 47 | E.g., 48 | ``` 49 | [2020-07-20 22:47:58,252] INFO [api.py:54:read_from_file] Reading from file: C:\Users\largecats\Fun\programming\personal-projects\comics-ocr\test\test.jpg 50 | [2020-07-20 22:47:59,299] INFO [reader.py:72:read] 'a ela a' 51 | [2020-07-20 22:48:02,704] INFO [reader.py:72:read] 'THE LAW GAYS THISSORT OF THING HAS TOBE DECLARED ON-SITE.FORMALITIES.' 52 | [2020-07-20 22:48:04,556] INFO [reader.py:72:read] "I DON'T UNDERSTAND WHYWE HAVE TO BE HERE. 
CAN'TWE FUST... PUSH A BUTTONAND BE DONE VUITH IT?" 53 | [2020-07-20 22:48:05,359] INFO [reader.py:72:read] 'MINING OUTPOST C-12.' 54 | [2020-07-20 22:48:06,166] INFO [reader.py:72:read] 'LONG AGO. PEACETIME.' 55 | [2020-07-20 22:48:07,025] INFO [reader.py:72:read] 'THE CYBERTRON SYSTEM.Zs' 56 | [2020-07-20 22:48:10,287] INFO [reader.py:72:read] 'Pinto d 3 ABO adieSoa an eee' 57 | [2020-07-20 22:48:10,288] INFO [api.py:74:write_to_file] Writing to: C:\Users\largecats\Fun\programming\personal-projects\comics-ocr\test\result.txt 58 | ``` 59 | 60 | ## Using as Python library 61 | Call `api.read_from_file`, `api.read_from_archive_file`, or `api.read_from_directory` to read from a single image file, a single archive file, or a directory containing image files or archive files of images. 62 | 63 | E.g., 64 | ``` 65 | >>> from comicsocr import api 66 | >>> api.read_from_file(imagePath=r'C:\Users\largecats\Fun\programming\personal-projects\comics-ocr\test\test.jpg') 67 | [2020-07-20 23:15:35,071] INFO [api.py:54:read_from_file] Reading from file: C:\Users\largecats\Fun\programming\personal-projects\comics-ocr\test\test.jpg 68 | [2020-07-20 23:15:36,128] INFO [reader.py:72:read] 'a ela a' 69 | [2020-07-20 23:15:39,436] INFO [reader.py:72:read] 'THE LAW GAYS THISSORT OF THING HAS TOBE DECLARED ON-SITE.FORMALITIES.' 70 | [2020-07-20 23:15:41,286] INFO [reader.py:72:read] "I DON'T UNDERSTAND WHYWE HAVE TO BE HERE. CAN'TWE FUST... PUSH A BUTTONAND BE DONE VUITH IT?" 71 | [2020-07-20 23:15:42,058] INFO [reader.py:72:read] 'MINING OUTPOST C-12.' 72 | [2020-07-20 23:15:42,867] INFO [reader.py:72:read] 'LONG AGO. PEACETIME.' 73 | [2020-07-20 23:15:43,761] INFO [reader.py:72:read] 'THE CYBERTRON SYSTEM.Zs' 74 | [2020-07-20 23:15:47,045] INFO [reader.py:72:read] 'Pinto d 3 ABO adieSoa an eee' 75 | ['a ela a', 'THE LAW GAYS THISSORT OF THING HAS TOBE DECLARED ON-SITE.FORMALITIES.', "I DON'T UNDERSTAND WHYWE HAVE TO BE HERE. CAN'TWE FUST... 
PUSH A BUTTONAND BE DONE VUITH IT?", 'MINING OUTPOST C-12.', 'LONG AGO. PEACETIME.', 'THE CYBERTRON SYSTEM.Zs', 'Pinto d 3 ABO adieSoa an eee'] 76 | ``` 77 | 78 | ## Configurations 79 | ``` 80 | Help on class Config in module comicsocr.src.config: 81 | 82 | class Config(builtins.object) 83 | | Class for configurations. 84 | | 85 | | Methods defined here: 86 | | 87 | | __init__(self, speechBubbleSize={'width': [60, 500], 'height': [25, 500]}, show=False, showWindowSize={'height': 768}, charsAllowed=' -QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm,.?!1234567890"":;\'', method=None) 88 | | Parameters 89 | | speechBubbleSize: dict 90 | | Height and width ranges for the speech bubbles. 91 | | Default to {'width': [60, 500],'height': [25, 500]}. 92 | | charsAllowed: string 93 | | Legitimate characters when reading from image. 94 | | Default to ' -QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm,.?!1234567890"":;''. 95 | | method: string 96 | | Config.SIMPLE - recognizes only rectangular bubbles. 97 | | Config.COMPLEX - recognizes more complex bubble shapes. 98 | | Default to Config.SIMPLE. 99 | | show: boolean 100 | | If True, will show the image being processed with recognized contours. 101 | | Note: This feature may require special handling on Unix systems. 102 | | Default to False. 103 | | showWindowSize: dict 104 | | Size of the window when displaying the image being processed. 105 | | E.g., {'height': 768} means scale the image to height 768 with the same aspect ratio. 106 | | Default to {'height': 768}. 
def read_from_file(imagePath, outputPath=None, config=None):
    '''
    Extract script from given comic image file.

    Parameters
    imagePath: string
        Path to the image file.
        Supported formats: ['.jpg', '.png', '.bmp', '.tiff']
    outputPath: string or None
        If not None, will write the extracted script to this path.
        Else, will return the extracted script.
    config: comicsocr.src.config.Config() object, dict, string or None
        Configurations. A dict holds Config() keyword arguments; a string must be the
        literal form of such a dict, e.g. "{'show': True}". None means default configurations.

    Return: None or list
        If outputPath is None (or empty), return the list of extracted scripts.
        Otherwise the scripts are written to outputPath and None is returned.

    Raises: Exception
        If the file extension is not supported or config cannot be interpreted.
    '''
    fileExten = os.path.splitext(imagePath)[1]
    if fileExten not in IMAGE_EXTENSIONS:
        raise Exception('Unsupported image file format: ' + fileExten)
    reader = Reader(config=_resolve_config(config))
    logger.info('Reading from file: ' + imagePath)
    script = reader.read(imagePath=imagePath)
    if outputPath:
        write_to_file(imagePath=imagePath, script=script, outputPath=outputPath)
    else:
        return script


def _resolve_config(config):
    '''
    Turn the `config` argument accepted by read_from_file() into a Config() object.
    '''
    if config is None:
        # built per call instead of a `Config()` default in the signature, which would be
        # a single instance shared by every call
        return Config()
    if isinstance(config, Config):
        return config
    if isinstance(config, str):
        text = config.strip()
        if not text.startswith('{'):
            raise Exception('Unsupported config string, expected a dictionary literal: ' + config)
        try:
            # The string may come straight from the command line: parse it as a literal
            # only, never evaluate it as arbitrary code.
            config = ast.literal_eval(text)
        except (ValueError, SyntaxError) as err:
            raise Exception('Invalid config string %r: %s' % (text, err))
    if isinstance(config, dict):
        return _create_config_from_dict(configDict=config)
    raise Exception('Unsupported config type')


def write_to_file(imagePath, script, outputPath):
    '''
    Write file path and extracted comic script to given output path.

    One csv row [imagePath, line] is appended per script line; an existing file is
    extended, not overwritten.

    Parameters
    imagePath: string
        Path to the image file, written as the first column of every row.
    script: list
        List of extracted scripts.
    outputPath: string
        Path to write to. Recommended format is .csv.
    '''
    logger.info('Writing to: ' + outputPath)
    # newline="" lets the csv module control line endings itself
    with open(outputPath, 'a', encoding="utf-8", newline="") as f:
        csv.writer(f).writerows([imagePath, line] for line in script)
93 | config: comicsocr.src.config.Config() object 94 | Configurations. 95 | 96 | Return: None or dict 97 | If outputPath is not None, return a dictionary of comic page paths and scripts extracted from each page. 98 | ''' 99 | fileName, fileExten = os.path.splitext(path) 100 | if fileExten not in ARCHIVE_EXTENSIONS: 101 | raise Exception('Unsupported archive file format: ' + fileExten) 102 | logger.info('Reading from archive file: ' + path) 103 | patoolib.test_archive(path, verbosity=1) # test integrity of archive 104 | parentDir = os.path.dirname(path) 105 | tempDir = os.path.join(parentDir, 'tmp') 106 | logger.info('Extracting from ' + path + ' to ' + tempDir) 107 | patoolib.list_archive(path) 108 | subprocess.call('mkdir -p "%s"' % tempDir, shell=True) # create temporary directory 109 | patoolib.extract_archive(path, outdir=tempDir) # extract archive files to temporary directory 110 | results = read_from_directory(directory=tempDir, config=config) 111 | logger.info('Removing temporary directory: ' + tempDir) 112 | subprocess.call('rm -rf "%s"' % tempDir, shell=True) # remove temporary directory 113 | if outputPath: 114 | for imageTempPath, script in results.items(): 115 | imagePath = path + '/' + _get_file_name(path=imageTempPath) 116 | write_to_file(imagePath=imagePath, script=script, outputPath=outputPath) 117 | else: 118 | return results 119 | 120 | 121 | def _get_file_name(path): 122 | ''' 123 | Get the base name of given file. 124 | 125 | Parameters 126 | path: string 127 | Path to file. 128 | ''' 129 | # from https://stackoverflow.com/questions/8384737/extract-file-name-from-path-no-matter-what-the-os-path-format 130 | head, tail = ntpath.split(path) 131 | return tail or ntpath.basename(head) 132 | 133 | 134 | def read_from_directory(directory, outputPath=None, config=Config()): 135 | ''' 136 | Read script from all image files or archive files of images in given directory recursively. 
def read_from_directory(directory, outputPath=None, config=None):
    '''
    Read script from all image files or archive files of images in given directory recursively.

    Parameters
    directory: string
        Directory to read from.
    outputPath: string or None
        If not None, will write the extracted script to this path.
        Else, will return the extracted script.
    config: comicsocr.src.config.Config() object or None
        Configurations. None means default configurations.

    Return: None or dict
        If outputPath is None (or empty), return a dictionary of comic page paths and scripts
        extracted from each page. Pages that come from an archive file are keyed by
        '<archive path>/<page file name>'.
        Otherwise the scripts are written to outputPath and None is returned.
    '''
    if config is None:
        config = Config()
    logger.info('Reading from directory: ' + directory)
    results = {}
    for subDir, _dirs, files in os.walk(directory):
        for file in files:
            fileExten = os.path.splitext(file)[1]
            filePath = os.path.join(subDir, file)
            # The readers below are called without outputPath on purpose: with it they write
            # the rows themselves and return None, which cannot be collected in `results`.
            # Everything is written once at the end instead.
            if fileExten in IMAGE_EXTENSIONS:
                results[filePath] = read_from_file(imagePath=filePath, config=config)
            elif fileExten in ARCHIVE_EXTENSIONS:
                archiveFileResults = read_from_archive_file(path=filePath, config=config)
                for imageTempPath, script in archiveFileResults.items():
                    # the extracted page no longer exists; refer to it through its archive,
                    # the same way read_from_archive_file() does when it writes rows
                    results[filePath + '/' + _get_file_name(path=imageTempPath)] = script
    if outputPath:
        for imagePath, script in results.items():
            write_to_file(imagePath=imagePath, script=script, outputPath=outputPath)
    else:
        return results
def _create_config_from_dict(configDict, defaultConfigSection=DEFAULT_CONFIG_SECTION):
    '''
    Create Config() object from dictionary.

    Parameters
    configDict: dict
        A dictionary of configurations specified in key-value pairs, optionally nested under
        the top-level section name. The keys must be Config() keyword arguments.
    defaultConfigSection: string
        The top-level config section that needs to be added on top of the configDict before feeding to configParser.read_dict(), default to 'comicsocr'.

    Return: comicsocr.src.config.Config() object
        The Config() object created from configDict.
    '''
    # interpolation=None: values are taken verbatim, so a '%' (e.g. in charsAllowed) is just a character
    configParser = configparser.ConfigParser(interpolation=None)
    configParser.optionxform = str  # makes the parser case-sensitive
    if defaultConfigSection not in configDict:
        configDict = {defaultConfigSection: configDict}  # add top-level section
    configParser.read_dict(configDict)  # configParser assumes the existence of a top-level section
    args = _parse_args_in_correct_type(configParser, defaultConfigSection)
    return Config(**args)


def _parse_args_in_correct_type(configParser, defaultConfigSection=DEFAULT_CONFIG_SECTION):
    '''
    Parse parameters in config, converting the string values held by the parser back to Python values.

    Parameters
    configParser: a configparser.ConfigParser() object
        Parser holding the configurations.
    defaultConfigSection: string
        The config section to read the parameters from, default to 'comicsocr'.

    Return: dict
        A dictionary of configuration key-value pairs where values are of correct type.
        Values that are Python literals (booleans, numbers, dicts, lists, quoted strings)
        are converted; anything else is kept as the string it is.
    '''
    args = {}
    section = configParser[defaultConfigSection]
    for key in section:
        rawValue = section[key]
        try:
            args[key] = ast.literal_eval(rawValue)
        except (ValueError, SyntaxError):
            # not a Python literal, e.g. a plain word such as complex: keep the text itself
            args[key] = rawValue
    return args