├── .gitignore ├── LICENSE ├── Readme.md ├── my_little_ocr ├── __init__.py ├── base_engine │ ├── __init__.py │ ├── base_ocr_engine.py │ ├── engine_config.py │ └── img_utils.py └── ocr_engines │ ├── __init__.py │ ├── easyocr_engine.py │ ├── rapidocr_engine │ ├── .gitignore │ ├── __init__.py │ └── rapidocr_engine.py │ ├── surya_engine.py │ ├── tesseract_engine │ ├── __init__.py │ ├── install.py │ └── tesseract_engine.py │ └── wechat_ocr_engine │ ├── .gitignore │ ├── __init__.py │ ├── install.py │ ├── wechat_ocr_engine.py │ └── wechat_ocr_modified_lib.py ├── pyproject.toml └── test ├── ocr_images ├── OCR_test_1080_en-US.png ├── OCR_test_1080_ja-jp.png ├── OCR_test_1080_words_en-US.txt ├── OCR_test_1080_words_ja-jp.txt ├── OCR_test_1080_words_zh-Hans-CN.txt └── OCR_test_1080_zh-Hans-CN.png └── test_ocr.py /.gitignore: -------------------------------------------------------------------------------- 1 | runtime/ 2 | settings.toml 3 | poetry.lock 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 114 | .pdm.toml 115 | .pdm-python 116 | .pdm-build/ 117 | 118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 119 | __pypackages__/ 120 | 121 | # Celery stuff 122 | celerybeat-schedule 123 | celerybeat.pid 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | 155 | # pytype static type analyzer 156 | .pytype/ 157 | 158 | # Cython debug symbols 159 | cython_debug/ 160 | 161 | # PyCharm 162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 164 | # and can be added to the global gitignore or merged into this file. For a more nuclear 165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 166 | #.idea/ 167 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 X-T-E-R 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # MyLittleOCR API Library 2 | 3 | ## Overview 4 | 5 | MyLittleOCR is a unified wrapper for several popular OCR libraries, providing a consistent API that allows developers to seamlessly integrate and switch between different OCR engines without altering their application logic. 6 | 7 | ## Features 8 | 9 | - **Unified API**: A consistent interface for multiple OCR engines, simplifying OCR integration. 10 | - **Multiple OCR Backends**: Supports popular OCR libraries including Tesseract, EasyOCR, PaddleOCR, WeChat OCR, Surya, and RapidOCR. 11 | - **Flexible Switching**: Easily switch between OCR backends based on project requirements. 12 | - **Customizable**: Fine-tune accuracy and performance by adjusting parameters for each OCR engine. 13 | 14 | ## Installation 15 | 16 | Install MyLittleOCR using pip: 17 | 18 | To install the base package: 19 | 20 | ```bash 21 | pip install my_little_ocr 22 | ``` 23 | 24 | To install MyLittleOCR with specific OCR backends, use the following commands: 25 | 26 | - To install all supported OCR backends: 27 | ```bash 28 | pip install my_little_ocr[all] 29 | ``` 30 | - To install with Tesseract support: 31 | ```bash 32 | pip install my_little_ocr[tesseract] 33 | ``` 34 | - To install with EasyOCR support: 35 | ```bash 36 | pip install my_little_ocr[easyocr] 37 | ``` 38 | - To install with WeChat OCR support: 39 | ```bash 40 | pip install my_little_ocr[wechat_ocr] 41 | ``` 42 | - To install with Surya OCR support: 43 | ```bash 44 | pip install my_little_ocr[surya] 45 | ``` 46 | - To install with RapidOCR support: 47 | ```bash 48 | pip install my_little_ocr[rapidocr] 49 | ``` 50 | 51 | 52 | ## Supported OCR Libraries 53 | 54 | Below is a list of supported OCR libraries along with their licenses, project URLs, and usage examples with optional parameters. 55 | 56 | | OCR Engine | License | Project URL | 57 | | ------------ | ---------- | ---------------------------------------------------------- | 58 | | Tesseract | Apache 2.0 | [Tesseract](https://github.com/madmaze/pytesseract) | 59 | | EasyOCR | Apache 2.0 | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | 60 | | WeChat OCR | Unknown | [WeChat OCR](https://github.com/kanadeblisst00/wechat_ocr) | 61 | | Surya | GPL 3.0 | [Surya](https://github.com/VikParuchuri/surya) | 62 | | RapidOCR | Apache 2.0 | [RapidOCR](https://github.com/RapidAI/RapidOCR) | 63 | 64 | ### Tesseract 65 | 66 | - **License**: Apache 2.0 67 | - **Project URL**: [Tesseract](https://github.com/madmaze/pytesseract) 68 | 69 | To install Tesseract, first ensure you have the Tesseract binary installed. You can download it from [here](https://tesseract-ocr.github.io/tessdoc/#binaries). Then install the Python wrapper: 70 | 71 | ```bash 72 | pip install pytesseract 73 | ``` 74 | 75 | #### Usage with Optional Parameters 76 | 77 | The `TesseractEngine` class can be instantiated with the following optional parameters: 78 | 79 | - `tesseract_command`: Path to the Tesseract executable. If not provided, it attempts to find it automatically. 80 | - `default_langs`: A list of language codes or names. Default is `['eng', 'chi_sim']`. 81 | 82 | > **Note**: You can use language names like 'English', 'eng', or 'en'. The program automatically converts them using the `iso639` library. 83 | 84 | ### EasyOCR 85 | 86 | - **License**: Apache 2.0 87 | - **Project URL**: [EasyOCR](https://github.com/JaidedAI/EasyOCR) 88 | 89 | Install EasyOCR using: 90 | 91 | ```bash 92 | pip install easyocr 93 | ``` 94 | 95 | #### Usage with Optional Parameters 96 | 97 | The `EasyOCREngine` class accepts the following optional parameters: 98 | 99 | - `default_langs`: A list of language codes or names. Default is `['ch_sim', 'en']`. 100 | - Additional parameters supported by EasyOCR's `Reader` class (see [EasyOCR Documentation](https://www.jaided.ai/easyocr/documentation/)). 101 | 102 | ### WeChat OCR 103 | 104 | - **License**: Unknown (utilizes components from WeChat, a closed-source project. Use with caution and do not use for commercial purposes. Only supported on Windows.) 105 | - **Project URL**: [WeChat OCR](https://github.com/kanadeblisst00/wechat_ocr) 106 | 107 | Install WeChat OCR using: 108 | 109 | ```bash 110 | pip install wechat_ocr 111 | ``` 112 | 113 | #### Usage 114 | 115 | The `WechatOCREngine` does not require additional optional parameters for instantiation. 116 | 117 | ### Surya 118 | 119 | - **License**: GPL 3.0 120 | - **Project URL**: [Surya](https://github.com/VikParuchuri/surya) 121 | 122 | Install Surya using: 123 | 124 | ```bash 125 | pip install surya-ocr 126 | ``` 127 | 128 | #### Usage with Optional Parameters 129 | 130 | The `SuryaEngine` class can be instantiated with the following optional parameters: 131 | 132 | - `default_langs`: A list of language codes or names. Default is `['en', 'zh', '_math']`. 133 | - Additional parameters can be passed via `**kwargs`. 134 | 135 | ### RapidOCR 136 | 137 | - **License**: Apache 2.0 138 | - **Project URL**: [RapidOCR](https://github.com/RapidAI/RapidOCR) 139 | 140 | Install RapidOCR using: 141 | 142 | ```bash 143 | pip install rapidocr_onnxruntime 144 | ``` 145 | 146 | #### Usage with Optional Parameters 147 | 148 | The `RapidOCREngine` class accepts the following optional parameters: 149 | 150 | - `det_model`: Detection model path or name. Default is `'ch_PP-OCRv4_det_infer.onnx'`. 151 | - `rec_model`: Recognition model path or name. Default is `'ch_PP-OCRv4_rec_infer.onnx'`. 152 | - Additional parameters supported by `RapidOCR` (see [RapidOCR API Documentation](https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/)). 153 | 154 | > **Note**: The models will be automatically downloaded if not present. You can specify custom model paths as needed. 155 | 156 | ## Quick Start 157 | 158 | Here's an example of how to use the MyLittleOCR API to extract text from an image: 159 | 160 | ```python 161 | from my_little_ocr import get_engine_instance 162 | 163 | # Get an instance of the desired OCR engine (e.g., 'tesseract', 'easyocr', 'paddleocr', 'wechat_ocr', 'surya', 'rapidocr') 164 | ocr_engine = get_engine_instance('rapidocr') 165 | 166 | # Extract text from an image 167 | ocr_result = ocr_engine.ocr('/path/to/image.jpg') 168 | 169 | # Convert OCR result to a list and print it 170 | print("OCR Result:", ocr_result.to_list()) 171 | ``` 172 | 173 | ## Ways to Interact with the API 174 | 175 | There are two main ways to interact with the API in MyLittleOCR: 176 | 177 | ### 1. Engine Management-Based API Interaction 178 | 179 | - **Get Engine Instance**: Use `get_engine_instance(engine_name, **kwargs)` to get an instance of a specific OCR engine with optional parameters. 180 | 181 | ```python 182 | from my_little_ocr import get_engine_instance 183 | 184 | engine_instance = get_engine_instance('easyocr') 185 | result = engine_instance.ocr('/path/to/image.jpg') 186 | print(result.to_list()) 187 | ``` 188 | 189 | - **Get Engine Class**: Use `get_engine_class(engine_name)` to get the class of a specific OCR engine. 190 | 191 | ```python 192 | from my_little_ocr import get_engine_class 193 | 194 | EasyOCREngine = get_engine_class('easyocr') 195 | engine_instance = EasyOCREngine() 196 | ``` 197 | - **Get All Engines**: Use `get_all_engines()` to retrieve all registered OCR engines. 198 | 199 | ```python 200 | from my_little_ocr import get_all_engines 201 | 202 | engines = get_all_engines() 203 | for engine_name, engine_class in engines.items(): 204 | print(f"Engine Name: {engine_name}") 205 | engine_instance = engine_class() 206 | result = engine_instance.ocr('/path/to/image.jpg') 207 | print(result.to_list()) 208 | ``` 209 | 210 | 211 | ### 2. Direct Import from Specific Library 212 | 213 | Directly import the engine class from the specific OCR engine module, then instantiate and use it. 214 | 215 | ```python 216 | from my_little_ocr.ocr_engines.easyocr_engine import EasyOCREngine 217 | 218 | engine_instance = EasyOCREngine( 219 | default_langs=['English', 'Korean'], 220 | gpu=False 221 | ) 222 | result = engine_instance.ocr('/path/to/image.jpg') 223 | print(result.to_list()) 224 | ``` 225 | 226 | ## Working with OCR Results 227 | 228 | The `OCRResult` class represents OCR results and provides methods to process and filter them. 229 | 230 | ### Example 231 | 232 | Create an `OCRResult` instance by providing a list of `OCRItem` instances. 233 | 234 | ```python 235 | from my_little_ocr.base_engine.base_ocr_engine import OCRResult, OCRItem 236 | 237 | ocr_items = [ 238 | OCRItem( 239 | text="example", 240 | box=[[0, 0], [1, 0], [1, 1], [0, 1]], 241 | confidence=0.9 242 | ) 243 | ] 244 | ocr_result = OCRResult(ocr_items=ocr_items) 245 | ``` 246 | 247 | ### Filtering Results 248 | 249 | Use `filter_by_confidence(confidence_threshold)` to filter OCR results based on confidence scores. 250 | 251 | ```python 252 | filtered_result = ocr_result.filter_by_confidence(0.8) 253 | ``` 254 | 255 | ### Converting Results to List 256 | 257 | Use `to_list(text_only=False)` to convert OCR results to a list. Set `text_only=True` to retrieve only the text content. 258 | 259 | ```python 260 | text_list = ocr_result.to_list(text_only=True) 261 | full_list = ocr_result.to_list(text_only=False) 262 | ``` 263 | 264 | These methods provide flexible ways to manage and utilize OCR results. 265 | 266 | ## Base OCREngine Class 267 | 268 | Below is the abstract base class for all OCR engines: 269 | 270 | ```python 271 | from abc import ABC, abstractmethod 272 | from typing import Union 273 | from PIL import Image 274 | import numpy as np 275 | 276 | ImageLike = Union[str, bytes, np.ndarray, Image.Image] 277 | 278 | class BaseOCREngine(ABC): 279 | """ 280 | Abstract base class for OCR engines. 281 | """ 282 | 283 | ocr_engine_name: str = "Base OCR Engine" 284 | 285 | @abstractmethod 286 | def ocr(self, img: ImageLike) -> OCRResult: 287 | """ 288 | Performs OCR on the given image. 289 | 290 | Args: 291 | img (ImageLike): The image to perform OCR on. 292 | 293 | Returns: 294 | OCRResult: The OCR result. 295 | """ 296 | pass 297 | ``` 298 | 299 | The input `img` supports multiple formats: 300 | 301 | - File path as a string 302 | - Bytes 303 | - NumPy array (`np.ndarray`) 304 | - PIL Image (`Image.Image`) 305 | 306 | ## Contributing 307 | 308 | Contributions are welcome! If you'd like to add support for more OCR libraries or suggest improvements, feel free to open an issue or submit a pull request. 309 | 310 | ## License 311 | 312 | This project is licensed under the MIT License. 313 | 314 | **Note**: Individual OCR engines may have different licenses. Please refer to their respective project pages for more details. 315 | 316 | ## Acknowledgments 317 | 318 | We would like to thank the following libraries that make this project possible: 319 | 320 | - **Pydantic** 321 | - **GitPython** 322 | - **iso639-lang** 323 | 324 | And all the OCR libraries mentioned above. 325 | 326 | ## Credits 327 | 328 | Thanks to all the contributors and maintainers of the OCR libraries that were used in this project. 329 | 330 | -------------------------------------------------------------------------------- /my_little_ocr/__init__.py: -------------------------------------------------------------------------------- 1 | from .ocr_engines import get_all_engines, get_engine_instance, get_engine_class 2 | from .base_engine.engine_config import register_engine, EngineConfig 3 | from .base_engine.img_utils import ImageLike, convert_imagelike_to_type 4 | from .base_engine.base_ocr_engine import BaseOCREngine, OCRResult, OCRItem 5 | -------------------------------------------------------------------------------- /my_little_ocr/base_engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/my-little-ocr/ca838afc123689d94bd024d45715542e18a74e64/my_little_ocr/base_engine/__init__.py -------------------------------------------------------------------------------- /my_little_ocr/base_engine/base_ocr_engine.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Optional 3 | from pydantic import BaseModel, Field, field_validator 4 | import numpy as np 5 | from .img_utils import ImageLike, convert_imagelike_to_type 6 | import json 7 | 8 | class OCRItem(BaseModel): 9 | """ 10 | Represents an OCR item containing text and its location. 11 | """ 12 | 13 | text: str = Field(..., description="The text content of the OCR item") 14 | # 4 points, each represented as a list of 2 integers 15 | box: Optional[list[list[int]]] = Field( 16 | None, 17 | description="The position of the OCR item in the image, represented as a list of 4 points", 18 | ) 19 | confidence: Optional[float] = Field( 20 | None, description="The confidence score of the OCR item" 21 | ) 22 | 23 | @field_validator("box", mode="before") 24 | def convert_float_to_int(cls, v): 25 | if v is not None: 26 | # Ensure all coordinates are rounded to nearest integer, handling NumPy types as well 27 | return [ 28 | [ 29 | ( 30 | round(coord) 31 | if isinstance(coord, (float, np.floating)) 32 | else int(coord) 33 | ) 34 | for coord in point 35 | ] 36 | for point in v 37 | ] 38 | return v 39 | 40 | def dict(self) -> dict: 41 | """ 42 | Converts the OCR item to a dictionary. 43 | """ 44 | return self.model_dump() 45 | 46 | 47 | class OCRResult(BaseModel): 48 | """ 49 | Represents the result of OCR. 50 | """ 51 | 52 | ocr_items: list[OCRItem] = Field(..., description="The list of OCR items") 53 | default_confidence_threshold: float = Field( 54 | 0.3, description="The default confidence threshold for filtering OCR items" 55 | ) 56 | 57 | def __post_init__(self): 58 | self.ocr_items = [ 59 | item 60 | for item in self.ocr_items 61 | if item.confidence is not None 62 | and item.confidence >= self.default_confidence_threshold 63 | ] 64 | 65 | def filter_by_confidence(self, confidence_threshold: float) -> "OCRResult": 66 | """ 67 | Filters the OCR items by confidence score. 68 | """ 69 | return OCRResult( 70 | ocr_items=[ 71 | item 72 | for item in self.ocr_items 73 | if item.confidence is not None 74 | and item.confidence >= confidence_threshold 75 | ] 76 | ) 77 | 78 | def to_list(self, text_only: bool = False) -> list: 79 | """ 80 | Converts the OCR result to a list of strings. 81 | """ 82 | if text_only: 83 | return [item.text for item in self.ocr_items] 84 | else: 85 | return [item.dict() for item in self.ocr_items] 86 | 87 | def to_string(self, separator: str = " ") -> str: 88 | """ 89 | Converts the OCR result to a string. 90 | """ 91 | return separator.join(self.to_list(text_only=True)) 92 | 93 | def to_json(self, text_only: bool = False, **kwargs) -> str: 94 | """ 95 | Converts the OCR result to a JSON string. 96 | """ 97 | kwargs["ensure_ascii"] = False 98 | return json.dumps( 99 | self.to_list(text_only=text_only), **kwargs 100 | ) 101 | 102 | 103 | class BaseOCREngine(ABC): 104 | """ 105 | Abstract base class for OCR engines. 106 | """ 107 | 108 | ocr_engine_name: str = "Base OCR Engine" 109 | 110 | @abstractmethod 111 | def ocr(self, img: ImageLike) -> OCRResult: 112 | """ 113 | Performs OCR on the given image path synchronously. 114 | 115 | Args: 116 | img (ImageLike): The image to perform OCR on. 117 | 118 | Returns: 119 | Any: The OCR result. 120 | """ 121 | pass 122 | -------------------------------------------------------------------------------- /my_little_ocr/base_engine/engine_config.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from .base_ocr_engine import BaseOCREngine 3 | from typing import Type, Dict, Any, Optional, Callable 4 | 5 | class EngineConfig(BaseModel): 6 | engine_name: str 7 | engine_class: Type[BaseOCREngine] 8 | project_url: Optional[str] = None 9 | 10 | engines: Dict[str, EngineConfig] = {} 11 | 12 | def register_engine(engine_config: EngineConfig): 13 | engines[engine_config.engine_name] = engine_config 14 | 15 | def get_engine(engine_name: str) -> EngineConfig: 16 | return engines[engine_name] 17 | -------------------------------------------------------------------------------- /my_little_ocr/base_engine/img_utils.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import Literal, Union 3 | from os import PathLike 4 | from PIL import Image 5 | import numpy as np 6 | import cv2 7 | import tempfile 8 | from io import BytesIO 9 | 10 | ImageLike = Union[str, bytes, np.ndarray, Image.Image, PathLike] 11 | 12 | def convert_imagelike_to_type(img: ImageLike, type: Literal["filepath", "numpy", "pil"]) -> Union[str, np.ndarray, Image.Image]: 13 | """ 14 | Converts the input image-like object to the specified type. 15 | 16 | Args: 17 | img (ImageLike): The image-like object to convert. 18 | type (Literal["filepath", "numpy", "pil"]): The type to convert to. 19 | 20 | Returns: 21 | Union[str, np.ndarray, Image.Image]: The converted image. 22 | """ 23 | # If the desired type is 'filepath' 24 | if type == "filepath": 25 | if isinstance(img, (str, PathLike)): 26 | # Already a filepath 27 | return str(img) 28 | else: 29 | # Create a temporary file to save the image 30 | with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp_file: 31 | filepath = tmp_file.name 32 | if isinstance(img, Image.Image): 33 | img.save(filepath) 34 | elif isinstance(img, np.ndarray): 35 | # Save NumPy array using OpenCV 36 | cv2.imwrite(filepath, img) 37 | elif isinstance(img, bytes): 38 | # Convert bytes to PIL Image and save 39 | pil_img = Image.open(BytesIO(img)) 40 | pil_img.save(filepath) 41 | else: 42 | raise TypeError("Unsupported image type for conversion to filepath.") 43 | return filepath 44 | 45 | # If the desired type is 'pil' 46 | elif type == "pil": 47 | if isinstance(img, Image.Image): 48 | return img # Already a PIL image 49 | elif isinstance(img, np.ndarray): 50 | # Convert from OpenCV format (BGR) to PIL format (RGB) 51 | img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 52 | pil_img = Image.fromarray(img_rgb) 53 | return pil_img 54 | elif isinstance(img, bytes): 55 | return Image.open(BytesIO(img)) 56 | elif isinstance(img, (str, PathLike)): 57 | return Image.open(img) 58 | else: 59 | raise TypeError("Unsupported image type for conversion to PIL image.") 60 | 61 | # If the desired type is 'numpy' (OpenCV format) 62 | elif type == "numpy": 63 | if isinstance(img, np.ndarray): 64 | return img # Already a NumPy array in OpenCV format 65 | elif isinstance(img, Image.Image): 66 | # Convert PIL image to NumPy array in OpenCV format (BGR) 67 | img_rgb = np.array(img) 68 | img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR) 69 | return img_bgr 70 | elif isinstance(img, bytes): 71 | # Decode bytes to NumPy array using OpenCV 72 | nparr = np.frombuffer(img, np.uint8) 73 | img_cv = cv2.imdecode(nparr, cv2.IMREAD_COLOR) 74 | return img_cv 75 | elif isinstance(img, (str, PathLike)): 76 | # Read image from file using OpenCV 77 | img_cv = cv2.imread(str(img)) 78 | return img_cv 79 | else: 80 | raise TypeError("Unsupported image type for conversion to NumPy array.") 81 | 82 | else: 83 | raise ValueError(f"Unknown target type: {type}") -------------------------------------------------------------------------------- /my_little_ocr/ocr_engines/__init__.py: -------------------------------------------------------------------------------- 1 | from my_little_ocr.base_engine.engine_config import get_engine as get_engine_after_register 2 | from my_little_ocr.base_engine.base_ocr_engine import BaseOCREngine 3 | import importlib 4 | from typing import Type 5 | from pathlib import Path 6 | 7 | engine_instances :dict[str, BaseOCREngine] = {} 8 | 9 | def deal_with_engine_name(engine_name: str) -> str: 10 | engine_name_with_engine = engine_name if engine_name.endswith("_engine") else engine_name + "_engine" 11 | engine_name = engine_name_with_engine.replace("_engine", "") 12 | return engine_name, engine_name_with_engine 13 | 14 | 15 | def get_engine_class(engine_name: str) -> Type[BaseOCREngine]: 16 | engine_name, engine_name_with_engine = deal_with_engine_name(engine_name) 17 | importlib.import_module(f"my_little_ocr.ocr_engines.{engine_name_with_engine}") 18 | return get_engine_after_register(engine_name).engine_class 19 | 20 | def get_engine_instance(engine_name: str) -> BaseOCREngine: 21 | engine_name, engine_name_with_engine = deal_with_engine_name(engine_name) 22 | engine_class = get_engine_class(engine_name) 23 | if engine_name_with_engine in engine_instances: 24 | return engine_instances[engine_name_with_engine] 25 | engine_instances[engine_name_with_engine] = engine_class() 26 | return engine_instances[engine_name_with_engine] 27 | 28 | def get_all_engines(): 29 | result: dict[str, Type[BaseOCREngine]] = {} 30 | for file_or_folder in Path(__file__).parent.iterdir(): 31 | try: 32 | stem = file_or_folder.stem 33 | if stem.endswith("_engine"): 34 | engine_name = stem[:-7] 35 | engine_class = get_engine_class(engine_name) 36 | result[engine_name] = engine_class 37 | except Exception as e: 38 | print(f"Error: {e}") 39 | return result 40 | 41 | __all__ = ["get_engine_instance", "get_all_engines", "get_engine_class"] 42 | -------------------------------------------------------------------------------- /my_little_ocr/ocr_engines/easyocr_engine.py: -------------------------------------------------------------------------------- 1 | from my_little_ocr.base_engine.base_ocr_engine import ( 2 | BaseOCREngine, 3 | OCRItem, 4 | OCRResult, 5 | ImageLike, 6 | convert_imagelike_to_type, 7 | ) 8 | from typing import Literal, Optional, List 9 | import easyocr 10 | 11 | # fmt: off 12 | EASYOCR_LANGS = [ 13 | 'af','az','bs','cs','cy','da','de','en','es','et','fr','ga', 14 | 'hr','hu','id','is','it','ku','la','lt','lv','mi','ms','mt', 15 | 'nl','no','oc','pi','pl','pt','ro','rs_latin','sk','sl','sq', 16 | 'sv','sw','tl','tr','uz','vi','ar','fa','ug','ur','bn','as','mni', 17 | 'ru','rs_cyrillic','be','bg','uk','mn','abq','ady','kbd','ava', 18 | 'dar','inh','che','lbe','lez','tab','tjk','hi','mr','ne','bh','mai', 19 | 'ang','bho','mah','sck','new','gom','sa','bgc','th','ch_sim','ch_tra', 20 | 'ja','ko','ta','te','kn' 21 | ] 22 | # fmt: on 23 | 24 | from iso639 import Lang 25 | 26 | 27 | def convert_langs_to_easyocr_langs(langs: list[str]) -> list[str]: 28 | special_langs = ["ch_sim", "ch_tra", "rs_latin", "rs_cyrillic"] 29 | unknown_langs = set(langs) - set(EASYOCR_LANGS) 30 | known_langs = set(langs) - unknown_langs 31 | special_langs_in_langs = set(langs) & set(special_langs) 32 | 33 | pt1_to_easy_ocr = { 34 | "zh": "ch_sim", 35 | "ab": "abq", 36 | "ce": "che", 37 | "tg": "tjk", 38 | "sr": "rs_latin", 39 | } 40 | result = [] 41 | result.extend(special_langs_in_langs) 42 | for lang in unknown_langs: 43 | lang_pt1 = Lang(lang).pt1 44 | result.append(pt1_to_easy_ocr.get(lang_pt1, lang_pt1)) 45 | result.extend(known_langs) 46 | return result 47 | 48 | 49 | class EasyOCREngine(BaseOCREngine): 50 | ocr_engine_name = "easyocr" 51 | default_langs: list[str] = ["en"] 52 | 53 | def __init__(self, default_langs: list[str] = ["ch_sim", "en"], **kwargs): 54 | self.default_langs = convert_langs_to_easyocr_langs(default_langs) 55 | self.reader = easyocr.Reader(lang_list=self.default_langs, **kwargs) 56 | 57 | def ocr(self, img: ImageLike, **kwargs) -> OCRResult: 58 | img = convert_imagelike_to_type(img, "numpy") 59 | result = self.reader.readtext(img, **kwargs) 60 | return OCRResult(ocr_items=[ 61 | OCRItem(text=item[1], confidence=item[2], box=item[0]) 62 | for item in result 63 | ]) 64 | 65 | 66 | from my_little_ocr.base_engine.engine_config import EngineConfig, register_engine 67 | 68 | engine_config = EngineConfig( 69 | engine_name="easyocr", 70 | engine_class=EasyOCREngine, 71 | project_url="https://github.com/JaidedAI/EasyOCR", 72 | ) 73 | 74 | register_engine(engine_config) 75 | -------------------------------------------------------------------------------- /my_little_ocr/ocr_engines/rapidocr_engine/.gitignore: -------------------------------------------------------------------------------- 1 | models/ 2 | -------------------------------------------------------------------------------- /my_little_ocr/ocr_engines/rapidocr_engine/__init__.py: -------------------------------------------------------------------------------- 1 | from .rapidocr_engine import * -------------------------------------------------------------------------------- /my_little_ocr/ocr_engines/rapidocr_engine/rapidocr_engine.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | from rapidocr_onnxruntime import RapidOCR 3 | from pathlib import Path 4 | from my_little_ocr.base_engine.base_ocr_engine import ( 5 | BaseOCREngine, 6 | OCRItem, 7 | OCRResult, 8 | ImageLike, 9 | convert_imagelike_to_type, 10 | ) 11 | 12 | RECOGNITION_MODELS = Literal[ 13 | "ch_PP-OCRv4_rec_infer.onnx", 14 | "ch_PP-OCRv3_rec_infer.onnx", 15 | "ch_PP-OCRv2_rec_infer.onnx", 16 | "ch_ppocr_server_v2.0_rec_infer.onnx", 17 | "en_PP-OCRv3_rec_infer.onnx", 18 | "en_number_mobile_v2.0_rec_infer.onnx", 19 | # "korean_mobile_v2.0_rec_infer.onnx", 20 | "japan_rec_crnn_v2.onnx", 21 | ] 22 | 23 | DETECTION_MODELS = Literal[ 24 | "ch_PP-OCRv4_det_infer.onnx", 25 | "ch_PP-OCRv3_det_infer.onnx", 26 | "ch_PP-OCRv2_det_infer.onnx", 27 | "ch_ppocr_server_v2.0_det_infer.onnx", 28 | "en_PP-OCRv3_det_infer.onnx", 29 | "en_number_mobile_v2.0_det_infer.onnx", 30 | # "korean_mobile_v2.0_rec_infer.onnx", 31 | "japan_rec_crnn_v2.onnx", 32 | ] 33 | 34 | 35 | def get_model_version_by_name(model_name: str): 36 | model_versions = ["PP-OCRv4", "PP-OCRv3", "PP-OCRv2", "PP-OCRv1"] 37 | for version in model_versions: 38 | if version in model_name: 39 | return version 40 | return "PP-OCRv1" 41 | 42 | 43 | import requests 44 | from tqdm import tqdm 45 | 46 | 47 | def try_download_model(model_name: str) -> Path: 48 | if Path(model_name).exists(): 49 | return Path(model_name) 50 | 51 | # https://huggingface.co/SWHL/RapidOCR/resolve/main/PP-OCRv1/ch_ppocr_mobile_v2.0_det_infer.onnx?download=true 52 | model_folder = get_model_version_by_name(model_name) 53 | model_name = model_name.split("/")[-1] 54 | model_url = f"https://huggingface.co/SWHL/RapidOCR/resolve/main/{model_folder}/{model_name}?download=true" 55 | model_path = Path(__file__).parent / "models" / model_folder / model_name 56 | model_path.parent.mkdir(parents=True, exist_ok=True) 57 | if not model_path.exists(): 58 | response = requests.get(model_url, stream=True) 59 | total_size = int(response.headers.get("content-length", 0)) 60 | block_size = 1024 61 | 62 | with open(model_path, "wb") as file, tqdm( 63 | desc=model_name, 64 | total=total_size, 65 | unit="iB", 66 | unit_scale=True, 67 | unit_divisor=1024, 68 | ) as bar: 69 | for data in response.iter_content(block_size): 70 | file.write(data) 71 | bar.update(len(data)) 72 | 73 | return model_path 74 | 75 | 76 | class RapidOCREngine(BaseOCREngine): 77 | def __init__( 78 | self, 79 | det_model: DETECTION_MODELS = "ch_PP-OCRv4_det_infer.onnx", 80 | rec_model: RECOGNITION_MODELS = "ch_PP-OCRv4_rec_infer.onnx", 81 | **kwargs, 82 | ): 83 | self.engine = RapidOCR( 84 | det_model_path=try_download_model(det_model), 85 | rec_model_path=try_download_model(rec_model), 86 | **kwargs, 87 | ) 88 | 89 | def ocr(self, image: ImageLike): 90 | img = convert_imagelike_to_type(image, type="numpy") 91 | _result, elapse = self.engine(img) 92 | result = [] 93 | for line in _result: 94 | result.append(OCRItem(text=line[1], confidence=line[2], box=line[0])) 95 | return OCRResult(ocr_items=result) 96 | 97 | from my_little_ocr.base_engine.engine_config import EngineConfig, register_engine 98 | 99 | engine_config = EngineConfig( 100 | engine_name="rapidocr", 101 | engine_class=RapidOCREngine, 102 | project_url="https://github.com/RapidAI/RapidOCR" 103 | ) 104 | 105 | 106 | register_engine(engine_config) -------------------------------------------------------------------------------- /my_little_ocr/ocr_engines/surya_engine.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from surya.ocr import run_ocr, TextLine, OCRResult as SuryaOCRResult 3 | from surya.model.detection.model import ( 4 | load_model as load_det_model, 5 | load_processor as load_det_processor, 6 | ) 7 | from surya.model.recognition.model import load_model as load_rec_model 8 | from surya.model.recognition.processor import load_processor as load_rec_processor 9 | 10 | from iso639 import Lang 11 | from my_little_ocr.base_engine.base_ocr_engine import ( 12 | BaseOCREngine, 13 | OCRItem, 14 | ImageLike, 15 | OCRResult, 16 | convert_imagelike_to_type, 17 | ) 18 | from typing import Literal, Optional, List 19 | 20 | # fmt: off 21 | SURYA_LANGS = [ 22 | "_math", "en", "zh", "ja", 23 | "af", "am", "ar", "as", "az", "be", "bg", "bn", "br", "bs", "ca", 24 | "cs", "cy", "da", "de", "el", "eo", "es", "et", "eu", "fa", "fi", "fr", 25 | "fy", "ga", "gd", "gl", "gu", "ha", "he", "hi", "hr", "hu", "hy", "id", 26 | "is", "it", "jv", "ka", "kk", "km", "kn", "ko", "ku", "ky", "la", "lo", 27 | "lt", "lv", "mg", "mk", "ml", "mn", "mr", "ms", "my", "ne", "nl", "no", 28 | "om", "or", "pa", "pl", "ps", "pt", "ro", "ru", "sa", "sd", "si", "sk", 29 | "sl", "so", "sq", "sr", "su", "sv", "sw", "ta", "te", "th", "tl", "tr", 30 | "ug", "uk", "ur", "uz", "vi", "xh", "yi" 31 | ] 32 | # fmt: on 33 | 34 | 35 | 36 | def convert_langs_to_surya_langs(langs: list[str]) -> list[str]: 37 | known_langs = set(langs) & set(SURYA_LANGS) 38 | unknown_langs = set(langs) - set(SURYA_LANGS) 39 | surya_langs = [Lang(lang).pt1 for lang in unknown_langs] 40 | surya_langs = surya_langs + list(known_langs) 41 | return surya_langs 42 | 43 | 44 | class SuryaEngine(BaseOCREngine): 45 | ocr_engine_name = "surya" 46 | 47 | def __init__(self, default_langs: list[str] = ["en", "zh", "_math"], **kwargs): 48 | self.default_langs = convert_langs_to_surya_langs(default_langs) 49 | self.det_processor, self.det_model = load_det_processor(), load_det_model() 50 | self.rec_model, self.rec_processor = load_rec_model(), load_rec_processor() 51 | 52 | def ocr(self, img: ImageLike, langs: Optional[list[str]] = None) -> list[OCRItem]: 53 | img = convert_imagelike_to_type(img, type="pil") 54 | if langs is None: 55 | langs = self.default_langs 56 | else: 57 | langs = convert_langs_to_surya_langs(langs) 58 | predictions: SuryaOCRResult = run_ocr( 59 | [img], 60 | [langs], 61 | self.det_model, 62 | self.det_processor, 63 | self.rec_model, 64 | self.rec_processor, 65 | ) 66 | result = [] 67 | assert len(predictions) > 0, "No predictions found" 68 | text_lines: List[TextLine] = predictions[0].text_lines 69 | for line in text_lines: 70 | result.append( 71 | OCRItem( 72 | text=line.text, box=line.polygon, confidence=line.confidence 73 | ) 74 | ) 75 | return OCRResult(ocr_items=result) 76 | 77 | 78 | from my_little_ocr.base_engine.engine_config import EngineConfig, register_engine 79 | 80 | engine_config = EngineConfig( 81 | engine_name="surya", 82 | engine_class=SuryaEngine, 83 | project_url="https://github.com/VikParuchuri/surya", 84 | ) 85 | 86 | register_engine(engine_config) 87 | -------------------------------------------------------------------------------- /my_little_ocr/ocr_engines/tesseract_engine/__init__.py: -------------------------------------------------------------------------------- 1 | from .tesseract_engine import * -------------------------------------------------------------------------------- /my_little_ocr/ocr_engines/tesseract_engine/install.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import subprocess 4 | from pathlib import Path 5 | import urllib.request 6 | 7 | def get_tesseract_command(): 8 | system = platform.system() 9 | if system == "Windows": 10 | path = r"C:\Program Files\Tesseract-OCR\tesseract.exe" 11 | return path 12 | else: 13 | return "tesseract" 14 | 15 | 16 | def check_tesseract_installed(tesseract_command: str) -> bool: 17 | try: 18 | result = subprocess.run([str(tesseract_command), "--version"], check=True, capture_output=True, text=True) 19 | print("Tesseract installed successfully:\n", result.stdout) 20 | except subprocess.CalledProcessError: 21 | print("Tesseract installation failed.") 22 | return False 23 | except Exception as e: 24 | print(f"Error checking Tesseract installation: {e}") 25 | return False 26 | return True 27 | -------------------------------------------------------------------------------- /my_little_ocr/ocr_engines/tesseract_engine/tesseract_engine.py: -------------------------------------------------------------------------------- 1 | from my_little_ocr.base_engine.base_ocr_engine import BaseOCREngine, OCRResult, OCRItem, ImageLike, convert_imagelike_to_type 2 | from PIL import Image 3 | import pytesseract 4 | import numpy as np 5 | from iso639 import Lang 6 | from .install import get_tesseract_command, check_tesseract_installed 7 | 8 | # fmt: off 9 | TESSERACT_LANGS = [ 10 | 'afr', 'amh', 'ara', 'asm', 'aze', 'aze_cyrl', 'bel', 'ben', 'bod', 'bos', 'bul', 'cat', 'ceb', 'ces', 'chi_sim', 11 | 'chi_tra', 'chr', 'cym', 'dan', 'deu', 'dzo', 'ell', 'eng', 'enm', 'epo', 'est', 'eus', 'fas', 'fin', 'fra', 'frk', 12 | 'frm', 'gle', 'glg', 'grc', 'guj', 'hat', 'heb', 'hin', 'hrv', 'hun', 'iku', 'ind', 'isl', 'ita', 'ita_old', 'jav', 13 | 'jpn', 'kan', 'kat', 'kat_old', 'kaz', 'khm', 'kir', 'kor', 'kur', 'lao', 'lat', 'lav', 'lit', 'mal', 'mar', 'mkd', 14 | 'mlt', 'msa', 'mya', 'nep', 'nld', 'nor', 'ori', 'pan', 'pol', 'por', 'pus', 'ron', 'rus', 'san', 'sin', 'slk', 15 | 'slv', 'spa', 'spa_old', 'sqi', 'srp', 'srp_latn', 'swa', 'swe', 'syr', 'tam', 'tel', 'tgk', 'tgl', 'tha', 'tir', 16 | 'tur', 'uig', 'ukr', 'urd', 'uzb', 'uzb_cyrl', 'vie', 'yid'] 17 | # fmt: on 18 | 19 | replace_langs = { 20 | "chi": "chi_sim", 21 | } 22 | 23 | def convert_langs_to_tesseract_langs(langs: list[str]) -> list[str]: 24 | known_langs = set(TESSERACT_LANGS) & set(langs) 25 | unknown_langs = set(langs) - set(TESSERACT_LANGS) 26 | result = list(known_langs) 27 | for lang in unknown_langs: 28 | lang_pt2 = Lang(lang).pt2b 29 | result.append(replace_langs.get(lang_pt2, lang_pt2)) 30 | return result 31 | 32 | 33 | class TesseractEngine(BaseOCREngine): 34 | def __init__(self, tesseract_command: str = None, default_langs: list[str] = ["eng", "chi_sim"]): 35 | self.tesseract_command = tesseract_command or get_tesseract_command() 36 | check_tesseract_installed(self.tesseract_command) 37 | pytesseract.pytesseract.tesseract_cmd = self.tesseract_command 38 | self.default_langs = default_langs 39 | 40 | def ocr(self, img: ImageLike, langs: list[str] = None, commands: list[str] = None) -> OCRResult: 41 | pil_img = convert_imagelike_to_type(img, type="filepath") 42 | langs = langs or self.default_langs 43 | commands = commands or [] 44 | langs = convert_langs_to_tesseract_langs(langs) 45 | commands.append(f"-l {'+'.join(langs)}") 46 | text = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT, config=" ".join(commands)) 47 | left_list, top_list, height_list, width_list, conf_list,text_list = text['left'], text['top'], text['height'], text['width'], text['conf'],text['text'] 48 | result = [] 49 | for left, top, height, width, conf,text in zip(left_list, top_list, height_list, width_list, conf_list,text_list): 50 | if conf <0: 51 | continue 52 | confidence = conf / 100 53 | ocr_item = OCRItem( 54 | text=text, 55 | confidence=confidence, 56 | box=[[left, top], [left + width, top], [left + width, top + height], [left, top + height]] 57 | ) 58 | result.append(ocr_item) 59 | return OCRResult(ocr_items=result) 60 | 61 | 62 | from my_little_ocr.base_engine.engine_config import EngineConfig, register_engine 63 | 64 | engine_config = EngineConfig( 65 | engine_name="tesseract", 66 | engine_class=TesseractEngine, 67 | project_url="https://github.com/madmaze/pytesseract" 68 | ) 69 | 70 | register_engine(engine_config) 71 | -------------------------------------------------------------------------------- /my_little_ocr/ocr_engines/wechat_ocr_engine/.gitignore: -------------------------------------------------------------------------------- 1 | wxocr-binary/ -------------------------------------------------------------------------------- /my_little_ocr/ocr_engines/wechat_ocr_engine/__init__.py: -------------------------------------------------------------------------------- 1 | from my_little_ocr.base_engine.engine_config import EngineConfig, register_engine 2 | from .wechat_ocr_engine import WechatOCREngine 3 | 4 | engine_config = EngineConfig( 5 | engine_name='wechat_ocr', 6 | engine_class=WechatOCREngine, 7 | project_url="https://github.com/kanadeblisst00/wechat_ocr", 8 | ) 9 | 10 | register_engine(engine_config) 11 | -------------------------------------------------------------------------------- /my_little_ocr/ocr_engines/wechat_ocr_engine/install.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import git 3 | from pathlib import Path 4 | from git import RemoteProgress 5 | from git.exc import GitCommandError 6 | from tqdm import tqdm 7 | 8 | class ProgressPrinter(RemoteProgress): 9 | def __init__(self): 10 | super().__init__() 11 | self.pbar = None 12 | 13 | def update(self, op_code, cur_count, max_count=None, message=''): 14 | if self.pbar is None and max_count: 15 | self.pbar = tqdm(total=max_count, unit='objects', leave=False) 16 | 17 | if self.pbar: 18 | self.pbar.update(cur_count - self.pbar.n) 19 | 20 | if message: 21 | self.pbar.set_description(f"{message}") 22 | 23 | def close(self): 24 | if self.pbar: 25 | self.pbar.close() 26 | 27 | def check_install(exe_path:str): 28 | return Path(exe_path).exists() 29 | 30 | def install(): 31 | # 获取当前文件所在的目录 32 | current_dir = pathlib.Path(__file__).parent 33 | 34 | exe_path = current_dir / 'wxocr-binary' / 'WeChatOCR.exe' 35 | if check_install(exe_path): 36 | print(f'WeChatOCR.exe 已存在: {exe_path}') 37 | return 38 | 39 | # 定义目标子模块的路径 40 | submodule_path = current_dir / 'wxocr-binary' 41 | 42 | repo_url = 'https://github.com/Antonoko/wxocr-binary' 43 | 44 | progress = ProgressPrinter() 45 | 46 | try: 47 | if submodule_path.exists() and submodule_path.is_dir(): 48 | # 如果目录已经存在,则尝试更新 49 | repo = git.Repo(submodule_path) 50 | print(f'仓库已存在,正在更新: {submodule_path}') 51 | repo.remotes.origin.pull(progress=progress) 52 | progress.close() 53 | else: 54 | # 如果目录不存在,则克隆仓库 55 | print(f'目录不存在,正在克隆仓库到: {submodule_path}') 56 | git.Repo.clone_from(repo_url, submodule_path, progress=progress) 57 | progress.close() 58 | 59 | print(f'操作成功!仓库位于: {submodule_path}') 60 | 61 | except GitCommandError as e: 62 | print(f'Git 操作失败: {e}') 63 | except Exception as e: 64 | print(f'发生错误: {e}') 65 | -------------------------------------------------------------------------------- /my_little_ocr/ocr_engines/wechat_ocr_engine/wechat_ocr_engine.py: -------------------------------------------------------------------------------- 1 | from .wechat_ocr_modified_lib import OcrManager 2 | from pydantic import BaseModel, Field, field_validator 3 | from typing import Callable 4 | from my_little_ocr.base_engine.base_ocr_engine import ( 5 | BaseOCREngine, 6 | OCRItem, 7 | convert_imagelike_to_type, 8 | ImageLike, 9 | OCRResult, 10 | ) 11 | from threading import Lock 12 | from concurrent.futures import Future 13 | from pathlib import Path 14 | 15 | from .install import install 16 | install() 17 | 18 | class WechatOCRSettings(BaseModel): 19 | dir: str = Field(..., description="The directory of the WeChat OCR binary") 20 | exe_path: str = Field(..., description="The path to the WeChat OCR executable") 21 | 22 | @field_validator("dir", "exe_path", mode="before") 23 | def convert_to_path(cls, v): 24 | return str(Path(v).resolve()) 25 | 26 | 27 | class WechatOCREngine(BaseOCREngine): 28 | ocr_engine_name = "WeChat OCR" 29 | 30 | def __init__( 31 | self, 32 | dir: str = Path(__file__).parent / "wxocr-binary", 33 | exe_path: str = Path(__file__).parent / "wxocr-binary" / "WeChatOCR.exe", 34 | *args, 35 | **kwargs 36 | ): 37 | self.ocr_settings = WechatOCRSettings(dir=dir, exe_path=exe_path, **kwargs) 38 | 39 | self.ocr_manager: OcrManager = None 40 | self._future_results: dict[str, Future] = {} 41 | self._lock = Lock() 42 | 43 | self.init_wechat_ocr() 44 | 45 | def _wrapper_callback(self, img_path: str, wechat_ocr_results: dict): 46 | ocr_result = wechat_ocr_results["ocrResult"] 47 | ocr_item_list = [] 48 | for item_dict in ocr_result: 49 | location = item_dict["location"] # dict 50 | left, top, right, bottom = ( 51 | location["left"], 52 | location["top"], 53 | location["right"], 54 | location["bottom"], 55 | ) 56 | position = [[left, top], [right, top], [right, bottom], [left, bottom]] 57 | score = item_dict.get("score") 58 | ocr_item = OCRItem( 59 | text=item_dict["text"], box=position, confidence=score 60 | ) 61 | ocr_item_list.append(ocr_item) 62 | 63 | future = self._future_results.get(str(img_path)) 64 | if future: 65 | future.set_result(ocr_item_list) 66 | 67 | def init_wechat_ocr(self): 68 | self.ocr_manager: OcrManager = OcrManager(self.ocr_settings.dir) 69 | self.ocr_manager.SetExePath(self.ocr_settings.exe_path) 70 | self.ocr_manager.SetUsrLibDir(self.ocr_settings.dir) 71 | self.ocr_manager.SetOcrResultCallback(self._wrapper_callback) 72 | self.ocr_manager.StartWeChatOCR() 73 | 74 | def ocr_image_using_callback(self, img_path: str): 75 | self.ocr_manager.DoOCRTask(img_path) 76 | 77 | def ocr(self, img: ImageLike) -> OCRResult: 78 | img_path = convert_imagelike_to_type(img, "filepath") 79 | img_path = str(Path(img_path).resolve()) 80 | future = Future() 81 | with self._lock: 82 | self._future_results[img_path] = future 83 | 84 | print(f"OCR method img_path: {img_path}") # Debug statement 85 | 86 | self.ocr_image_using_callback(img_path) 87 | 88 | result = future.result(timeout=10) 89 | 90 | with self._lock: 91 | del self._future_results[img_path] 92 | 93 | return OCRResult(ocr_items=result) -------------------------------------------------------------------------------- /my_little_ocr/ocr_engines/wechat_ocr_engine/wechat_ocr_modified_lib.py: -------------------------------------------------------------------------------- 1 | # From Lib wechat_ocr.py 2 | import os 3 | import json 4 | import time 5 | import base64 6 | from enum import Enum 7 | from typing import Dict, Callable 8 | from multiprocessing import Queue, Value 9 | from google.protobuf.json_format import MessageToJson 10 | 11 | from wechat_ocr import ocr_protobuf_pb2 12 | from wechat_ocr.winapi import * 13 | from wechat_ocr.mmmojo_dll import MMMojoInfoMethod 14 | from wechat_ocr.xplugin_manager import XPluginManager 15 | 16 | 17 | OCR_MAX_TASK_ID = 32 18 | 19 | class RequestIdOCR(Enum): 20 | OCRPush = 1 21 | 22 | def OCRRemoteOnConnect(is_connected:c_bool, user_data:py_object): 23 | # print(f"OCRRemoteOnConnect 回调函数被调用, 参数, is_connected: {is_connected}") 24 | if user_data: 25 | manager_obj:OcrManager = cast(user_data, py_object).value 26 | manager_obj.SetConnectState(True) 27 | 28 | def OCRRemoteOnDisConnect(user_data:py_object): 29 | # print(f"OCRRemoteOnDisConnect 回调函数被调用 ") 30 | if user_data: 31 | manager_obj:OcrManager = cast(user_data, py_object).value 32 | manager_obj.SetConnectState(False) 33 | 34 | def OCRReadOnPush(request_id:c_uint32, request_info:c_void_p, user_data:py_object): 35 | # print(f"OCRReadOnPush 回调函数被调用 参数, request_id: {request_id}, request_info: {request_info}") 36 | if user_data: 37 | manager_obj:OcrManager = cast(user_data, py_object).value 38 | pb_size = c_uint32() 39 | pb_data = manager_obj.GetPbSerializedData(request_info, pb_size) 40 | if pb_size.value > 10: 41 | print(f"正在解析pb数据,pb数据大小: {pb_size.value}") 42 | manager_obj.CallUsrCallback(request_id, pb_data, pb_size.value) 43 | manager_obj.RemoveReadInfo(request_info) 44 | 45 | 46 | class OcrManager(XPluginManager): 47 | m_task_id = Queue(OCR_MAX_TASK_ID) 48 | m_id_path:Dict[int, str] = {} 49 | m_usr_lib_dir: str = None 50 | m_wechatocr_running: bool = False 51 | m_connect_state = Value('b', False) 52 | m_usr_callback: Callable = None 53 | 54 | def __init__(self, wechat_path) -> None: 55 | super().__init__(wechat_path) 56 | for i in range(1, 33): 57 | self.m_task_id.put(i) 58 | 59 | def __del__(self): 60 | if self.m_wechatocr_running: 61 | self.KillWeChatOCR() 62 | 63 | def SetUsrLibDir(self, usr_lib_dir:str): 64 | self.m_usr_lib_dir = usr_lib_dir 65 | self.AppendSwitchNativeCmdLine("user-lib-dir", usr_lib_dir) 66 | 67 | def SetOcrResultCallback(self, func:Callable): 68 | self.m_usr_callback = func 69 | 70 | def StartWeChatOCR(self): 71 | self.SetCallbackUsrData(self) 72 | self.InitMMMojoEnv() 73 | self.m_wechatocr_running = True 74 | 75 | def KillWeChatOCR(self): 76 | self.m_connect_state.value = False 77 | self.m_wechatocr_running = False 78 | self.StopMMMojoEnv() 79 | 80 | def DoOCRTask(self, pic_path:str): 81 | if not self.m_wechatocr_running: 82 | raise Exception("请先调用StartWeChatOCR启动") 83 | if not os.path.exists(pic_path): 84 | raise Exception(f"给定图片路径pic_path不存在: {pic_path}") 85 | pic_path = os.path.abspath(pic_path) 86 | while not self.m_connect_state.value: 87 | print("等待Ocr服务连接成功!") 88 | time.sleep(1) 89 | _id = self.GetIdleTaskId() 90 | if not _id: 91 | print("当前队列已满,请等待后重试") 92 | return 93 | self.SendOCRTask(_id, pic_path) 94 | 95 | def SetConnectState(self, connect:bool): 96 | self.m_connect_state.value = connect 97 | 98 | def SendOCRTask(self, task_id:int, pic_path:str): 99 | self.m_id_path[task_id] = pic_path 100 | ocr_request = ocr_protobuf_pb2.OcrRequest() 101 | ocr_request.unknow = 0 102 | ocr_request.task_id = task_id 103 | 104 | pic_paths = ocr_request.pic_path 105 | pic_paths.pic_path.extend([pic_path]) 106 | serialized_data = ocr_request.SerializeToString() 107 | self.SendPbSerializedData(serialized_data, len(serialized_data), MMMojoInfoMethod.kMMPush.value, 0, RequestIdOCR.OCRPush.value) 108 | 109 | def CallUsrCallback(self, request_id:c_uint32, serialized_data: c_void_p, data_size: int): 110 | ocr_response_ubyte = (c_ubyte * data_size).from_address(serialized_data) 111 | ocr_response_array = bytearray(ocr_response_ubyte) 112 | ocr_response = ocr_protobuf_pb2.OcrResponse() 113 | ocr_response.ParseFromString(ocr_response_array) 114 | json_response_str = MessageToJson(ocr_response) 115 | task_id = ocr_response.task_id 116 | if not self.m_id_path.get(task_id): 117 | return 118 | # print(f"收到识别结果, task_id: {task_id}, result: {json_response_str}") 119 | pic_path = self.m_id_path[task_id] 120 | if self.m_usr_callback: 121 | self.m_usr_callback(pic_path, self.parse_json_response(json_response_str)) 122 | self.SetTaskIdIdle(task_id) 123 | 124 | def parse_json_response(self, json_response_str:str): 125 | json_response = json.loads(json_response_str) 126 | results = { 127 | "taskId": json_response["taskId"], 128 | "ocrResult": [] 129 | } 130 | singleResult = json_response.get("ocrResult", {}).get("singleResult") 131 | if not singleResult: 132 | return results 133 | 134 | for i in singleResult: 135 | pos = i.get('singlePos', {}).get('pos') 136 | if isinstance(pos, list) and len(pos) == 1: 137 | pos = pos[0] 138 | text = base64.b64decode(i.get("singleStrUtf8", '')).decode('utf-8') 139 | r = { 140 | "text": text, 141 | "location": { 142 | "left": i.get('left'), 143 | "top": i.get("top"), 144 | "right": i.get('right'), 145 | "bottom": i.get('bottom') 146 | }, 147 | "score": i.get("singleRate"), 148 | } 149 | results["ocrResult"].append(r) 150 | return results 151 | 152 | def GetIdleTaskId(self): 153 | task_id = self.m_task_id.get(timeout=1) 154 | return task_id 155 | 156 | def SetTaskIdIdle(self, _id): 157 | self.m_task_id.put(_id) 158 | 159 | def SetDefaultCallbaks(self): 160 | super().SetOneCallback("kMMRemoteConnect", OCRRemoteOnConnect) 161 | super().SetOneCallback("kMMRemoteDisconnect", OCRRemoteOnDisConnect) 162 | super().SetOneCallback("kMMReadPush", OCRReadOnPush) 163 | super().SetDefaultCallbaks() 164 | 165 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "my_little_ocr" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["X-T-E-R "] 6 | license = "mit" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.10,<3.13" 11 | pydantic = "^2.9.2" 12 | numpy = "<2.0" 13 | gitpython = "^3.1.43" 14 | iso639-lang = "^2.4.2" 15 | opencv-python = "^4.10.0.84" 16 | pytesseract = {version = "^0.3.13", optional = true} 17 | easyocr = {version = "^1.7.2", optional = true} 18 | wechat-ocr = {version = "^0.0.3", optional = true} 19 | surya-ocr = {version = "^0.5.0", optional = true} 20 | rapidocr-onnxruntime = {version = "^1.3.24", optional = true} 21 | 22 | [build-system] 23 | requires = ["poetry-core"] 24 | build-backend = "poetry.core.masonry.api" 25 | 26 | [tool.poetry.extras] 27 | tesseract = ["pytesseract"] 28 | easyocr = ["easyocr"] 29 | wechat_ocr = ["wechat_ocr"] 30 | surya = ["surya-ocr"] 31 | rapidocr = ["rapidocr_onnxruntime"] 32 | all = ["pytesseract", "easyocr", "wechat_ocr", "surya-ocr", "rapidocr_onnxruntime"] 33 | -------------------------------------------------------------------------------- /test/ocr_images/OCR_test_1080_en-US.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/my-little-ocr/ca838afc123689d94bd024d45715542e18a74e64/test/ocr_images/OCR_test_1080_en-US.png -------------------------------------------------------------------------------- /test/ocr_images/OCR_test_1080_ja-jp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/my-little-ocr/ca838afc123689d94bd024d45715542e18a74e64/test/ocr_images/OCR_test_1080_ja-jp.png -------------------------------------------------------------------------------- /test/ocr_images/OCR_test_1080_words_en-US.txt: -------------------------------------------------------------------------------- 1 | Philosophy 2 | Historically, many of the individual sciences, like physics and psychology, formed part of philosophy. But they are considered separate academic disciplines in the modern sense of the term. The main traditions in the history of philosophy include Western, Arabic-Persian, Indian, and Chinese philosophy. Western philosophy originated in Ancient Greece and covers a wide area of philosophical subfields. A central topic in Arabic-Persian philosophy is the relation between reason and revelation. Indian philosophy combines the spiritual problem of how to reach enlightenment with the exploration of the nature of reality and the ways of arriving at knowledge. Chinese philosophy focuses on practical issues in relation to right social conduct, government, and self-cultivation. 3 | Semantic satiation 4 | Semantic satiation is a psychological phenomenon in which repetition causes a word or phrase to temporarily lose meaning for the listener, who then perceives the speech as repeated meaningless sounds. Extended inspection or analysis (staring at the word or phrase for a long time) in place of repetition also produces the same effect. -------------------------------------------------------------------------------- /test/ocr_images/OCR_test_1080_words_ja-jp.txt: -------------------------------------------------------------------------------- 1 | 油彩 2 | 油絵(あぶらえ)は、14世紀後半頃、ヨーロッパのネーデルラント地方(現在のオランダ、ベルギー地域)で生まれ、この地方の画家であるファン・アイク兄弟によって15世紀に確立したとされている。 その後、油絵の技術、技法はイタリアへもたらされて、イタリアでさらに発展した。 3 | 絵具は下層の影響を受けるため、絶縁層と描画層との間にしばしば、地塗りをして絵具の発色を良くし描画特性を高める層を設ける。地塗り層は、上層である絵具層からある程度の油分を吸収することで絵具の固着を良くする役割も果たすことから、地塗りは技法の中でも重要な役割を果たす。キャンバスには予め地塗りを施してあるものが市販されているほか、木枠に張られた商品もある。これは便利であるが、本人の要求を満たす適性を備えているとは限らない。購買層の多くは初学者や絵画教室の生徒である。 4 | Semantic satiation 5 | Semantic satiation is a psychological phenomenon in which repetition causes a word or phrase to temporarily lose meaning for the listener, who then perceives the speech as repeated meaningless sounds. Extended inspection or analysis (staring at the word or phrase for a long time) in place of repetition also produces the same effect. -------------------------------------------------------------------------------- /test/ocr_images/OCR_test_1080_words_zh-Hans-CN.txt: -------------------------------------------------------------------------------- 1 | 庞加莱复现定理 2 | 物理学上,庞加莱复现定理(英语:Poincaré recurrence theorem,又译为庞加莱回复定理或庞加莱回归定理)断言,对于某类系统而言,只要经过充分长但有限的时间,一定会到达某个与初始态任意接近的状态(若该系统具连续的状态),或者一定返回初始态本身(若该系统离散)。 3 | 庞加莱复现时间是复现前经过的时长。对于不同的初始态和不同的要求接近的程度,此时间亦不同。定理仅适用于满足某些条件的孤立力学系统,例如所有粒子都必须约束在某个有限体积的范围内。定理可以放在遍历理论、动态系统,或者统计力学的背景中讨论。适用此定理的系统称为守恒系统(与耗散系统相对)。 4 | 定理得名自亨利·庞加莱,其于1890年讨论过此定理1919年,康斯坦丁·卡拉西奥多里利用测度论证明了此定理。 5 | Semantic satiation 6 | Semantic satiation is a psychological phenomenon in which repetition causes a word or phrase to temporarily lose meaning for the listener, who then perceives the speech as repeated meaningless sounds. Extended inspection or analysis (staring at the word or phrase for a long time) in place of repetition also produces the same effect. -------------------------------------------------------------------------------- /test/ocr_images/OCR_test_1080_zh-Hans-CN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/X-T-E-R/my-little-ocr/ca838afc123689d94bd024d45715542e18a74e64/test/ocr_images/OCR_test_1080_zh-Hans-CN.png -------------------------------------------------------------------------------- /test/test_ocr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | sys.path.append(Path(__file__).resolve().parents[1].as_posix()) 4 | 5 | from my_little_ocr import get_all_engines 6 | 7 | 8 | for engine_name, engine in get_all_engines().items(): 9 | print(f"Now testing {engine_name}") 10 | engine_instance = engine() 11 | result = engine_instance.ocr(r"test\ocr_images\OCR_test_1080_zh-Hans-CN.png") 12 | print(result.to_list()) 13 | --------------------------------------------------------------------------------