├── .gitignore ├── .pre-commit-config.yaml ├── README.md ├── __init__.py ├── assets ├── icon.svg └── icon_light.svg ├── fiftyone.yml ├── ocr_engine.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.cython* 3 | *.egg 4 | *.pyc -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/asottile/blacken-docs 3 | rev: v1.12.0 4 | hooks: 5 | - id: blacken-docs 6 | additional_dependencies: [black==21.12b0] 7 | args: ["-l 79"] 8 | exclude: index.umd.js 9 | - repo: https://github.com/ambv/black 10 | rev: 22.3.0 11 | hooks: 12 | - id: black 13 | language_version: python3 14 | args: ["-l 79"] 15 | exclude: index.umd.js 16 | - repo: local 17 | hooks: 18 | - id: pylint 19 | name: pylint 20 | language: system 21 | files: \.py$ 22 | entry: pylint 23 | args: ["--errors-only"] 24 | exclude: index.umd.js 25 | - repo: local 26 | hooks: 27 | - id: ipynb-strip 28 | name: ipynb-strip 29 | language: system 30 | files: \.ipynb$ 31 | entry: jupyter nbconvert --clear-output --ClearOutputPreprocessor.enabled=True 32 | args: ["--log-level=ERROR"] 33 | - repo: https://github.com/pre-commit/mirrors-prettier 34 | rev: v2.6.2 35 | hooks: 36 | - id: prettier 37 | exclude: index.umd.js 38 | language_version: system 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## PyTesseract Optical Character Recognition Plugin 2 | 3 | funsd_predictions 4 | 5 | ### Updates 6 | 7 | - **2023-10-19**: Added support for customizing prediction fields, and embedded field for OCR text. 
8 | 9 | This plugin is a Python plugin that allows you to perform optical character 10 | recognition on documents using PyTesseract — the Python bindings for the 11 | Tesseract OCR engine! 12 | 13 | ## Watch On Youtube 14 | [![Video Thumbnail](https://img.youtube.com/vi/jnNPGrM6Wr4/0.jpg)](https://www.youtube.com/watch?v=jnNPGrM6Wr4&list=PLuREAXoPgT0RZrUaT0UpX_HzwKkoB-S9j&index=6) 15 | 16 | ## Installation 17 | 18 | ```shell 19 | fiftyone plugins download https://github.com/jacobmarks/pytesseract-ocr-plugin 20 | ``` 21 | 22 | You will also need to install the plugin's requirements: 23 | 24 | ```shell 25 | pip install -r requirements.txt 26 | ``` 27 | 28 | ## Operators 29 | 30 | ### `run_ocr_engine` 31 | 32 | - Runs the PyTesseract OCR engine on the documents in the dataset, converts the 33 | results to FiftyOne labels, and stores individual word predictions as well 34 | as block-level predictions on the dataset. 35 | 36 | ## Usage 37 | 38 | You can access the operator via the App's action menu, or by pressing the "`" 39 | key on your keyboard and selecting the operator from the dropdown menu. 40 | 41 | If you have a view loaded and/or samples selected, the operator will give you 42 | the option to run the OCR engine on only those samples or on the entire dataset. 43 | 44 | You can either choose to run the operator in the foreground, or to delegate the 45 | execution of the operator to a background job. 46 | 47 | ![ocr_queue_job](https://github.com/jacobmarks/pytesseract-ocr-plugin/assets/12500356/2ab239c1-8d37-44a7-b8d6-93285afe7f08) 48 | 49 | 💡 Once you've generated OCR predictions, you can search through them using the [Keyword Search plugin](https://github.com/jacobmarks/keyword-search-plugin)! 50 | 51 | funsd_block_predictions 52 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | """PyTesseract OCR plugin. 
2 | 3 | | Copyright 2017-2023, Voxel51, Inc. 4 | | `voxel51.com `_ 5 | | 6 | """ 7 | 8 | import os 9 | 10 | import fiftyone as fo 11 | from fiftyone.core.utils import add_sys_path 12 | import fiftyone.operators as foo 13 | from fiftyone.operators import types 14 | 15 | 16 | def _execution_mode(ctx, inputs): 17 | delegate = ctx.params.get("delegate", False) 18 | 19 | if delegate: 20 | description = "Uncheck this box to execute the operation immediately" 21 | else: 22 | description = "Check this box to delegate execution of this task" 23 | 24 | inputs.bool( 25 | "delegate", 26 | default=False, 27 | required=True, 28 | label="Delegate execution?", 29 | description=description, 30 | view=types.CheckboxView(), 31 | ) 32 | 33 | if delegate: 34 | inputs.view( 35 | "notice", 36 | types.Notice( 37 | label=( 38 | "You've chosen delegated execution. Note that you must " 39 | "have a delegated operation service running in order for " 40 | "this task to be processed. See " 41 | "https://docs.voxel51.com/plugins/index.html#operators " 42 | "for more information" 43 | ) 44 | ), 45 | ) 46 | 47 | 48 | def _handle_prediction_fields(ctx, inputs): 49 | obj = types.Object() 50 | obj.bool( 51 | "store_word_preds", 52 | label="Store word predictions?", 53 | default=False, 54 | view=types.SwitchView(space=4), 55 | ) 56 | obj.bool( 57 | "store_block_preds", 58 | label="Store block predictions?", 59 | default=True, 60 | view=types.SwitchView(space=4), 61 | ) 62 | inputs.define_property("prediction_field_types", obj) 63 | 64 | pfts = ctx.params.get("prediction_field_types", {}) 65 | store_word_preds = pfts.get("store_word_preds", False) 66 | store_block_preds = pfts.get("store_block_preds", True) 67 | 68 | if not store_word_preds and not store_block_preds: 69 | inputs.view( 70 | "warning", 71 | types.Warning( 72 | label="Not storing any predictions", 73 | description=( 74 | "You have chosen not to store any predictions. 
" 75 | "You must store at least one type of prediction to " 76 | "use the `OCR` operation. " 77 | ), 78 | ), 79 | ) 80 | 81 | if store_word_preds: 82 | inputs.str( 83 | "word_predictions_field", 84 | label="Word predictions field", 85 | default="pt_word_predictions", 86 | description="The field in which to store the word predictions", 87 | required=True, 88 | ) 89 | 90 | if store_block_preds: 91 | inputs.str( 92 | "block_predictions_field", 93 | label="Block predictions field", 94 | default="pt_block_predictions", 95 | description="The field in which to store the block predictions", 96 | required=True, 97 | ) 98 | 99 | inputs.bool( 100 | "store_text_as_labels", 101 | label="Store text as labels?", 102 | default=True, 103 | view=types.SwitchView(space=4), 104 | ) 105 | 106 | if not ctx.params.get("store_text_as_labels", True): 107 | inputs.str( 108 | "text_field", 109 | label="Text field", 110 | default="ocr_text", 111 | description="The embedded field in which to store the OCR text", 112 | required=True, 113 | ) 114 | 115 | 116 | def _get_prediction_fields(ctx): 117 | word_preds_field = ctx.params.get("word_predictions_field", None) 118 | block_preds_field = ctx.params.get("block_predictions_field", None) 119 | return word_preds_field, block_preds_field 120 | 121 | 122 | class OCR(foo.Operator): 123 | @property 124 | def config(self): 125 | _config = foo.OperatorConfig( 126 | name="run_ocr_engine", 127 | label="OCR: run optical character recognition on your images", 128 | dynamic=True, 129 | ) 130 | _config.icon = "/assets/icon_light.svg" 131 | return _config 132 | 133 | def resolve_delegation(self, ctx): 134 | return ctx.params.get("delegate", False) 135 | 136 | def resolve_placement(self, ctx): 137 | return types.Placement( 138 | types.Places.SAMPLES_GRID_ACTIONS, 139 | types.Button( 140 | label="Detect text in images", 141 | icon="/assets/icon_light.svg", 142 | dark_icon="/assets/icon.svg", 143 | light_icon="/assets/icon_light.svg", 144 | ), 145 | ) 146 | 147 | 
def resolve_input(self, ctx): 148 | inputs = types.Object() 149 | form_view = types.View( 150 | label="OCR", 151 | description=( 152 | "Run optical character recognition on your images to " 153 | "detect text with PyTesseract" 154 | ), 155 | ) 156 | _execution_mode(ctx, inputs) 157 | inputs.view_target(ctx) 158 | _handle_prediction_fields(ctx, inputs) 159 | return types.Property(inputs, view=form_view) 160 | 161 | def execute(self, ctx): 162 | dataset = ctx.dataset 163 | dataset.compute_metadata() 164 | 165 | view = ctx.target_view() 166 | 167 | word_preds_field, block_preds_field = _get_prediction_fields(ctx) 168 | text_field = ctx.params.get("text_field", None) 169 | 170 | with add_sys_path(os.path.dirname(os.path.abspath(__file__))): 171 | # pylint: disable=no-name-in-module,import-error 172 | from ocr_engine import get_ocr_detections 173 | 174 | for sample in view.iter_samples(autosave=True): 175 | word_dets, block_dets = get_ocr_detections( 176 | sample, text_field=text_field 177 | ) 178 | if word_preds_field: 179 | sample[word_preds_field] = word_dets 180 | if block_preds_field: 181 | sample[block_preds_field] = block_dets 182 | dataset.add_dynamic_sample_fields() 183 | ctx.ops.reload_dataset() 184 | 185 | 186 | def register(plugin): 187 | plugin.register(OCR) 188 | -------------------------------------------------------------------------------- /assets/icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /assets/icon_light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /fiftyone.yml: -------------------------------------------------------------------------------- 1 | fiftyone: 2 | version: ">=0.23.7" 3 | 
# fiftyone.yml (continued):
#   name: "@jacobmarks/pytesseract_ocr"
#   version: "1.0.5"
#   description: "Run optical character recognition with PyTesseract!"
#   url: "https://github.com/jacobmarks/pytesseract-ocr-plugin/blob/main/README.md"
#   operators:
#     - run_ocr_engine
# -----------------------------------------------------------------------------
# ocr_engine.py

from collections import defaultdict

from pytesseract import image_to_data, Output

import fiftyone as fo


def get_filepath(sample):
    """Returns the path to the sample's media on local disk.

    Prefers ``local_path`` (present when the media is cached locally from
    remote storage) and falls back to ``filepath``.
    """
    return (
        sample.local_path if hasattr(sample, "local_path") else sample.filepath
    )


def get_ocr_detections(sample, text_field=None):
    """Runs the Tesseract OCR engine on the given sample's image.

    Args:
        sample: a FiftyOne sample whose ``metadata`` (width/height) has been
            computed
        text_field (None): if provided, each detection's label is set to
            ``"text"`` and the recognized text is stored in this embedded
            attribute; otherwise the text itself is used as the label

    Returns:
        a ``(word_detections, block_detections)`` tuple of
        ``fiftyone.core.labels.Detections``
    """
    fp = get_filepath(sample)
    preds = image_to_data(fp, output_type=Output.DICT)

    levels = preds["level"]
    boxes = preds["left"], preds["top"], preds["width"], preds["height"]
    confidences = preds["conf"]
    texts = preds["text"]
    block_nums = preds["block_num"]
    par_nums = preds["par_num"]
    line_nums = preds["line_num"]
    word_nums = preds["word_num"]

    word_detections = []
    blocks = defaultdict(list)

    # Process each row of the Tesseract output; note that rows exist for
    # every hierarchy level (page/block/paragraph/line/word), not only words
    n = len(levels)
    w, h = sample.metadata.width, sample.metadata.height
    for i in range(n):
        bbox = boxes[0][i], boxes[1][i], boxes[2][i], boxes[3][i]
        # FiftyOne bounding boxes are relative [x, y, width, height] in [0, 1]
        normalized_bbox = [bbox[0] / w, bbox[1] / h, bbox[2] / w, bbox[3] / h]
        block_num = block_nums[i]

        word_det = fo.Detection(
            bounding_box=normalized_bbox,
            level=levels[i],
            paragraph=par_nums[i],
            line=line_nums[i],
            word=word_nums[i],
            block=block_num,
            # Cast defensively: some pytesseract versions report conf as
            # strings; structural (non-word) rows report -1
            confidence=float(confidences[i]),
        )
        if text_field:
            word_det[text_field] = texts[i]
            word_det.label = "text"
        else:
            word_det.label = texts[i]

        word_detections.append(word_det)

        # Group rows by block for the block-level detections below
        blocks[block_num].append((normalized_bbox, texts[i]))

    # Create one detection per block spanning all of its rows
    block_detections = []
    for block_num, entries in blocks.items():
        min_x = min(bbox[0] for bbox, _ in entries)
        min_y = min(bbox[1] for bbox, _ in entries)
        # Boxes are [x, y, w, h], so the right/bottom extents are x + w and
        # y + h — NOT the raw widths/heights, which the original code used
        # and which is only accidentally correct when Tesseract emits a
        # level-2 (block) row covering the whole block
        max_x = max(bbox[0] + bbox[2] for bbox, _ in entries)
        max_y = max(bbox[1] + bbox[3] for bbox, _ in entries)

        block_bbox = [min_x, min_y, max_x - min_x, max_y - min_y]

        # Structural (non-word) rows have empty text; skip them so the block
        # text doesn't accumulate spurious whitespace
        block_text = " ".join(t for _, t in entries if t.strip())

        block_det = fo.Detection(
            bounding_box=block_bbox,
            block=block_num,
        )
        if text_field:
            block_det[text_field] = block_text
            block_det.label = "text"
        else:
            block_det.label = block_text

        block_detections.append(block_det)

    return (
        fo.Detections(detections=word_detections),
        fo.Detections(detections=block_detections),
    )


# -----------------------------------------------------------------------------
# requirements.txt:
#   pillow
#   pytesseract
# -----------------------------------------------------------------------------