├── .gitignore
├── .pre-commit-config.yaml
├── README.md
├── __init__.py
├── assets
├── icon.svg
└── icon_light.svg
├── fiftyone.yml
├── ocr_engine.py
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.cython*
3 | *.egg
4 | *.pyc
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/asottile/blacken-docs
3 | rev: v1.12.0
4 | hooks:
5 | - id: blacken-docs
6 | additional_dependencies: [black==21.12b0]
7 | args: ["-l 79"]
8 | exclude: index.umd.js
9 | - repo: https://github.com/ambv/black
10 | rev: 22.3.0
11 | hooks:
12 | - id: black
13 | language_version: python3
14 | args: ["-l 79"]
15 | exclude: index.umd.js
16 | - repo: local
17 | hooks:
18 | - id: pylint
19 | name: pylint
20 | language: system
21 | files: \.py$
22 | entry: pylint
23 | args: ["--errors-only"]
24 | exclude: index.umd.js
25 | - repo: local
26 | hooks:
27 | - id: ipynb-strip
28 | name: ipynb-strip
29 | language: system
30 | files: \.ipynb$
31 | entry: jupyter nbconvert --clear-output --ClearOutputPreprocessor.enabled=True
32 | args: ["--log-level=ERROR"]
33 | - repo: https://github.com/pre-commit/mirrors-prettier
34 | rev: v2.6.2
35 | hooks:
36 | - id: prettier
37 | exclude: index.umd.js
38 | language_version: system
39 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## PyTesseract Optical Character Recognition Plugin
2 |
3 |
4 |
5 | ### Updates
6 |
7 | - **2023-10-19**: Added support for customizing prediction fields, and embedded field for OCR text.
8 |
9 | This plugin is a Python plugin that allows you to perform optical character
10 | recognition on documents using PyTesseract — the Python bindings for the
11 | Tesseract OCR engine!
12 |
13 | ## Watch On Youtube
14 | [](https://www.youtube.com/watch?v=jnNPGrM6Wr4&list=PLuREAXoPgT0RZrUaT0UpX_HzwKkoB-S9j&index=6)
15 |
16 | ## Installation
17 |
18 | ```shell
19 | fiftyone plugins download https://github.com/jacobmarks/pytesseract-ocr-plugin
20 | ```
21 |
22 | You will also need to install the plugin's requirements:
23 |
24 | ```shell
25 | pip install -r requirements.txt
26 | ```
27 |
28 | ## Operators
29 |
30 | ### `run_ocr_engine`
31 |
32 | - Runs the PyTesseract OCR engine on the documents in the dataset, converts the
33 | results to FiftyOne labels, and stores individual word predictions as well
34 | as block-level predictions on the dataset.
35 |
36 | ## Usage
37 |
38 | You can access the operator via the App's action menu, or by pressing the "`"
39 | key on your keyboard and selecting the operator from the dropdown menu.
40 |
41 | If you have a view loaded and/or samples selected, the operator will give you
42 | the option to run the OCR engine on only those samples or on the entire dataset.
43 |
44 | You can either choose to run the operator in the foreground, or to delegate the
45 | execution of the operator to a background job.
46 |
47 | 
48 |
49 | 💡 Once you've generated OCR predictions, you can search through them using the [Keyword Search plugin](https://github.com/jacobmarks/keyword-search-plugin)!
50 |
51 |
52 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | """PyTesseract OCR plugin.
2 |
3 | | Copyright 2017-2023, Voxel51, Inc.
4 | | `voxel51.com `_
5 | |
6 | """
7 |
8 | import os
9 |
10 | import fiftyone as fo
11 | from fiftyone.core.utils import add_sys_path
12 | import fiftyone.operators as foo
13 | from fiftyone.operators import types
14 |
15 |
16 | def _execution_mode(ctx, inputs):
17 | delegate = ctx.params.get("delegate", False)
18 |
19 | if delegate:
20 | description = "Uncheck this box to execute the operation immediately"
21 | else:
22 | description = "Check this box to delegate execution of this task"
23 |
24 | inputs.bool(
25 | "delegate",
26 | default=False,
27 | required=True,
28 | label="Delegate execution?",
29 | description=description,
30 | view=types.CheckboxView(),
31 | )
32 |
33 | if delegate:
34 | inputs.view(
35 | "notice",
36 | types.Notice(
37 | label=(
38 | "You've chosen delegated execution. Note that you must "
39 | "have a delegated operation service running in order for "
40 | "this task to be processed. See "
41 | "https://docs.voxel51.com/plugins/index.html#operators "
42 | "for more information"
43 | )
44 | ),
45 | )
46 |
47 |
48 | def _handle_prediction_fields(ctx, inputs):
49 | obj = types.Object()
50 | obj.bool(
51 | "store_word_preds",
52 | label="Store word predictions?",
53 | default=False,
54 | view=types.SwitchView(space=4),
55 | )
56 | obj.bool(
57 | "store_block_preds",
58 | label="Store block predictions?",
59 | default=True,
60 | view=types.SwitchView(space=4),
61 | )
62 | inputs.define_property("prediction_field_types", obj)
63 |
64 | pfts = ctx.params.get("prediction_field_types", {})
65 | store_word_preds = pfts.get("store_word_preds", False)
66 | store_block_preds = pfts.get("store_block_preds", True)
67 |
68 | if not store_word_preds and not store_block_preds:
69 | inputs.view(
70 | "warning",
71 | types.Warning(
72 | label="Not storing any predictions",
73 | description=(
74 | "You have chosen not to store any predictions. "
75 | "You must store at least one type of prediction to "
76 | "use the `OCR` operation. "
77 | ),
78 | ),
79 | )
80 |
81 | if store_word_preds:
82 | inputs.str(
83 | "word_predictions_field",
84 | label="Word predictions field",
85 | default="pt_word_predictions",
86 | description="The field in which to store the word predictions",
87 | required=True,
88 | )
89 |
90 | if store_block_preds:
91 | inputs.str(
92 | "block_predictions_field",
93 | label="Block predictions field",
94 | default="pt_block_predictions",
95 | description="The field in which to store the block predictions",
96 | required=True,
97 | )
98 |
99 | inputs.bool(
100 | "store_text_as_labels",
101 | label="Store text as labels?",
102 | default=True,
103 | view=types.SwitchView(space=4),
104 | )
105 |
106 | if not ctx.params.get("store_text_as_labels", True):
107 | inputs.str(
108 | "text_field",
109 | label="Text field",
110 | default="ocr_text",
111 | description="The embedded field in which to store the OCR text",
112 | required=True,
113 | )
114 |
115 |
116 | def _get_prediction_fields(ctx):
117 | word_preds_field = ctx.params.get("word_predictions_field", None)
118 | block_preds_field = ctx.params.get("block_predictions_field", None)
119 | return word_preds_field, block_preds_field
120 |
121 |
122 | class OCR(foo.Operator):
123 | @property
124 | def config(self):
125 | _config = foo.OperatorConfig(
126 | name="run_ocr_engine",
127 | label="OCR: run optical character recognition on your images",
128 | dynamic=True,
129 | )
130 | _config.icon = "/assets/icon_light.svg"
131 | return _config
132 |
133 | def resolve_delegation(self, ctx):
134 | return ctx.params.get("delegate", False)
135 |
136 | def resolve_placement(self, ctx):
137 | return types.Placement(
138 | types.Places.SAMPLES_GRID_ACTIONS,
139 | types.Button(
140 | label="Detect text in images",
141 | icon="/assets/icon_light.svg",
142 | dark_icon="/assets/icon.svg",
143 | light_icon="/assets/icon_light.svg",
144 | ),
145 | )
146 |
147 | def resolve_input(self, ctx):
148 | inputs = types.Object()
149 | form_view = types.View(
150 | label="OCR",
151 | description=(
152 | "Run optical character recognition on your images to "
153 | "detect text with PyTesseract"
154 | ),
155 | )
156 | _execution_mode(ctx, inputs)
157 | inputs.view_target(ctx)
158 | _handle_prediction_fields(ctx, inputs)
159 | return types.Property(inputs, view=form_view)
160 |
161 | def execute(self, ctx):
162 | dataset = ctx.dataset
163 | dataset.compute_metadata()
164 |
165 | view = ctx.target_view()
166 |
167 | word_preds_field, block_preds_field = _get_prediction_fields(ctx)
168 | text_field = ctx.params.get("text_field", None)
169 |
170 | with add_sys_path(os.path.dirname(os.path.abspath(__file__))):
171 | # pylint: disable=no-name-in-module,import-error
172 | from ocr_engine import get_ocr_detections
173 |
174 | for sample in view.iter_samples(autosave=True):
175 | word_dets, block_dets = get_ocr_detections(
176 | sample, text_field=text_field
177 | )
178 | if word_preds_field:
179 | sample[word_preds_field] = word_dets
180 | if block_preds_field:
181 | sample[block_preds_field] = block_dets
182 | dataset.add_dynamic_sample_fields()
183 | ctx.ops.reload_dataset()
184 |
185 |
186 | def register(plugin):
187 | plugin.register(OCR)
188 |
--------------------------------------------------------------------------------
/assets/icon.svg:
--------------------------------------------------------------------------------
1 |
14 |
--------------------------------------------------------------------------------
/assets/icon_light.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/fiftyone.yml:
--------------------------------------------------------------------------------
1 | fiftyone:
2 | version: ">=0.23.7"
3 | name: "@jacobmarks/pytesseract_ocr"
4 | version: "1.0.5"
5 | description: "Run optical character recognition with PyTesseract!"
6 | url: "https://github.com/jacobmarks/pytesseract-ocr-plugin/blob/main/README.md"
7 | operators:
8 | - run_ocr_engine
9 |
--------------------------------------------------------------------------------
/ocr_engine.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from pytesseract import image_to_data, Output
3 |
4 | import fiftyone as fo
5 |
6 |
7 | def get_filepath(sample):
8 | return (
9 | sample.local_path if hasattr(sample, "local_path") else sample.filepath
10 | )
11 |
12 |
13 | def get_ocr_detections(sample, text_field=None):
14 | fp = get_filepath(sample)
15 | preds = image_to_data(fp, output_type=Output.DICT)
16 |
17 | levels = preds["level"]
18 | boxes = preds["left"], preds["top"], preds["width"], preds["height"]
19 | confidences = preds["conf"]
20 | texts = preds["text"]
21 | block_nums = preds["block_num"]
22 | par_nums = preds["par_num"]
23 | line_nums = preds["line_num"]
24 | word_nums = preds["word_num"]
25 |
26 | word_detections = []
27 | blocks = defaultdict(list)
28 |
29 | # Process each word to create word-level detections and
30 | # group by block for block-level detections
31 | n = len(levels)
32 | w, h = sample.metadata.width, sample.metadata.height
33 | for i in range(n):
34 | bbox = boxes[0][i], boxes[1][i], boxes[2][i], boxes[3][i]
35 | normalized_bbox = [bbox[0] / w, bbox[1] / h, bbox[2] / w, bbox[3] / h]
36 | block_num = block_nums[i]
37 |
38 | # Word-level detection
39 | word_det = fo.Detection(
40 | bounding_box=normalized_bbox,
41 | level=levels[i],
42 | paragraph=par_nums[i],
43 | line=line_nums[i],
44 | word=word_nums[i],
45 | block=block_num,
46 | confidence=confidences[i],
47 | )
48 | if text_field:
49 | word_det[text_field] = texts[i]
50 | word_det.label = "text"
51 | else:
52 | word_det.label = texts[i]
53 | word_detections.append(word_det)
54 |
55 | # Group by block for later processing
56 | blocks[block_num].append((normalized_bbox, texts[i]))
57 |
58 | # Create block-level detections
59 | block_detections = []
60 | for block_num, detections in blocks.items():
61 | min_x = min(det[0][0] for det in detections)
62 | min_y = min(det[0][1] for det in detections)
63 | max_x = max(det[0][2] for det in detections)
64 | max_y = max(det[0][3] for det in detections)
65 |
66 | block_bbox = [min_x, min_y, max_x, max_y]
67 | block_text = " ".join(det[1] for det in detections)
68 |
69 | block_det = fo.Detection(
70 | bounding_box=block_bbox,
71 | block=block_num,
72 | )
73 | if text_field:
74 | block_det[text_field] = block_text
75 | block_det.label = "text"
76 | else:
77 | block_det.label = block_text
78 | block_detections.append(block_det)
79 |
80 | word_detections = fo.Detections(detections=word_detections)
81 | block_detections = fo.Detections(detections=block_detections)
82 |
83 | return word_detections, block_detections
84 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pillow
2 | pytesseract
--------------------------------------------------------------------------------