├── requirements.txt
├── .gitignore
├── resources
│   └── ocr-demo.gif
├── models.lst
├── setup_ocv_vars.bat
├── setup_ocv_vars.sh
├── CMakeLists.txt
├── utils
│   └── codec.py
├── README.md
├── text_detection_postprocess.cpp
├── postprocess.cpp
├── LICENSE
├── handwritten-japanese-ocr-touch-panel-demo-cpp.py
├── handwritten-japanese-ocr-touch-panel-demo.py
└── data
    └── kondate_nakayosi_char_list.txt

/requirements.txt:
--------------------------------------------------------------------------------
opencv-python
numpy
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
intel
*.pyc
*.exp
*.lib
*.pyd
build
--------------------------------------------------------------------------------
/resources/ocr-demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yas-sim/handwritten-japanese-ocr/HEAD/resources/ocr-demo.gif
--------------------------------------------------------------------------------
/models.lst:
--------------------------------------------------------------------------------
# This file can be used with the --list option of the model downloader.
handwritten-japanese-recognition-0001
text-detection-0003
--------------------------------------------------------------------------------
/setup_ocv_vars.bat:
--------------------------------------------------------------------------------
set OCV_INSTALL_DIR=%INTEL_OPENVINO_DIR%/extras/opencv
set OpenCV_DIR=%OCV_INSTALL_DIR%/cmake
:: set LD_LIBRARY_PATH=%OCV_INSTALL_DIR%/lib:%LD_LIBRARY_PATH%
--------------------------------------------------------------------------------
/setup_ocv_vars.sh:
--------------------------------------------------------------------------------
export OCV_INSTALL_DIR=$INTEL_OPENVINO_DIR/extras/opencv
export OpenCV_DIR=$OCV_INSTALL_DIR/cmake
export LD_LIBRARY_PATH=$OCV_INSTALL_DIR/lib:$LD_LIBRARY_PATH
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.10)

project(text_detection_postprocess)

find_package(InferenceEngine REQUIRED)
find_package(OpenCV 4 REQUIRED COMPONENTS core imgproc highgui)
find_package(PythonInterp 3.6 REQUIRED)
find_package(PythonLibs "${PYTHON_VERSION_STRING}" EXACT REQUIRED)
execute_process(
    COMMAND "${PYTHON_EXECUTABLE}" -c "import numpy; print(numpy.get_include())"
    OUTPUT_VARIABLE NUMPY_INCLUDE_DIR
    OUTPUT_STRIP_TRAILING_WHITESPACE
    RESULT_VARIABLE NUMPY_NOT_FOUND
)
if(NUMPY_NOT_FOUND)
    message(FATAL_ERROR "NumPy headers not found")
endif()

set(CMAKE_CXX_FLAGS "-ggdb -D_DEBUG")
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

set(target_name text_detection_postprocess)
add_library(${target_name} MODULE text_detection_postprocess.cpp postprocess.cpp)

target_include_directories(${target_name} PRIVATE src/ ${InferenceEngine_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR})
target_link_libraries(${target_name} ${PYTHON_LIBRARIES} ${InferenceEngine_LIBRARIES} opencv_core opencv_imgproc opencv_highgui)
set_target_properties(${target_name} PROPERTIES PREFIX "")
if(WIN32)
    set_target_properties(${target_name} PROPERTIES SUFFIX ".pyd")
endif()
--------------------------------------------------------------------------------
/utils/codec.py:
--------------------------------------------------------------------------------
import numpy as np


class CTCCodec(object):
    """ Convert between text-label and text-index """
    def __init__(self, characters):
        # characters (str): set of the possible characters.
        dict_character = list(characters)

        self.dict = {}
        for i, char in enumerate(dict_character):
            # NOTE: 0 is reserved for the 'blank' token required by CTCLoss
            self.dict[char] = i + 1

        # dummy '[blank]' token for CTCLoss (index 0)
        self.characters = ['[blank]'] + dict_character

    def decode(self, preds):
        """ Convert text-index into text-label. """
        texts = []
        index = 0
        # Select the max probability (greedy decoding), then decode index to character
        preds_index = np.argmax(preds, 2)
        preds_index = preds_index.transpose(1, 0)
        preds_index_reshape = preds_index.reshape(-1)
        preds_sizes = np.array([preds_index.shape[1]] * preds_index.shape[0])

        for l in preds_sizes:
            t = preds_index_reshape[index:index + l]

            # NOTE: t might be zero size
            if t.shape[0] == 0:
                continue

            char_list = []
            for i in range(l):
                # Remove repeated characters and blanks.
                if t[i] != 0 and (not (i > 0 and t[i - 1] == t[i])):
                    char_list.append(self.characters[t[i]])
            text = ''.join(char_list)
            texts.append(text)

            index += l

        return texts
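
# Minimal usage sketch (illustrative only, not part of the demo): decoding
# dummy logits of shape (timesteps, batch, classes), which is the layout the
# argmax/transpose above expects. Class index 0 is the CTC blank.
if __name__ == '__main__':
    codec = CTCCodec('abc')                    # characters map to indices 1..3
    preds = np.zeros((5, 1, 4), np.float32)
    for t, cls in enumerate([1, 1, 0, 2, 3]):  # 'a', 'a', blank, 'b', 'c'
        preds[t, 0, cls] = 1.0
    print(codec.decode(preds))                 # -> ['abc'] (repeats and blanks removed)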
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Handwritten Japanese Deep Learning Based OCR with Touch Panel Demo
This is a handwritten Japanese OCR demo program based on a sample program (`handwritten-japanese-recognition.py`) from the [**Intel(r) Distribution of OpenVINO(tm) Toolkit 2020.2**](https://software.intel.com/en-us/openvino-toolkit).
The demo has a simple UI: on a touch-panel PC you can write Japanese characters on the screen with your fingertip and see how the Japanese OCR performs.
The demo uses a pre-trained text-detection DL model (`text-detection-0003`) from the Intel(r) [Open Model Zoo](https://github.com/opencv/open_model_zoo) to detect text regions on the canvas, and then runs a DL-based OCR model on those regions.

This is a handwritten Japanese OCR demo. It is a major rewrite of the `handwritten-japanese-recognition.py` demo bundled with the [**Intel(r) Distribution of OpenVINO(tm) toolkit 2020.2**](https://software.intel.com/en-us/openvino-toolkit).
A simple UI is provided, so if you have a PC with a touch panel you can run a demo where you write characters with your finger and have them recognized.
Text regions are detected automatically with the text-detection DL model (`text-detection-0003`) from the Intel(r) [Open Model Zoo](https://github.com/opencv/open_model_zoo), and character recognition is performed with a DL OCR model.

[NEW 03-Apr-2022] Added a C++ postprocess version. **Text-detection postprocess performance improved by ~20x.** (`handwritten-japanese-ocr-touch-panel-demo-cpp.py`)
[NEW 03-Apr-2022] Migrated the OpenVINO API to API 2.0 for both the native Python version and the C++ postprocess version (OpenVINO 2022.1 support).

![OCR demo](./resources/ocr-demo.gif)

### Required DL Models to Run This Demo

The demo expects the following models in the Intermediate Representation (IR) format:

* handwritten-japanese-recognition-0001
* text-detection-0003

You can download these models from the OpenVINO [Open Model Zoo](https://github.com/opencv/open_model_zoo).
The `models.lst` file lists the models appropriate for this demo; they can be obtained via the `Model Downloader`.
For more information about the `Model Downloader`, see [here](../../../tools/downloader/README.md).

## How to Run

(This assumes you have successfully installed and set up OpenVINO 2022.1. If you haven't, go to the OpenVINO web page and follow the [*Get Started*](https://software.intel.com/en-us/openvino-toolkit/documentation/get-started) guide.)

### 1. Install dependencies
The demo depends on:
- opencv-python
- numpy

To install all the required Python modules you can use:

``` sh
(Linux) python3 -m pip install -r requirements.txt
(Win10) python -m pip install -r requirements.txt
```

### 2. Download DL models from OMZ
Use the `Model Downloader` to download the required models.
``` sh
(Linux/Win10) omz_downloader --list models.lst
```

### 3. Build C++ postprocess Python module (Optional)
Prerequisites: To build the C++ postprocess Python module, you need the OpenVINO and OpenCV development packages installed, and both must be discoverable by CMake.
Please refer to the [OpenVINO install guide](https://docs.openvino.ai/latest/openvino_docs_install_guides_overview.html) for details.
```sh
(Linux)
source setup_ocv_vars.sh
./build.sh

(Win10)
setup_ocv_vars.bat
build.bat
```

### 4. Run the demo app
This program doesn't take any command-line arguments. All file names and paths are hard-coded in the source code.
``` sh
(Linux) python3 handwritten-japanese-ocr-touch-panel-demo.py
(Win10) python handwritten-japanese-ocr-touch-panel-demo.py
```

Please make sure the following files are placed in the proper locations.
```
./
+ handwritten-japanese-ocr-touch-panel-demo.py
+ data
| + kondate_nakayosi_char_list.txt
+ intel
| + handwritten-japanese-recognition-0001
| | + FP16
| | | + handwritten-japanese-recognition-0001.xml
| | | + handwritten-japanese-recognition-0001.bin
| + text-detection-0003
| | + FP16
| | | + text-detection-0003.xml
| | | + text-detection-0003.bin
```

## Demo Output
The application prints the recognized text strings to the terminal.
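
### Calling the C++ postprocess module directly (reference)
The compiled `text_detection_postprocess` module exposes a single `postprocess()` function, which the demo imports with `from text_detection_postprocess import postprocess` (see `text_detection_postprocess.cpp` for the argument parsing). The sketch below is illustrative only: the two arrays must be the raw NHWC float32 outputs of `text-detection-0003` (link logits `1x192x320x16`, segmentation logits `1x192x320x2`, per the shape comment in `postprocess.cpp`), and the two threshold values are placeholder assumptions, not values taken from the demo source.

``` python
import numpy as np
from text_detection_postprocess import postprocess

link_logits = np.zeros((1, 192, 320, 16), np.float32)  # stand-in for a real model output
segm_logits = np.zeros((1, 192, 320, 2), np.float32)   # stand-in for a real model output

# postprocess(link, segm, input_w, input_h, link_conf_threshold, cls_conf_threshold)
rects = postprocess(link_logits, segm_logits, 1280, 768, 0.85, 0.8)

# Each row of the returned float32 array is one detected text region:
# center x, center y, width, height, rotation angle
for cx, cy, w, h, angle in rects:
    print(cx, cy, w, h, angle)
```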

## Tested Environment
- Windows 11 x64
- Intel(r) Distribution of OpenVINO(tm) toolkit 2022.1
- Python 3.7 x64

## See Also
* [Using Open Model Zoo demos](../../README.md)
* [Model Optimizer](https://docs.openvinotoolkit.org/latest/_docs_MO_DG_Deep_Learning_Model_Optimizer_DevGuide.html)
* [Model Downloader](../../../tools/downloader/README.md)
--------------------------------------------------------------------------------
/text_detection_postprocess.cpp:
--------------------------------------------------------------------------------
#define PY_SSIZE_T_CLEAN
#include <Python.h>

#include <vector>
#include <string>
#include <iostream>

#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#include "numpy/arrayobject.h"

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

std::vector<cv::RotatedRect> postProcess(
    const float *link_data_pointer, const std::vector<size_t> &link_shape, float link_conf_threshold,
    const float *cls_data_pointer, const std::vector<size_t> &cls_shape, float cls_conf_threshold,
    const int input_w, const int input_h);


extern "C" {

// Bridge function to the actual postprocess function
static PyObject* postprocess_bridge(PyObject* self, PyObject* args) {
    PyArrayObject *link_logits, *segm_logits;
    int input_w, input_h;
    float cls_conf_threshold, link_conf_threshold;

    // Parse parameters
    if(!PyArg_ParseTuple(args,
            "O!O!iiff",                 // link_logits, segm_logits, input_w, input_h, link_conf, cls_conf
            &PyArray_Type, &link_logits,
            &PyArray_Type, &segm_logits,
            &input_w, &input_h,         // Input image width and height
            &link_conf_threshold, &cls_conf_threshold)) {   // Arguments are 2 NumPy objects, 2 ints, and 2 floats
        return nullptr;
    }

    std::vector<size_t> link_shape;
    size_t l_ndims = PyArray_NDIM(link_logits);      // Number of dimensions
    npy_intp* l_shape = PyArray_SHAPE(link_logits);  // Shape
    for(size_t i=0; i<l_ndims; i++) link_shape.push_back(l_shape[i]);

    const float* link_data_pointer = static_cast<float*>(PyArray_DATA(link_logits));

    std::vector<size_t> cls_shape;
    size_t s_ndims = PyArray_NDIM(segm_logits);      // Number of dimensions
    npy_intp* s_shape = PyArray_SHAPE(segm_logits);  // Shape
    for(size_t i=0; i<s_ndims; i++) cls_shape.push_back(s_shape[i]);

    const float* cls_data_pointer = static_cast<float*>(PyArray_DATA(segm_logits));

    auto rects = postProcess(link_data_pointer, link_shape, link_conf_threshold,
                             cls_data_pointer, cls_shape, cls_conf_threshold,
                             input_w, input_h);

    int out_size = rects.size();

    // Create a NumPy object to store the result
    PyObject *output;
    PyArray_Descr* descr = PyArray_DescrFromType(NPY_FLOAT32);
    std::vector<npy_intp> output_shape { out_size, 2+2+1 };  // Shape ( center(2)+size(2)+angle(1) )
    output = PyArray_Zeros(output_shape.size(), output_shape.data(), descr, 0);
    float* output_buf = static_cast<float*>(PyArray_DATA(reinterpret_cast<PyArrayObject*>(output)));  // Obtain pointer to the data

    for(int i=0; i<out_size; i++) {
        output_buf[i*5+0] = rects[i].center.x;
        output_buf[i*5+1] = rects[i].center.y;
        output_buf[i*5+2] = rects[i].size.width;
        output_buf[i*5+3] = rects[i].size.height;
        output_buf[i*5+4] = rects[i].angle;
    }

    return output;
}

// Function definition table
static PyMethodDef method_table[] = {
    {"postprocess", reinterpret_cast<PyCFunction>(postprocess_bridge), METH_VARARGS, "C++ version of postprocess for text detection model (text-detection-0003)"},
    {NULL, NULL, 0, NULL}
};

// Module definition table
PyModuleDef text_detection_postprocess_module = {
    PyModuleDef_HEAD_INIT,
    "text_detection_postprocess",   // m_name: Module Name
    "C++ version of text_detection postprocess. "
    "Supports text-detection-0003 model of OpenVINO OMZ",  // m_doc : Docstring for the module
    -1,
    method_table
};

// Initialize and register module function
// Function name must be 'PyInit_'+module name
// This function must be the only *non-static* function in the source code
PyMODINIT_FUNC PyInit_text_detection_postprocess(void) {
    import_array();   // Required to receive NumPy objects as arguments
    if (PyErr_Occurred()) {
        return nullptr;
    }
    return PyModule_Create(&text_detection_postprocess_module);
}

} // extern "C"
--------------------------------------------------------------------------------
/postprocess.cpp:
--------------------------------------------------------------------------------
#include <algorithm>
#include <cmath>
#include <iostream>
#include <stdexcept>
#include <unordered_map>
#include <vector>

#include <opencv2/opencv.hpp>
#include <openvino/openvino.hpp>

// Original source - OpenVINO Open Model Zoo text_detection_demo / text_detection.cpp

void softmax(std::vector<float>* data) {
    auto& rdata = *data;
    const size_t last_dim = 2;
    for (size_t i = 0; i < rdata.size(); i += last_dim) {
        float m = std::max(rdata[i], rdata[i+1]);
        rdata[i] = std::exp(rdata[i] - m);
        rdata[i + 1] = std::exp(rdata[i + 1] - m);
        float s = rdata[i] + rdata[i + 1];
        rdata[i] /= s;
        rdata[i + 1] /= s;
    }
}

std::vector<float> transpose4d(
    const std::vector<float>& data, const std::vector<size_t>& shape, const std::vector<size_t>& axes)
{
    if (shape.size() != axes.size())
        throw std::runtime_error("Shape and axes must have the same dimension.");

    for (size_t a : axes) {
        if (a >= shape.size())
            throw std::runtime_error("Axis must be less than dimension of shape.");
    }

    size_t total_size = shape[0] * shape[1] * shape[2] * shape[3];

    std::vector<size_t> steps = {
        shape[axes[1]] * shape[axes[2]] * shape[axes[3]],
        shape[axes[2]] * shape[axes[3]], shape[axes[3]],
        1
    };

    size_t source_data_idx = 0;
    std::vector<float> new_data(total_size, 0);

    std::vector<size_t> ids(shape.size());
    for (ids[0] = 0; ids[0] < shape[0]; ids[0]++) {
        for (ids[1] = 0; ids[1] < shape[1]; ids[1]++) {
            for (ids[2] = 0; ids[2] < shape[2]; ids[2]++) {
                for (ids[3] = 0; ids[3] < shape[3]; ids[3]++) {
                    size_t new_data_idx = ids[axes[0]] * steps[0] + ids[axes[1]] * steps[1] +
                        ids[axes[2]] * steps[2] + ids[axes[3]] * steps[3];
                    new_data[new_data_idx] = data[source_data_idx++];
                }
            }
        }
    }
    return new_data;
}

std::vector<float> sliceAndGetSecondChannel(const std::vector<float>& data) {
    std::vector<float> new_data(data.size() / 2, 0);
    for (size_t i = 0; i < data.size() / 2; i++) {
        new_data[i] = data[i * 2 + 1];
    }
    return new_data;
}

std::vector<cv::RotatedRect> maskToBoxes(const cv::Mat& mask, float min_area, float min_height, const cv::Size& image_size)
{
    std::vector<cv::RotatedRect> bboxes;
    double min_val;
    double max_val;
    cv::minMaxLoc(mask, &min_val, &max_val);
    int max_bbox_idx = static_cast<int>(max_val);
    cv::Mat resized_mask;
    cv::resize(mask, resized_mask, image_size, 0, 0, cv::INTER_NEAREST);

    for (int i = 1; i <= max_bbox_idx; i++) {
        cv::Mat bbox_mask = resized_mask == i;
        std::vector<std::vector<cv::Point>> contours;

        cv::findContours(bbox_mask, contours, cv::RETR_CCOMP, cv::CHAIN_APPROX_SIMPLE);
        if (contours.empty())
            continue;
        cv::RotatedRect r = cv::minAreaRect(contours[0]);
        if (std::min(r.size.width, r.size.height) < min_height)
            continue;
        if (r.size.area() < min_area)
            continue;
        bboxes.emplace_back(r);
    }
    return bboxes;
}

std::vector<cv::RotatedRect> coordToBoxes(
    const float* coords, size_t coords_size, float min_area, float min_height,
    const cv::Size& input_shape, const cv::Size& image_size)
{
    std::vector<cv::RotatedRect> bboxes;
    int num_boxes = coords_size / 5;
    float x_scale = image_size.width / float(input_shape.width);
    float y_scale = image_size.height / float(input_shape.height);

    for (int i = 0; i < num_boxes; i++) {
        const float* prediction = &coords[i * 5];
        float confidence = prediction[4];
        if (confidence < std::numeric_limits<float>::epsilon())
            break;

        // predictions are sorted the way that all insignificant boxes are
        // grouped together
        cv::Point2f center = cv::Point2f((prediction[0] + prediction[2]) / 2 * x_scale,
                                         (prediction[1] + prediction[3]) / 2 * y_scale);
        cv::Size2f size = cv::Size2f((prediction[2] - prediction[0]) * x_scale,
                                     (prediction[3] - prediction[1]) * y_scale);
        cv::RotatedRect rect = cv::RotatedRect(center, size, 0);

        if (rect.size.area() < min_area)
            continue;

        bboxes.push_back(rect);
    }

    return bboxes;
}

int findRoot(int point, std::unordered_map<int, int>* group_mask) {
    int root = point;
    bool update_parent = false;
    while (group_mask->at(root) != -1) {
        root = group_mask->at(root);
        update_parent = true;
    }
    if (update_parent) {
        (*group_mask)[point] = root;
    }
    return root;
}

void join(int p1, int p2, std::unordered_map<int, int>* group_mask) {
    int root1 = findRoot(p1, group_mask);
    int root2 = findRoot(p2, group_mask);
    if (root1 != root2) {
        (*group_mask)[root1] = root2;
    }
}

cv::Mat get_all(const std::vector<cv::Point>& points, int w, int h, std::unordered_map<int, int>* group_mask)
{
    std::unordered_map<int, int> root_map;

    cv::Mat mask(h, w, CV_32S, cv::Scalar(0));
    for (const auto& point : points) {
        int point_root = findRoot(point.x + point.y * w, group_mask);
        if (root_map.find(point_root) == root_map.end()) {
            root_map.emplace(point_root, static_cast<int>(root_map.size() + 1));
        }
        mask.at<int>(point.x + point.y * w) = root_map[point_root];
        if(root_map[point_root] > 50000) {
            std::cout << root_map[point_root] << std::endl;
        }
    }

    return mask;
}
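
// The three helpers above form a minimal union-find (disjoint-set) structure:
// group_mask maps each pixel index to its parent (-1 marks a root), findRoot
// compresses the queried point's path while walking up to the root, join
// merges two components, and get_all relabels every connected component with
// a compact 1-based id so that maskToBoxes can turn each component into one
// rotated rectangle.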

cv::Mat decodeImageByJoin(
    const std::vector<float>& cls_data, const std::vector<size_t>& cls_data_shape,
    const std::vector<float>& link_data, const std::vector<size_t>& link_data_shape,
    float cls_conf_threshold, float link_conf_threshold) {
    int h = cls_data_shape[ov::layout::height_idx({"NHWC"})];
    int w = cls_data_shape[ov::layout::width_idx({"NHWC"})];

    std::vector<uchar> pixel_mask(h * w, 0);
    std::unordered_map<int, int> group_mask;
    std::vector<cv::Point> points;
    for (size_t i = 0; i < pixel_mask.size(); i++) {
        pixel_mask[i] = cls_data[i] >= cls_conf_threshold;
        if (pixel_mask[i]) {
            points.emplace_back(i % w, i / w);
            group_mask[i] = -1;
        }
    }

    std::vector<uchar> link_mask(link_data.size(), 0);
    for (size_t i = 0; i < link_mask.size(); i++) {
        link_mask[i] = link_data[i] >= link_conf_threshold;
    }

    size_t neighbours = size_t(link_data_shape[ov::layout::channels_idx({"NHWC"})]);
    for (const auto& point : points) {
        size_t neighbour = 0;
        for (int ny = point.y - 1; ny <= point.y + 1; ny++) {
            for (int nx = point.x - 1; nx <= point.x + 1; nx++) {
                if (nx == point.x && ny == point.y)
                    continue;

                if (nx >= 0 && nx < w && ny >= 0 && ny < h) {
                    uchar pixel_value = pixel_mask[size_t(ny) * size_t(w) + size_t(nx)];
                    uchar link_value = link_mask[
                        (size_t(point.y) * size_t(w) + size_t(point.x)) * neighbours + neighbour];
                    if (pixel_value && link_value) {
                        join(point.x + point.y * w, nx + ny * w, &group_mask);
                    }
                }
                neighbour++;
            }
        }
    }

    return get_all(points, w, h, &group_mask);
}

// text-detection-0003
// input  = BHWC, 1,768,1280,3
// output = link - model/link_logits_/add, 1,192,320,16
//          segm - model/segm_logits/add,  1,192,320,2

std::vector<cv::RotatedRect> postProcess(
    const float *link_data_pointer, const std::vector<size_t> &link_shape, float link_conf_threshold,
    const float *cls_data_pointer, const std::vector<size_t> &cls_shape, float cls_conf_threshold,
    const int input_w, const int input_h)
{
    const int kMinArea = 300;
    const int kMinHeight = 10;
    cv::Size image_size(input_w, input_h);

    std::vector<cv::RotatedRect> rects;

    // PostProcessing for PixelLink Text Detection model
    size_t link_data_size = link_shape[0] * link_shape[1] * link_shape[2] * link_shape[3];
    std::vector<float> link_data(link_data_pointer, link_data_pointer + link_data_size);
    softmax(&link_data);
    link_data = sliceAndGetSecondChannel(link_data);
    std::vector<size_t> new_link_data_shape {link_shape[0], link_shape[1], link_shape[2], link_shape[3]/2};

    size_t cls_data_size = cls_shape[0] * cls_shape[1] * cls_shape[2] * cls_shape[3];
    std::vector<float> cls_data(cls_data_pointer, cls_data_pointer + cls_data_size);
    softmax(&cls_data);
    cls_data = sliceAndGetSecondChannel(cls_data);
    std::vector<size_t> new_cls_data_shape {cls_shape[0], cls_shape[1], cls_shape[2], cls_shape[3]/2};

    cv::Mat mask = decodeImageByJoin(
        cls_data, new_cls_data_shape, link_data, new_link_data_shape, cls_conf_threshold, link_conf_threshold);

    rects = maskToBoxes(
        mask, static_cast<float>(kMinArea), static_cast<float>(kMinHeight), image_size);

    return rects;
}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------
/handwritten-japanese-ocr-touch-panel-demo-cpp.py:
--------------------------------------------------------------------------------
"""
Handwritten Japanese OCR demo program
Based on a sample program from OpenVINO 2020.2 (handwritten-japanese-recognition-demo.py)
"""

"""
Copyright (c) 2020 Intel Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
      http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import sys
import time
import math
import logging as log
from argparse import ArgumentParser, SUPPRESS

import cv2
import numpy as np
from functools import reduce

from PIL import ImageFont, ImageDraw, Image

from text_detection_postprocess import postprocess

from openvino.preprocess import PrePostProcessor, ResizeAlgorithm
from openvino.runtime import AsyncInferQueue, Core, InferRequest, Layout, Type
from utils.codec import CTCCodec

# Canvas size is the same as the input size of the text detection model (to omit resizing before text area inference)
_canvas_x = 1280
_canvas_y = 768


# -----------------------------------------------------------------

def get_characters(char_file):
    with open(char_file, 'r', encoding='utf-8') as f:
        return ''.join(line.strip('\n') for line in f)


def preprocess_input(src, height, width):
    src = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY)
    ratio = float(src.shape[1]) / float(src.shape[0])
    tw = int(height * ratio)

    rsz = cv2.resize(src, (tw, height), interpolation=cv2.INTER_CUBIC).astype(np.float32)
    outimg = np.full((height, width), 255., np.float32)
    rsz_h, rsz_w = rsz.shape
    outimg[:rsz_h, :rsz_w] = rsz
    cv2.imshow('OCR input image', outimg)

    outimg = np.reshape(outimg, (1, height, width))
    return outimg

# -----------------------------------------------------------------

def topLeftPoint(points):
    big_number = 1e10
    _X = 0
    _Y = 1
    most_left = [big_number, big_number]
    almost_most_left = [big_number, big_number]
    most_left_idx = -1
    almost_most_left_idx = -1

    for i, point in enumerate(points):
        px, py = point
        if most_left[_X] > px:
            if most_left[_X] < big_number:
                almost_most_left = most_left
                almost_most_left_idx = most_left_idx
            most_left = [px, py]
            most_left_idx = i
        if almost_most_left[_X] > px and [px, py] != most_left:
            almost_most_left = [px, py]
            almost_most_left_idx = i
    if almost_most_left[_Y] < most_left[_Y]:
        most_left = almost_most_left
        most_left_idx = almost_most_left_idx
    return most_left_idx, most_left


def softmax_channel(data):
    for i in range(0, len(data), 2):
        m = max(data[i], data[i+1])
        data[i] = math.exp(data[i] - m)
        data[i+1] = math.exp(data[i+1] - m)
        s = data[i] + data[i+1]
        data[i] /= s
        data[i+1] /= s
    return data


def decodeImageByJoin(segm_data, segm_data_shape, link_data, link_data_shape,
                      segm_conf_thresh, link_conf_thresh):
    h = int(segm_data_shape[1])
    w = int(segm_data_shape[2])
    pixel_mask = np.full((h*w,), False, dtype=bool)
    group_mask = {}
    points = []
    for i in range(len(pixel_mask)):
        if segm_data[i] > segm_conf_thresh:
            pixel_mask[i] = True
            points.append((i%w, i//w))
            group_mask[i] = -1
        else:
            pixel_mask[i] = False

    link_mask = np.array([ ld >= link_conf_thresh for ld in link_data ])

    neighbours = int(link_data_shape[3])
    for px, py in points:
        neighbor = 0
        for ny in range(py-1, py+1+1):
            for nx in range(px-1, px+1+1):
                if nx == px and ny == py:
                    continue
                if nx < 0 or nx >= w or ny < 0 or ny >= h:
                    continue
                pixel_value = pixel_mask[ny*w + nx]
                link_value = link_mask[(py*w + px)*neighbours + neighbor]
                if pixel_value and link_value:
                    join(px+py*w, nx+ny*w, group_mask)
                neighbor += 1
    return get_all(points, w, h, group_mask)
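
# Note on the link-map layout used above: link_mask holds flattened NHWC data,
# i.e. `neighbours` (8) link scores per pixel, so the score of pixel (px, py)
# toward its k-th neighbour lives at index (py*w + px)*neighbours + k.
# Neighbours are enumerated in raster order around the pixel (skipping the
# pixel itself), matching the C++ implementation in postprocess.cpp.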

def maskToBoxes(mask, min_area, min_height, image_size):
    _X = 0
    _Y = 1
    bboxes = []
    min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(mask)
    max_bbox_idx = int(max_val)
    resized_mask = cv2.resize(mask, image_size, interpolation=cv2.INTER_NEAREST)

    for i in range(1, max_bbox_idx+1):
        bbox_mask = np.where(resized_mask == i, 255, 0).astype(np.uint8)
        contours, hierarchy = cv2.findContours(bbox_mask, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        if len(contours) == 0:
            continue
        center, size, angle = cv2.minAreaRect(contours[0])
        if min(size[_X], size[_Y]) < min_height:
            continue
        if size[_X]*size[_Y] < min_area:
            continue
        bboxes.append((center, size, angle))
    return bboxes


def text_detection_postprocess(link, segm, image_size, segm_conf_thresh, link_conf_thresh):
    _N = 0
    _C = 1
    _H = 2
    _W = 3
    kMinArea = 300
    kMinHeight = 10

    link_shape = link.shape
    link_data_size = reduce(lambda a, b: a*b, link_shape)
    link_data = link.transpose((_N, _H, _W, _C))
    link_data = link_data.flatten()
    link_data = softmax_channel(link_data)
    link_data = link_data.reshape((-1, 2))[:, 1]
    new_link_data_shape = [ link_shape[0], link_shape[2], link_shape[3], link_shape[1]//2 ]

    segm_shape = segm.shape
    segm_data_size = reduce(lambda a, b: a*b, segm_shape)
    segm_data = segm.transpose((_N, _H, _W, _C))
    segm_data = segm_data.flatten()
    segm_data = softmax_channel(segm_data)
    segm_data = segm_data.reshape((-1, 2))[:, 1]
    new_segm_data_shape = [ segm_shape[0], segm_shape[2], segm_shape[3], segm_shape[1]//2 ]

    mask = decodeImageByJoin(segm_data, new_segm_data_shape, link_data, new_link_data_shape,
                             segm_conf_thresh, link_conf_thresh)
    rects = maskToBoxes(mask, kMinArea, kMinHeight, image_size)

    return rects



# ----------------------------------------------------------------------------

def topLeftPoint(points):
    big_number = 1e10
    _X = 0
    _Y = 1
    most_left = [big_number, big_number]
    almost_most_left = [big_number, big_number]
    most_left_idx = -1
    almost_most_left_idx = -1

    for i, point in enumerate(points):
        px, py = point
        if most_left[_X] > px:
            if most_left[_X] < big_number:
                almost_most_left = most_left
                almost_most_left_idx = most_left_idx
            most_left = [px, py]
            most_left_idx = i
        if almost_most_left[_X] > px and [px, py] != most_left:
            almost_most_left = [px, py]
            almost_most_left_idx = i
    if almost_most_left[_Y] < most_left[_Y]: