├── .gitignore ├── CMakeLists.txt ├── README.md ├── cmake └── Modules │ ├── Eigen.cmake │ ├── Eigen_VERSION.cmake │ └── FindTensorFlow.cmake ├── ctc_scene_text_recognizer.cpp ├── ctc_scene_text_recognizer.h ├── detector.h ├── faster_rcnn_text_detector.cpp ├── faster_rcnn_text_detector.h ├── images ├── 1.jpg ├── 2.jpg ├── 3.jpg ├── 4.jpg ├── 5.jpg └── pipeline.jpg ├── main.cpp ├── recognizer.h ├── scene_text_reader.cpp ├── scene_text_reader.h ├── test_images ├── img_108.jpg └── word_2.png ├── text_box.cpp ├── text_box.h ├── utils.cpp └── utils.h /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | # 35 | build/ 36 | external/ 37 | model/ 38 | bin/ 39 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Build script for the DeepSceneTextReader command-line tool (DetectText). 2 | # CMake >= 3.1 is required because CMAKE_CXX_STANDARD below is ignored by older releases. 3 | cmake_minimum_required(VERSION 3.1) 4 | project(SceneTextDetection) 5 | set (CMAKE_CXX_STANDARD 11) 6 | 7 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin) 8 | 9 | # Default to an optimized build when no build type was requested. 10 | if(NOT CMAKE_BUILD_TYPE) 11 | set(CMAKE_BUILD_TYPE Release) 12 | endif() 13 | 14 | # Append the warning flags instead of overwriting flags supplied by the user or toolchain. 15 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") 16 | set(CMAKE_CXX_FLAGS_DEBUG "-g") 17 | set(CMAKE_CXX_FLAGS_RELEASE "-O3") 18 | 19 | set(SOURCE_FILES main.cpp faster_rcnn_text_detector.cpp ctc_scene_text_recognizer.cpp utils.cpp scene_text_reader.cpp text_box.cpp) 20 | set(EXECUTABLE DetectText) 21 | #set(EXECUTABLE EndtoEndReading) 22 | #set(EXECUTABLE RecognizeText) 23 | 24 | # Add modules 25 | list(APPEND CMAKE_MODULE_PATH 
"${PROJECT_SOURCE_DIR}/cmake/Modules") 26 | 27 | find_package(TensorFlow REQUIRED) 28 | find_package(Protobuf REQUIRED) 29 | find_package(OpenCV REQUIRED highgui imgproc) 30 | 31 | # NOTE(review): the set() below replaces whatever FindTensorFlow located with fixed /usr/local/lib paths; adjust it for other install prefixes. 32 | #set(TensorFlow_LIBRARIES "/usr/local/lib/libtensorflow_cc.so" "/usr/local/lib/libtensorflow_framework.so") 33 | #set(TensorFlow_LIBRARIES "/usr/local/lib/libtensorflow_cc.so" "/usr/local/lib/libtensorflow.so") 34 | set(TensorFlow_LIBRARIES "/usr/local/lib/libtensorflow.so" "/usr/local/lib/libtensorflow_cc.so" "/usr/local/lib/libtensorflow_framework.so") 35 | 36 | message(STATUS "protobuf include dirs:" ${PROTOBUF_INCLUDE_DIRS}) 37 | message(STATUS "protobuf libraries:" ${PROTOBUF_LIBRARIES}) 38 | message(STATUS "tensorflow include dirs:" ${TensorFlow_INCLUDE_DIRS}) 39 | message(STATUS "tensorflow libraries:" ${TensorFlow_LIBRARIES}) 40 | 41 | # set variables for external dependencies 42 | set(EXTERNAL_DIR "${PROJECT_SOURCE_DIR}/external" CACHE PATH "Location where external dependencies will installed") 43 | set(DOWNLOAD_LOCATION "${EXTERNAL_DIR}/src" CACHE PATH "Location where external projects will be downloaded") 44 | 45 | # change the path to your TensorFlow install path 46 | #set(NSYNC_DIR "/usr/local/lib/python2.7/dist-packages/tensorflow/include/external/nsync/public") 47 | 48 | mark_as_advanced(EXTERNAL_DIR DOWNLOAD_LOCATION) 49 | 50 | # Declares the "Eigen" ExternalProject target and adds its include directories. 51 | include(Eigen) 52 | 53 | set(PROJECT_INCLUDE_DIRS ${TensorFlow_INCLUDE_DIRS} ${EXTERNAL_DIR}/include ${PROTOBUF_INCLUDE_DIRS} ${NSYNC_DIR} ${OpenCV_INCLUDE_DIRS}) 54 | set(PROJECT_LIBRARIES ${TensorFlow_LIBRARIES} ${PROTOBUF_LIBRARIES} ${OpenCV_LIBS}) 55 | set(PROJECT_DEPENDENCIES Eigen) 56 | 57 | include_directories(${PROJECT_INCLUDE_DIRS}) 58 | add_library(SceneTextDetector faster_rcnn_text_detector.cpp utils.cpp) 59 | add_library(CTCSceneTextRecognizer ctc_scene_text_recognizer.cpp utils.cpp) 60 | add_library(SceneTextReader scene_text_reader.cpp faster_rcnn_text_detector.cpp ctc_scene_text_recognizer.cpp utils.cpp text_box.cpp) 61 | 
add_executable(${EXECUTABLE} ${SOURCE_FILES}) 62 | target_link_libraries(${EXECUTABLE} PRIVATE ${PROJECT_LIBRARIES}) 63 | # The library targets compile the same sources as the executable, so each one must also wait for the Eigen external project. 64 | add_dependencies(SceneTextDetector ${PROJECT_DEPENDENCIES}) 65 | add_dependencies(CTCSceneTextRecognizer ${PROJECT_DEPENDENCIES}) 66 | add_dependencies(SceneTextReader ${PROJECT_DEPENDENCIES}) 67 | add_dependencies(${EXECUTABLE} ${PROJECT_DEPENDENCIES}) 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepSceneTextReader 2 | This is a c++ project deploying a deep scene text reading pipeline. It reads text from natural scene images. 3 | 4 | 

5 | 6 | 7 | 8 | 9 |

10 | 11 | 12 | # Prerequisites 13 | 14 | The project is written in C++ using the TensorFlow computational framework. It is tested with TensorFlow 1.4. Newer versions should work too, but have not been tested. 15 | Please install: 16 | 17 | * Tensorflow 18 | 19 | * nsync project: https://github.com/google/nsync.git This is needed for building tensorflow. 20 | 21 | * opencv3.3 22 | 23 | * protobuf 24 | 25 | * eigen 26 | 27 | Please check this project on how to build a project that uses tensorflow with cmake: 28 | https://github.com/cjweeks/tensorflow-cmake 29 | It greatly helped the progress of building this project. 30 | When building the tensorflow library, please be careful since we need to use opencv. It looks like there is still a problem when including tensorflow and opencv together. 31 | It makes opencv unable to read images. 32 | Check out this issue: https://github.com/tensorflow/tensorflow/issues/14267 33 | The answer by allenlavoie solved my problem, so I paste it here: 34 | 35 | "In the meantime, as long as you're not using any custom ops you can build libtensorflow_cc.so with bazel build --config=monolithic, which will condense everything together into one shared object (no libtensorflow_framework dependence) and seal off non-TensorFlow symbols. That shared object will have protocol buffer symbols." 36 | 37 | # Status 38 | Currently two pretrained models are provided: one for scene text detection and one for scene text recognition. 39 | More models will be provided. 40 | Note that the current models are not very robust. You can easily switch to your own trained models. 41 | The models will be continuously updated. 42 | 43 | # Build process 44 | 45 | mkdir -p build && cd build 46 | 47 | cmake .. 48 | 49 | make 50 | 51 | It will create an executable named **DetectText** in the bin folder. 
52 | 53 | # Usage: 54 | The executable can be run in three modes: (1) Detect (2) Recognize (3) Detect and Recognize 55 | 56 | ## Detect 57 | Download the pretrained detector model and put it in model/ 58 | 59 | ./DetectText --detector_graph='model/Detector_model.pb' \ 60 | --image_filename='test_images/img_108.jpg' --mode='detect' --output_filename='results/output_image.jpg' 61 | 62 | ## Recognize 63 | Download the pretrained recognizer model and put it in model/ 64 | Download the dictionary file and put it in model/ 65 | 66 | 67 | ./DetectText --recognizer_graph='model/Recognizer_model.pb' \ 68 | --image_filename='test_images/word_2.png' --mode='recognize' \ 69 | --im_height=32 --im_width=128 70 | 71 | ## Detect and Recognize 72 | Download the pretrained detector and recognizer models and put them in model/ as described previously. 73 | 74 | ./DetectText --recognizer_graph='model/Recognizer_model.pb' --detector_graph='model/Detector_model.pb' \ 75 | --image_filename='test_images/img_108.jpg' --mode='detect_and_read' --output_filename='results/output_image.jpg' 76 | 77 | # Model Description 78 | ### *Detector* 79 | 1. Faster RCNN Detector Model 80 | The detector is trained with a modified tensorflow [object detector api](https://github.com/tensorflow/models/tree/master/research/object_detection) 81 | I modified it by changing the proposal scheme to regress to the 4 coordinates of the oriented bounding box rather than a regular rectangular bounding box. 82 | Check out this [repo](https://github.com/dafanghe/Tensorflow_SceneText_Oriented_Box_Predictor) for the training code. 83 | Pretrained model: FasterRCNN_detector_model.pb 84 | 85 | 2. R2CNN will be updated. See [R2CNN](https://arxiv.org/abs/1706.09579) for details. 86 | The code is also modified from the tensorflow [object detector api](https://github.com/tensorflow/models/tree/master/research/object_detection) 87 | The training code will be released soon. 88 | 89 | 90 | ### *Recognizer* 91 | 1. 
CTC scene text recognizer. 92 | The recognizer model follows the well-known scene text recognition [CRNN model](https://arxiv.org/abs/1507.05717) 93 | 94 | 2. Spatial Attention OCR will be updated soon. It is based on [GoogleOCR](https://github.com/tensorflow/models/tree/master/research/attention_ocr) 95 | 96 | ### *Detect and Recognize* 97 | The whole scene text reading pipeline detects the text, rotates it to horizontal, and reads it with the recognizer. 98 | The pipeline is shown below: 99 | 100 | 

101 | 102 |

103 | 104 | ### *Pretrained Models* 105 | You can play with the code using the provided pretrained models. \ 106 | They are not fully optimized yet, but can be used to get familiar with the code. \ 107 | Check them out here: [models](https://drive.google.com/drive/folders/1Ao0ZrSVf0YjU6pnzGY0C3QJ2Qz0ljRIU?usp=sharing) 108 | 109 | You will find two detection models called: (1) **FasterRCNN_detector_model.pb** (2) **R2CNN_detector_model.pb** \ 110 | Two recognition models with their charsets: (1) **Recognizer_model.pb + charset_full.txt** and (2) **Recognizer_model_case_insen.pb + charset_case_insen.txt**. \ 111 | Full charset means English letters + digits, and case insen means case-insensitive English letters + digits. 112 | Let me know if you have any problems using them. 113 | 114 | 115 | # Reference and Related Projects 116 | - [Faster RCNN](https://arxiv.org/abs/1506.01497) Faster RCNN paper. 117 | - [Tensorflow Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection). 118 | - [An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition](https://arxiv.org/abs/1507.05717), reference paper for CRNN model. 119 | - [tensorflow-cmake](https://github.com/cjweeks/tensorflow-cmake), tutorial on building a project with tensorflow using cmake. 120 | - [R2CNN](https://arxiv.org/abs/1706.09579) Reference paper for R2CNN. 121 | 122 | # Contact: 123 | 124 | * Dafang He. The Penn State University. 
hdfcraig@gmail.com http://personal.psu.edu/duh188/ 125 | -------------------------------------------------------------------------------- /cmake/Modules/Eigen.cmake: -------------------------------------------------------------------------------- 1 | # Downloads and installs the pinned Eigen release as an ExternalProject target named "Eigen". 2 | # Expects EXTERNAL_DIR and DOWNLOAD_LOCATION to be set by the including CMakeLists.txt. 3 | include(ExternalProject) 4 | include(Eigen_VERSION) 5 | 6 | set(Eigen_INSTALL ${EXTERNAL_DIR}/include/eigen/${Eigen_DIR}) 7 | 8 | # Derive every path from EXTERNAL_DIR so that a user-supplied EXTERNAL_DIR is honoured consistently. 9 | set(Eigen_INCLUDE_DIRS 10 | ${EXTERNAL_DIR}/include/eigen 11 | ${Eigen_INSTALL}) 12 | 13 | ExternalProject_Add(Eigen 14 | PREFIX ${EXTERNAL_DIR}/src/eigen 15 | URL ${Eigen_URL} 16 | URL_HASH ${Eigen_HASH} 17 | DOWNLOAD_DIR ${DOWNLOAD_LOCATION} 18 | 19 | CMAKE_ARGS 20 | -DCMAKE_BUILD_TYPE:STRING=Release 21 | -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF 22 | -DCMAKE_INSTALL_PREFIX:STRING=${Eigen_INSTALL} 23 | -DINCLUDE_INSTALL_DIR:STRING=${Eigen_INSTALL}) 24 | 25 | # Makes the Eigen headers visible to every target declared after this module is included. 26 | include_directories(${Eigen_INCLUDE_DIRS}) 27 | -------------------------------------------------------------------------------- /cmake/Modules/Eigen_VERSION.cmake: -------------------------------------------------------------------------------- 1 | # Pinned Eigen snapshot: download URL, expected SHA256 of the archive and install sub-directory name. 2 | #set(Eigen_URL https://bitbucket.org/eigen/eigen/get/429aa5254200.tar.gz,http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/429aa5254200.tar.gz) 3 | set(Eigen_URL http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/429aa5254200.tar.gz) 4 | set(Eigen_ARCHIVE_HASH 429aa5254200) 5 | set(Eigen_HASH SHA256=61d8b6fc4279dd1dda986fb1677d15e3d641c07a3ea5abe255790b1f0c0c14e9) 6 | set(Eigen_DIR eigen3) 7 | set(Eigen_INSTALL_DIR /usr/local) 8 | -------------------------------------------------------------------------------- /cmake/Modules/FindTensorFlow.cmake: -------------------------------------------------------------------------------- 1 | # Locates the tensorFlow library and include directories. 
2 | # When TENSORFLOW_FOUND is set, defines TensorFlow_LIBRARIES and TensorFlow_INCLUDE_DIRS for the caller. 3 | include(FindPackageHandleStandardArgs) 4 | unset(TENSORFLOW_FOUND) 5 | 6 | find_path(TensorFlow_INCLUDE_DIR 7 | NAMES 8 | tensorflow/core 9 | tensorflow/cc 10 | third_party 11 | HINTS 12 | /usr/local/include/google/tensorflow 13 | /usr/include/google/tensorflow) 14 | 15 | find_library(TensorFlow_LIBRARY 16 | NAMES 17 | tensorflow_cc 18 | HINTS 19 | /usr/lib 20 | /usr/local/lib) 21 | 22 | #tensorflow_all 23 | # set TensorFlow_FOUND; NOTE(review): the upper-case TENSORFLOW_FOUND tested below is presumably also set by this call - confirm against the FindPackageHandleStandardArgs documentation 24 | find_package_handle_standard_args(TensorFlow DEFAULT_MSG TensorFlow_INCLUDE_DIR TensorFlow_LIBRARY) 25 | 26 | # set external variables for usage in CMakeLists.txt 27 | if(TENSORFLOW_FOUND) 28 | set(TensorFlow_LIBRARIES ${TensorFlow_LIBRARY}) 29 | set(TensorFlow_INCLUDE_DIRS ${TensorFlow_INCLUDE_DIR}) 30 | endif() 31 | 32 | # hide locals from GUI 33 | mark_as_advanced(TensorFlow_INCLUDE_DIR TensorFlow_LIBRARY) 34 | -------------------------------------------------------------------------------- /ctc_scene_text_recognizer.cpp: -------------------------------------------------------------------------------- 1 | #include "ctc_scene_text_recognizer.h" 2 | 3 | 4 | CTCSceneTextRecognizer::CTCSceneTextRecognizer(){ 5 | init_constant_vars(); 6 | } 7 | 8 | 9 | CTCSceneTextRecognizer::CTCSceneTextRecognizer(std::string frozen_graph_filename, std::string dictionary_filename, int _im_height, int _im_width):Recognizer(frozen_graph_filename, dictionary_filename){ 10 | init_constant_vars(_im_height, _im_width); 11 | } 12 | 13 | 14 | bool CTCSceneTextRecognizer::init(const std::string frozen_graph_filename, const std::string dictionary_filename){ 15 | this->init_graph(frozen_graph_filename); 16 | this->init_dictionary(dictionary_filename); 17 | return true; 18 | } 19 | 20 | 21 | void CTCSceneTextRecognizer::init_constant_vars(int _im_height, int _im_width){ 22 | std::string input_layer_string = "input_images:0,input_seq_lens:0"; 23 | std::string output_layer_string = "CTCBeamSearchDecoder:0,CTCBeamSearchDecoder:1,CTCBeamSearchDecoder:2"; 24 | 
this->input_layers = str_util::Split(input_layer_string, ','); 25 | this->output_layers = str_util::Split(output_layer_string, ','); 26 | this->seq_len = 29; 27 | this->image_width = _im_width; //input image width; 28 | this->image_height = _im_height; //input image height 29 | this->width_scale_ratio = 1.2; //scale the width for better recognition 30 | } 31 | 32 | 33 | void CTCSceneTextRecognizer::preprocess_image(cv::Mat& input_image, cv::Mat& output_image){ 34 | cv::Mat resized_image, padded_image; 35 | int new_width = int(this->width_scale_ratio * input_image.cols); 36 | cv::resize(input_image, input_image, cv::Size(new_width, input_image.rows)); 37 | float ratio=0; 38 | resize_image_fix_height(input_image, resized_image, ratio, this->image_height); 39 | pad_image_width(resized_image, output_image, this->image_width); 40 | } 41 | 42 | 43 | string CTCSceneTextRecognizer::run_graph(const cv::Mat& image){ 44 | int height = image.rows; 45 | int width = image.cols; 46 | Tensor input_img_tensor(DT_FLOAT, TensorShape({1, height, width, 3})); 47 | 48 | unsigned char *input_data = (unsigned char*)(image.data); 49 | auto input_tensor_mapped = input_img_tensor.tensor(); 50 | //(TODO) is there any other ways to copy the data into tensor? 
51 | for (int y = 0;y < height; ++y) { 52 | for (int x = 0;x < width; ++x) { 53 | unsigned char b = input_data[image.step * y + x * image.channels()]; 54 | unsigned char g = input_data[image.step * y + x * image.channels() + 1]; 55 | unsigned char r = input_data[image.step * y + x * image.channels() + 2]; 56 | input_tensor_mapped(0, y, x, 0) = float(r); 57 | input_tensor_mapped(0, y, x, 1) = float(g); 58 | input_tensor_mapped(0, y, x, 2) = float(b); 59 | } 60 | } 61 | //create the seq len tensor and assign fixed value 62 | Tensor input_seq_len_tensor(DT_INT32, TensorShape({1})); 63 | auto input_seq_len_mapped = input_seq_len_tensor.tensor(); 64 | input_seq_len_mapped(0) = this->seq_len; 65 | 66 | //create the input to run 67 | std::vector > inputs = { 68 | {this->input_layers[0], input_img_tensor}, 69 | {this->input_layers[1], input_seq_len_tensor}, 70 | }; 71 | 72 | std::vector outputs; 73 | Status run_status = this->session->Run(inputs, 74 | this->output_layers, {}, &outputs); 75 | if (!run_status.ok()) { 76 | LOG(ERROR) << "Running model failed: " << run_status; 77 | return ""; 78 | } 79 | LOG(INFO) <<"number of output:"<(); 82 | auto values = outputs[1].flat_outer_dims(); 83 | 84 | const Eigen::Tensor::Dimensions& indices_dim = indices.dimensions(); 85 | const Eigen::Tensor::Dimensions& values_dim = values.dimensions(); 86 | 87 | LOG(INFO) << outputs[0].DebugString(); 88 | LOG(INFO) << outputs[1].DebugString(); 89 | std::vector encoded_text; 90 | for(int i=0; i CTCSceneTextRecognizer::preprocess_images(std::vector& input_images){ 101 | std::vector processed_images(input_images.size()); 102 | for(int i=0; ipreprocess_image(input_images[i], preprocessed_image); 105 | processed_images[i] = preprocessed_image; 106 | } 107 | return processed_images; 108 | } 109 | 110 | 111 | std::vector CTCSceneTextRecognizer::run_graph(const std::vector images){ 112 | //the images must be preprocessd and has the same height and width!! 
113 | std::vector res; 114 | int num_word = images.size(); 115 | if(num_word == 0) return res; 116 | 117 | int height = this->image_height; 118 | int width = this->image_width; 119 | Tensor input_img_tensor(DT_FLOAT, TensorShape({num_word, height, width, 3})); 120 | auto input_tensor_mapped = input_img_tensor.tensor(); 121 | //create the seq len tensor and assign fixed value for ctc 122 | Tensor input_seq_len_tensor(DT_INT32, TensorShape({num_word})); 123 | auto input_seq_len_mapped = input_seq_len_tensor.tensor(); 124 | 125 | for(int i=0; iseq_len; 143 | } 144 | //create the input to run 145 | std::vector > inputs = { 146 | {this->input_layers[0], input_img_tensor}, 147 | {this->input_layers[1], input_seq_len_tensor}, 148 | }; 149 | 150 | //std::cout<<"run recognition graph"< outputs; 152 | Status run_status = this->session->Run(inputs, 153 | this->output_layers, {}, &outputs); 154 | if (!run_status.ok()) { 155 | LOG(ERROR) << "Running model failed: " << run_status; 156 | return res; 157 | } 158 | LOG(INFO) <<"number of output:"<(); 165 | auto values = outputs[1].tensor(); 166 | 167 | //const Eigen::Tensor::Dimensions& indices_dim = indices.dimensions(); 168 | //const Eigen::Tensor::Dimensions& values_dim = values.dimensions(); 169 | 170 | std::vector > encoded_texts(num_word); 171 | for(int i=0; i 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "tensorflow/core/framework/tensor_shape.pb.h" 11 | #include "tensorflow/core/framework/tensor.h" 12 | #include "tensorflow/core/graph/graph.h" 13 | #include "tensorflow/core/lib/strings/str_util.h" 14 | #include "tensorflow/core/platform/logging.h" 15 | #include "tensorflow/core/platform/platform.h" 16 | #include "tensorflow/core/platform/types.h" 17 | 18 | //opencv 19 | #include 20 | #include "opencv2/opencv.hpp" 21 | 22 | #include "utils.h" 23 | 24 | #include "recognizer.h" 25 | 26 | using namespace tensorflow; 27 | 28 | 29 | class CTCSceneTextRecognizer: public Recognizer{ 30 | public: 31 | 
CTCSceneTextRecognizer(); 32 | 33 | CTCSceneTextRecognizer(const std::string frozen_graph_filename, const std::string dictionary_filename, 34 | int _im_height=32, int _im_width=128); 35 | 36 | bool init(const std::string frozen_graph_filename, const std::string); 37 | void preprocess_image(cv::Mat& input_image, cv::Mat& output_image); 38 | std::vector preprocess_images(std::vector& input_images); 39 | std::string run_graph(const cv::Mat& image); 40 | std::vector run_graph(const std::vector input_images); 41 | 42 | private: 43 | void init_constant_vars(int _im_height=32, int _im_width=128); 44 | float width_scale_ratio; 45 | int seq_len; 46 | int image_width; 47 | int image_height; 48 | }; 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /detector.h: -------------------------------------------------------------------------------- 1 | #ifndef Detector_H 2 | #define Detector_H 3 | 4 | #include 5 | #include 6 | 7 | //opencv 8 | #include 9 | #include "opencv2/opencv.hpp" 10 | 11 | //tensorflow 12 | #include "tensorflow/core/platform/logging.h" 13 | #include "tensorflow/core/public/session.h" 14 | #include "tensorflow/core/framework/graph.pb.h" 15 | #include "tensorflow/core/graph/graph.h" 16 | #include "tensorflow/core/public/session.h" 17 | #include "tensorflow/core/lib/strings/str_util.h" 18 | 19 | #include "text_box.h" 20 | 21 | 22 | class Detector{ 23 | public: 24 | Detector(){}; 25 | Detector(const std::string frozen_graph_filename){ 26 | init_graph(frozen_graph_filename); 27 | } 28 | bool init_graph(const std::string& frozen_graph_filename){ 29 | if (!ReadBinaryProto(tensorflow::Env::Default(), frozen_graph_filename, &graph_def).ok()) { 30 | LOG(ERROR) << "error when reading proto" << frozen_graph_fliename; 31 | return -1; 32 | } 33 | 34 | tensorflow::SessionOptions sess_opt; 35 | sess_opt.config.mutable_gpu_options()->set_allow_growth(true); 36 | (&session)->reset(tensorflow::NewSession(sess_opt)); 37 | if 
(!session->Create(graph_def).ok()) { 38 | LOG(ERROR) << "error create graph"; 39 | return -1; 40 | } 41 | } 42 | virtual int run_graph(const cv::Mat& image, std::vector& results) = 0; 43 | 44 | tensorflow::GraphDef graph_def; 45 | std::string input_layer; //for detector, we assume there is only one input 46 | std::unique_ptr session; 47 | std::vector output_layers; 48 | }; 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /faster_rcnn_text_detector.cpp: -------------------------------------------------------------------------------- 1 | #include "faster_rcnn_text_detector.h" 2 | 3 | 4 | FasterRCNNTextDetector::FasterRCNNTextDetector(const std::string frozen_graph_filename): Detector(frozen_graph_filename) { 5 | this->init_constants(); 6 | } 7 | 8 | 9 | bool FasterRCNNTextDetector::init_constants(){ 10 | input_layer = "image_tensor:0"; 11 | output_layers = str_util::Split("detection_boxes:0,detection_scores:0,detection_classes:0,detection_oriented_boxes:0,num_detections:0", ','); 12 | score_thresh = 0.6; 13 | } 14 | 15 | 16 | int FasterRCNNTextDetector::run_graph(const cv::Mat& image, std::vector& results){ 17 | cv::Mat resized_image; 18 | float ratio_h=0, ratio_w=0; 19 | resize_image_max_len(image, resized_image, ratio_h, ratio_w); 20 | auto input_tensor = cv_mat_to_tensor(resized_image); 21 | 22 | std::vector outputs; 23 | Status run_status = this->session->Run({{this->input_layer, input_tensor}}, 24 | this->output_layers, {}, &outputs); 25 | if (!run_status.ok()) { 26 | LOG(ERROR) << "Running model failed: " << run_status; 27 | return -1; 28 | } 29 | LOG(INFO) <<"number of output:"<(); 32 | auto detection_scores = outputs[1].tensor(); 33 | auto detection_classes = outputs[2].tensor(); 34 | auto detection_oriented_boxes = outputs[3].tensor(); 35 | 36 | int num_box = detection_boxes.dimension(1); 37 | for(int i=0;i this->score_thresh){ 39 | std::vector points; 40 | for(int j=0; j<4; j++){ 41 | cv::Point p; 42 | 
p.x = int(detection_oriented_boxes(0, i, j, 1) * image.cols); 43 | p.y = int(detection_oriented_boxes(0, i, j, 0) * image.rows); 44 | points.push_back(p); 45 | } 46 | TextBox tb(points, ""); 47 | results.push_back(tb); 48 | } 49 | } 50 | } 51 | 52 | -------------------------------------------------------------------------------- /faster_rcnn_text_detector.h: -------------------------------------------------------------------------------- 1 | #ifndef FasterRCNN_Text_Detector_H 2 | #define FasterRCNN_Text_Detector_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | //tensorflow 10 | #include "tensorflow/core/framework/graph.pb.h" 11 | #include "tensorflow/core/framework/tensor_shape.pb.h" 12 | #include "tensorflow/core/framework/tensor.h" 13 | #include "tensorflow/core/graph/graph.h" 14 | #include "tensorflow/core/lib/strings/str_util.h" 15 | #include "tensorflow/core/platform/logging.h" 16 | #include "tensorflow/core/platform/platform.h" 17 | #include "tensorflow/core/platform/types.h" 18 | #include "tensorflow/core/public/session.h" 19 | 20 | //opencv 21 | #include 22 | #include "opencv2/opencv.hpp" 23 | 24 | #include "detector.h" 25 | #include "text_box.h" 26 | #include "utils.h" 27 | 28 | using namespace tensorflow; 29 | 30 | 31 | class FasterRCNNTextDetector: public Detector{ 32 | public: 33 | FasterRCNNTextDetector(){}; 34 | 35 | FasterRCNNTextDetector(const std::string frozen_graph_filename); 36 | 37 | bool init_constants(); 38 | int run_graph(const cv::Mat& image, std::vector& results); 39 | 40 | private: 41 | float score_thresh; 42 | }; 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /images/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/images/1.jpg -------------------------------------------------------------------------------- /images/2.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/images/2.jpg -------------------------------------------------------------------------------- /images/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/images/3.jpg -------------------------------------------------------------------------------- /images/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/images/4.jpg -------------------------------------------------------------------------------- /images/5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/images/5.jpg -------------------------------------------------------------------------------- /images/pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/images/pipeline.jpg -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "tensorflow/core/platform/init_main.h" 8 | #include "tensorflow/core/util/command_line_flags.h" 9 | #include "faster_rcnn_text_detector.h" 10 | #include "ctc_scene_text_recognizer.h" 11 | #include "scene_text_reader.h" 12 | #include "utils.h" 13 | 14 | int detect_text(string& detector_graph_filename, string& image_filename, string& output_filename) 15 | 
{ 16 | LOG(INFO)<<"start text detection:"; 17 | 18 | FasterRCNNTextDetector detector(detector_graph_filename); 19 | 20 | cv::Mat image = cv::imread(image_filename); 21 | if(!image.data) // Check for invalid input 22 | { 23 | LOG(ERROR) << "Could not open or find the image " << image_filename; 24 | return -1; 25 | } 26 | std::vector colors={cv::Scalar(0,0,255), cv::Scalar(0,255,0), 27 | cv::Scalar(255,0,0), cv::Scalar(255,255,0), cv::Scalar(0,255,255), cv::Scalar(255,0,255)}; 28 | std::vector res; 29 | detector.run_graph(image, res); 30 | for(int i=0; i points = res[i].get_points(); 32 | for(int j=0; j<4; j++){ 33 | cv::line(image, points[j], points[(j+1)%4], colors[j%4], 3); 34 | } 35 | } 36 | 37 | //write image 38 | cv::imwrite(output_filename, image); 39 | return 0; 40 | } 41 | 42 | 43 | int recognize_text(string& recognizer_graph_filename, string& dictionary_filename, 44 | string& image_filename, int im_height=32, int im_width=128) 45 | { 46 | LOG(INFO) <<"start text recognition: "< res; 76 | reader.read_text(image, res); 77 | for(int i=0; i flag_list = { 96 | Flag("detector_graph", &detector_graph, "detector graph file name"), 97 | Flag("recognizer_graph", &recognizer_graph, "recognizer graph file name"), 98 | Flag("im_height", &im_height, "image height for recognition model"), 99 | Flag("im_width", &im_width, "image width for recognition model"), 100 | Flag("dictionary_filename", &dictionary_filename, "dictionary filename for decode the recognition results"), 101 | Flag("image_filename", &image_filename, "the filename to be tested."), 102 | Flag("output_filename", &output_filename, "the output filename"), 103 | Flag("mode", &mode, "the mode, must be within the three categories: detect, recognize, detect_and_read"), 104 | }; 105 | 106 | string usage = Flags::Usage(argv[0], flag_list); 107 | const bool parse_result = Flags::Parse(&argc, argv, flag_list); 108 | 109 | if (!parse_result) { 110 | LOG(ERROR) << usage; 111 | return -1; 112 | } 113 | 114 | 
::tensorflow::port::InitMain(argv[0], &argc, &argv); 115 | if (argc > 1) { 116 | LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage; 117 | return -1; 118 | } 119 | 120 | if(mode == "detect"){ 121 | detect_text(detector_graph, image_filename, output_filename); 122 | }else if(mode == "recognize"){ 123 | recognize_text(recognizer_graph, dictionary_filename, image_filename, im_height, im_width); 124 | }else if(mode == "detect_and_read"){ 125 | end_to_end_reading(detector_graph, recognizer_graph, 126 | dictionary_filename, image_filename, output_filename); 127 | }else{ 128 | LOG(ERROR) << "mode should be within: detect, recognize, detect_and_read"; 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /recognizer.h: -------------------------------------------------------------------------------- 1 | #ifndef Recognizer_H 2 | #define Recognizer_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | //opencv 9 | #include 10 | #include "opencv2/opencv.hpp" 11 | 12 | //tensorflow 13 | #include "tensorflow/core/platform/logging.h" 14 | #include "tensorflow/core/public/session.h" 15 | #include "tensorflow/core/framework/graph.pb.h" 16 | #include "tensorflow/core/graph/graph.h" 17 | #include "tensorflow/core/public/session.h" 18 | #include "tensorflow/core/lib/strings/str_util.h" 19 | 20 | 21 | class Recognizer{ 22 | //A base class should implemented the following functions: 23 | //Preprocess_image: preprocess a single image represented as an opencv mat 24 | //Preprocess images: preprocess a vector of opencv mat images 25 | public: 26 | Recognizer(){}; 27 | Recognizer(const std::string& recognizer_graph_filename, const std::string& dictionary_filename){ 28 | init_dictionary(dictionary_filename); 29 | init_graph(recognizer_graph_filename); 30 | }; 31 | bool init_graph(const std::string& frozen_graph_filename){ 32 | if (!ReadBinaryProto(tensorflow::Env::Default(), frozen_graph_filename, &graph_def).ok()) { 33 | LOG(ERROR) 
<< "Read proto"; 34 | return -1; 35 | } 36 | 37 | tensorflow::SessionOptions sess_opt; 38 | sess_opt.config.mutable_gpu_options()->set_allow_growth(true); 39 | (&session)->reset(tensorflow::NewSession(sess_opt)); 40 | if (!session->Create(graph_def).ok()) { 41 | LOG(ERROR) << "Create graph"; 42 | return -1; 43 | } 44 | } 45 | bool init_dictionary(const std::string& filename){ 46 | std::ifstream inf(filename, std::ios::in); 47 | if(!inf.is_open()) 48 | { LOG(ERROR)<<"Error dictionary opening file "< splits; 53 | while(!inf.eof()){ 54 | inf>>line; 55 | splits = tensorflow::str_util::Split(line, ','); 56 | this->mapping[std::stoi(splits[0])] = splits[1][0]; 57 | } 58 | inf.close(); 59 | return 1; 60 | } 61 | virtual void preprocess_image(cv::Mat& input_image, cv::Mat& output_image) = 0; 62 | virtual std::vector preprocess_images(std::vector& input_images) = 0; 63 | virtual std::string run_graph(const cv::Mat& image) = 0; 64 | virtual std::vector run_graph(const std::vector input_images) = 0; 65 | std::string decode_single_text(std::vector& vec){ 66 | std::string res; 67 | for(int i=0; imapping[vec[i]]); 69 | } 70 | return res; 71 | } 72 | std::unordered_map mapping; 73 | tensorflow::GraphDef graph_def; 74 | std::unique_ptr session; 75 | std::vector input_layers; 76 | std::vector output_layers; 77 | }; 78 | 79 | #endif 80 | -------------------------------------------------------------------------------- /scene_text_reader.cpp: -------------------------------------------------------------------------------- 1 | #include "scene_text_reader.h" 2 | 3 | namespace scene_text_reader{ 4 | 5 | SceneTextReader::SceneTextReader(){ 6 | 7 | } 8 | 9 | SceneTextReader::SceneTextReader(const std::string& detector_graph_filename, const string& recognizer_graph_filename, 10 | const std::string& detector_model, const std::string& dictionary_filename, const std::string& recognizer_model) 11 | { 12 | if(detector_model == "FasterRCNN"){ 13 | detector = new 
FasterRCNNTextDetector(detector_graph_filename); 14 | }else{ 15 | LOG(ERROR) <& boxes, std::vector& word_regions){ 25 | int num_word = boxes.size(); 26 | if(num_word == 0) return; /* nothing to extract when there are no boxes */ 27 | 28 | for(int i=0; i new_points; 33 | get_cropped_extend_image(image, boxes[i], cropped, new_points); 34 | 35 | cv::Mat rotated; 36 | std::vector rotated_points; 37 | rotate_image_and_points(cropped, new_points, angle, rotated, rotated_points); 38 | word_regions.push_back(rotated); /* the rotated crop for boxes[i] is appended; rotated_points is not used afterwards in the visible code */ 39 | } 40 | } 41 | 42 | /* Runs the detector on image, crops the resulting boxes and feeds the preprocessed crops to the recognizer. NOTE(review): detector and recognizer are dereferenced without a null check. */ void SceneTextReader::read_text(cv::Mat& image, std::vector& res){ 43 | detector->run_graph(image, res); 44 | std::cout<<"found "< word_regions; 46 | extract_word_regions(image, res, word_regions); 47 | //preprocess all the images; 48 | std::vector preprocessed_images = recognizer->preprocess_images(word_regions); 49 | std::cout< output_texts = recognizer->run_graph(preprocessed_images); 51 | for(int i=0; i 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "tensorflow/core/framework/graph.pb.h" 12 | #include "tensorflow/core/framework/tensor_shape.pb.h" 13 | #include "tensorflow/core/framework/tensor.h" 14 | #include "tensorflow/core/graph/graph.h" 15 | #include "tensorflow/core/lib/strings/str_util.h" 16 | #include "tensorflow/core/platform/init_main.h" 17 | #include "tensorflow/core/platform/logging.h" 18 | #include "tensorflow/core/platform/platform.h" 19 | #include "tensorflow/core/platform/types.h" 20 | #include "tensorflow/core/public/session.h" 21 | 22 | //opencv 23 | #include 24 | #include "opencv2/opencv.hpp" 25 | 26 | #include "utils.h" 27 | //recognizer 28 | #include "ctc_scene_text_recognizer.h" 29 | #include "recognizer.h" 30 | //detector 31 | #include "faster_rcnn_text_detector.h" 32 | #include "detector.h" 33 | 34 | #include "text_box.h" 35 | 36 | /* NOTE(review): a using-directive at header scope applies to every file that includes this header; scene_text_reader.cpp relies on it for the unqualified `string`. */ using namespace tensorflow; 37 | 38 | namespace scene_text_reader{ 39 | 40 | class SceneTextReader{ 41 | public: 42 | SceneTextReader(); 43 | 44 | SceneTextReader(const std::string&, const
std::string&, const std::string&, 45 | const std::string& detector_model=std::string("FasterRCNN"), 46 | const std::string& recognizer_model=std::string("CTC")); 47 | 48 | void read_text(cv::Mat&, std::vector& res); 49 | 50 | void extract_word_regions(cv::Mat& image, 51 | std::vector& boxes, std::vector& word_regions); 52 | 53 | private: 54 | /* Both pointers start as nullptr: the default constructor assigns neither, so they were previously indeterminate. */ Detector *detector = nullptr; 55 | Recognizer *recognizer = nullptr; 56 | }; 57 | 58 | } 59 | #endif 60 | -------------------------------------------------------------------------------- /test_images/img_108.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/test_images/img_108.jpg -------------------------------------------------------------------------------- /test_images/word_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/test_images/word_2.png -------------------------------------------------------------------------------- /text_box.cpp: -------------------------------------------------------------------------------- 1 | #include "text_box.h" 2 | 3 | /* Stores copies of the given points and text. */ TextBox::TextBox(std::vector& points, std::string text){ 4 | this->points = points; 5 | this->text = text; 6 | } 7 | 8 | /* Writes the axis-aligned bounds of the stored points: p1 receives (min x, min y), p2 receives (max x, max y). */ void TextBox::get_rectangle_box(cv::Point& p1, cv::Point& p2){ 9 | int minx = 100000, miny = 100000, maxx = 0, maxy = 0; /* sentinels: results are only exact for coordinates within [0, 100000]; with no points, p1 is (100000,100000) and p2 is (0,0) */ 10 | for(auto const& value: this->points){ 11 | minx = std::min(minx, value.x); 12 | miny = std::min(miny, value.y); 13 | maxx = std::max(maxx, value.x); 14 | maxy = std::max(maxy, value.y); 15 | } 16 | p1.x = minx; 17 | p1.y = miny; 18 | p2.x = maxx; 19 | p2.y = maxy; 20 | } 21 | 22 | std::ostream &operator<<(std::ostream &os, TextBox &m) { 23 | std::vector points = m.get_points(); 24 | os<<"oriented box: "; 25 | for(int i = 0; i < points.size(); i++){ 26 | os< 5 | #include 6 |
#include 7 | #include 8 | 9 | //opencv 10 | #include 11 | #include "opencv2/opencv.hpp" 12 | 13 | /* A detected text region: its corner points, the recognized text and a score. */ class TextBox{ 14 | public: 15 | TextBox(std::vector& points, std::string text); 16 | void get_rectangle_box(cv::Point& p1, cv::Point& p2); 17 | std::vector& get_points() {return points;} 18 | std::string& get_text() {return text;} 19 | float& get_score() {return score;} 20 | void set_text(std::string s) {text=s;} 21 | private: 22 | std::vector points; 23 | std::string text; 24 | float score = 0.f; /* the constructor never assigns score, so give it a defined value before get_score() hands out a reference */ 25 | }; 26 | 27 | std::ostream &operator<<(std::ostream &os, TextBox &m); 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | #define PI 3.14159265358979323846 4 | 5 | /* Scales image so its longer side is at most max_side_len, then adjusts both sides to multiples of 32; ratio_h and ratio_w receive the resulting per-axis scale factors. */ void resize_image_max_len(const cv::Mat& image, cv::Mat& resized_image, float& ratio_h, float& ratio_w, int max_side_len){ 6 | int height = image.rows; 7 | int width = image.cols; 8 | float ratio = 1; 9 | if(std::max(height, width) > max_side_len) 10 | ratio = height > width ? float(max_side_len)/height: float(max_side_len)/width; 11 | int resize_h = int(height * ratio); 12 | int resize_w = int(width * ratio); 13 | resize_h = resize_h%32 == 0? resize_h : (resize_h/32 - 1) * 32; 14 | resize_w = resize_w%32 == 0?
resize_w : (resize_w/32 - 1) * 32; 15 | cv::resize(image, resized_image, cv::Size(resize_w, resize_h)); 16 | 17 | ratio_h = float(resize_h)/height; 18 | ratio_w = float(resize_w)/width; 19 | } 20 | 21 | void resize_image_fix_height(const cv::Mat& image, cv::Mat& resized_image, float& ratio, int fixed_height){ 22 | int height = image.rows; 23 | int width = image.cols; 24 | ratio = float(fixed_height)/height; 25 | int resize_h = fixed_height; 26 | int resize_w = int(width * ratio); 27 | cv::resize(image, resized_image, cv::Size(resize_w, resize_h)); 28 | } 29 | 30 | void pad_image_width(const cv::Mat& image, cv::Mat& padded_image, int target_width){ 31 | int height = image.rows; 32 | int width = image.cols; 33 | int borderType = cv::BORDER_CONSTANT; 34 | if(width > target_width) 35 | cv::resize(image, padded_image, cv::Size(target_width, height)); 36 | else if(width < target_width){ 37 | int pad_len = target_width - width; 38 | copyMakeBorder(image, padded_image, 0, 0, 0, pad_len, borderType, cv::Scalar(0,0,0)); 39 | }else 40 | padded_image = image.clone(); 41 | } 42 | 43 | tensorflow::Tensor cv_mat_to_tensor(const cv::Mat& image){ 44 | int height = image.rows; 45 | int width = image.cols; 46 | int depth = 3; 47 | tensorflow::Tensor res_tensor(tensorflow::DT_UINT8, tensorflow::TensorShape({1, height, width, 3})); 48 | 49 | //we assume that the image is unsigned char dtype 50 | const unsigned char *source_data = (unsigned char*)(image.data); 51 | 52 | auto tensor_mapped = res_tensor.tensor(); 53 | for (int y = 0; y < height; ++y) { 54 | for (int x = 0; x < width; ++x) { 55 | auto b = source_data[image.step * y + x * image.channels()]; 56 | auto g = source_data[image.step * y + x * image.channels()+1]; 57 | auto r = source_data[image.step * y + x * image.channels()+2]; 58 | tensor_mapped(0, y, x, 0) = r; 59 | tensor_mapped(0, y, x, 1) = g; 60 | tensor_mapped(0, y, x, 2) = b; 61 | } 62 | } 63 | return res_tensor; 64 | } 65 | 66 | cv::Mat tensor_to_cv_mat(const 
tensorflow::Tensor tensor){ 67 | auto tensor_data = tensor.flat(); 68 | //assume it is a 4d tensor 69 | auto tensor_shape = tensor.shape(); 70 | int height = tensor_shape.dim_size(1); 71 | int width = tensor_shape.dim_size(2); 72 | std::cout<<" height "< points = text_box.get_points(); 91 | float offset_y = points[1].y - points[0].y; 92 | float offset_x = points[1].x - points[0].x; 93 | return atan2(offset_y, offset_x); /* angle, in radians, of the edge from points[0] to points[1] */ 94 | } 95 | 96 | /* Computes a crop window around box, enlarged on every side by the longer side of the box's bounding rectangle and clamped to the image. */ void get_cropped_extend_image(cv::Mat& image, TextBox& box, cv::Mat& cropped, std::vector& new_points){ 97 | cv::Point p1, p2; 98 | box.get_rectangle_box(p1, p2); 99 | int height = p2.y - p1.y; 100 | int width = p2.x - p1.x; /* was p1.y - p1.x, which mixes the two axes of one corner; the width is the x-extent between the two corners */ 101 | 102 | int extend_len = std::max(height, width); 103 | int minx = std::max(0, p1.x - extend_len); 104 | int miny = std::max(0, p1.y - extend_len); 105 | int maxx = std::min(image.cols, p2.x + extend_len); 106 | int maxy = std::min(image.rows, p2.y + extend_len); 107 | 108 | std::vector points = box.get_points(); 109 | new_points.resize(points.size()); 110 | for(int i=0; i& points, 126 | float angle, cv::Mat& rotated_image, std::vector& rotated_points){ 127 | int height = cropped.rows, width = cropped.cols; 128 | cv::Point center(width/2, height/2); 129 | int min_side = std::min(height, width); 130 | auto M = cv::getRotationMatrix2D(center, angle * 180./PI, 1.0); /* getRotationMatrix2D takes degrees; angle is converted from radians */ 131 | cv::warpAffine(cropped, rotated_image, M, cv::Size(cropped.cols*2, cropped.rows*2)); 132 | 133 | //rotate the images 134 | rotated_points.resize(points.size()); 135 | for(int i=0; i& points){ 169 | std::vector colors={cv::Scalar(0,0,255), cv::Scalar(0,255,0), 170 | cv::Scalar(255,0,0), cv::Scalar(255,255,0), cv::Scalar(0,255,255), cv::Scalar(255,0,255)}; 171 | for(int j=0; j<4; j++){ /* draws exactly four edges, so points must hold at least four entries */ 172 | cv::line(image, points[j], points[(j+1)%4], colors[j%4], 3); 173 | } 174 | } 175 | 176 | void draw_text_box(cv::Mat& image, TextBox& text_box){ 177 | //draw the polygon 178 | std::vector colors={cv::Scalar(0,0,255), cv::Scalar(0,255,0), 179 |
cv::Scalar(255,0,0), cv::Scalar(255,255,0), cv::Scalar(0,255,255), cv::Scalar(255,0,255)}; 180 | draw_polygon(image, text_box.get_points()); 181 | //draw text above the upper-left corner 182 | cv::Point p1, p2; 183 | text_box.get_rectangle_box(p1, p2); 184 | cv::Point draw_loc(std::max(0, p1.x - 10), std::max(0, p1.y - 10)); /* label anchor: 10 px up and left of the bounding rectangle's p1 corner, clamped at 0 */ 185 | cv::putText(image, text_box.get_text(), draw_loc, cv::FONT_HERSHEY_PLAIN, 1.3, cv::Scalar(0,255,255)); 186 | } 187 | -------------------------------------------------------------------------------- /utils.h: -------------------------------------------------------------------------------- 1 | #ifndef Scene_Text_Utils_H 2 | #define Scene_Text_Utils_H 3 | 4 | #include "tensorflow/core/framework/tensor_shape.pb.h" 5 | #include "tensorflow/core/framework/tensor.h" 6 | #include 7 | 8 | //opencv 9 | #include 10 | #include "opencv2/opencv.hpp" 11 | #include "text_box.h" 12 | #include 13 | 14 | /* Image resizing and padding helpers; the defaults are an 800 px maximum side, a fixed height of 32 and a target width of 128. */ void resize_image_max_len(const cv::Mat& image, cv::Mat& resized_image, float& ratio_h, float& ratio_w, int max_side_len=800); 15 | 16 | void resize_image_fix_height(const cv::Mat& image, cv::Mat& resized_image, float& ratio, int fixed_height=32); 17 | 18 | void pad_image_width(const cv::Mat& image, cv::Mat& padded_image, int target_width=128); 19 | 20 | /* Conversions between cv::Mat and tensorflow::Tensor. */ tensorflow::Tensor cv_mat_to_tensor(const cv::Mat& image); 21 | 22 | cv::Mat tensor_to_cv_mat(const tensorflow::Tensor tensor); 23 | 24 | /* Geometry helpers operating on a TextBox and its points. */ float get_angle(TextBox& text_box); 25 | 26 | void get_cropped_extend_image(cv::Mat& image, TextBox& box, cv::Mat& cropped, std::vector& new_points); 27 | 28 | void rotate_image_and_points(cv::Mat& cropped, std::vector& points, float angle, cv::Mat& rotated_image, std::vector& rotated_points); 29 | 30 | cv::Point rotate_point(cv::Point& point, float angle, cv::Point& center); 31 | 32 | /* Drawing helpers that paint onto image in place. */ void draw_polygon(cv::Mat& image, std::vector& points); 33 | 34 | void draw_text_box(cv::Mat& image, TextBox& text_box); 35 | #endif 36 |
--------------------------------------------------------------------------------