├── .gitignore
├── CMakeLists.txt
├── README.md
├── cmake
    └── Modules
    │   ├── Eigen.cmake
    │   ├── Eigen_VERSION.cmake
    │   └── FindTensorFlow.cmake
├── ctc_scene_text_recognizer.cpp
├── ctc_scene_text_recognizer.h
├── detector.h
├── faster_rcnn_text_detector.cpp
├── faster_rcnn_text_detector.h
├── images
    ├── 1.jpg
    ├── 2.jpg
    ├── 3.jpg
    ├── 4.jpg
    ├── 5.jpg
    └── pipeline.jpg
├── main.cpp
├── recognizer.h
├── scene_text_reader.cpp
├── scene_text_reader.h
├── test_images
    ├── img_108.jpg
    └── word_2.png
├── text_box.cpp
├── text_box.h
├── utils.cpp
└── utils.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | *.obj
 9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 | 
34 | # Build output, downloaded dependencies, models and binaries
35 | build/
36 | external/
37 | model/
38 | bin/
39 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.8)
 2 | project(SceneTextDetection)
 3 | set (CMAKE_CXX_STANDARD 11)
 4 | 
 5 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin)
 6 | 
 7 | if(NOT CMAKE_BUILD_TYPE)
 8 |   set(CMAKE_BUILD_TYPE Release)
 9 | endif()
10 | 
11 | set(CMAKE_CXX_FLAGS "-Wall -Wextra")
12 | set(CMAKE_CXX_FLAGS_DEBUG "-g")
13 | set(CMAKE_CXX_FLAGS_RELEASE "-O3")
14 | 
15 | set(SOURCE_FILES main.cpp faster_rcnn_text_detector.cpp ctc_scene_text_recognizer.cpp utils.cpp scene_text_reader.cpp text_box.cpp)
16 | set(EXECUTABLE DetectText)
17 | #set(EXECUTABLE EndtoEndReading)
18 | #set(EXECUTABLE RecognizeText)
19 | 
20 | # Add modules
21 | list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules")
22 | 
23 | find_package(TensorFlow REQUIRED)
24 | find_package(Protobuf REQUIRED)
25 | find_package(OpenCV REQUIRED highgui imgproc)
26 | 
27 | #set(TensorFlow_LIBRARIES "/usr/local/lib/libtensorflow_cc.so" "/usr/local/lib/libtensorflow_framework.so") 
28 | #set(TensorFlow_LIBRARIES "/usr/local/lib/libtensorflow_cc.so" "/usr/local/lib/libtensorflow.so") 
29 | set(TensorFlow_LIBRARIES "/usr/local/lib/libtensorflow.so" "/usr/local/lib/libtensorflow_cc.so" "/usr/local/lib/libtensorflow_framework.so")
30 | 
31 | message(STATUS "protobuf include dirs:" ${PROTOBUF_INCLUDE_DIRS})
32 | message(STATUS "protobuf libraries:" ${PROTOBUF_LIBRARIES})
33 | message(STATUS "tensorflow include dirs:" ${TensorFlow_INCLUDE_DIRS})
34 | message(STATUS "tensorflow libraries:" ${TensorFlow_LIBRARIES})
35 | 
36 | # set variables for external dependencies
37 | set(EXTERNAL_DIR "${PROJECT_SOURCE_DIR}/external" CACHE PATH "Location where external dependencies will installed")
38 | set(DOWNLOAD_LOCATION "${EXTERNAL_DIR}/src" CACHE PATH "Location where external projects will be downloaded")
39 | 
40 | #change the path to ur tesnorflow installed path
41 | #set(NSYNC_DIR "/usr/local/lib/python2.7/dist-packages/tensorflow/include/external/nsync/public")
42 | 
43 | mark_as_advanced(EXTERNAL_DIR DOWNLOAD_LOCATION)
44 | 
45 | include(Eigen)
46 | 
47 | set(PROJECT_INCLUDE_DIRS ${TensorFlow_INCLUDE_DIRS} ${EXTERNAL_DIR}/include ${PROTOBUF_INCLUDE_DIRS} ${NSYNC_DIR} ${OpenCV_INCLUDE_DIRS})
48 | set(PROJECT_LIBRARIES ${TensorFlow_LIBRARIES} ${PROTOBUF_LIBRARIES} ${OpenCV_LIBS})
49 | set(PROJECT_DEPENDENCIES Eigen)
50 | 
51 | include_directories(${PROJECT_INCLUDE_DIRS})
52 | add_library(SceneTextDetector faster_rcnn_text_detector.cpp utils.cpp)
53 | add_library(CTCSceneTextRecognizer ctc_scene_text_recognizer.cpp utils.cpp)
54 | add_library(SceneTextReader scene_text_reader.cpp faster_rcnn_text_detector.cpp ctc_scene_text_recognizer.cpp utils.cpp text_box.cpp)
55 | add_executable(${EXECUTABLE} ${SOURCE_FILES})
56 | target_link_libraries(${EXECUTABLE} ${PROJECT_LIBRARIES})
57 | add_dependencies(${EXECUTABLE} ${PROJECT_DEPENDENCIES})
58 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # DeepSceneTextReader
  2 | This is a c++ project deploying a deep scene text reading pipeline. It reads text from natural scene images.
  3 | 
  4 | <p align="center">
  5 |   <img src="images/1.jpg" width=1280 height=720>
  6 |   <img src="images/2.jpg" width=288 height=200>
  7 |   <img src="images/3.jpg" width=256 height=200>
  8 |   <img src="images/4.jpg" width=256 height=200>
  9 | </p>
 10 | 
 11 | 
 12 | # Prerequisites
 13 | 
 14 | The project is written in C++ using the TensorFlow framework. It has been tested with TensorFlow 1.4; newer versions should work too, but have not been tested.
 15 | Please install:
 16 | 
 17 | * Tensorflow
 18 | 
 19 | * nsync project: https://github.com/google/nsync.git  This is needed for building tensorflow.
 20 | 
 21 | * opencv3.3
 22 | 
 23 | * protobuf
 24 | 
 25 | * eigen
 26 | 
 27 | Please check this project on how to build project using tensorflow with cmake:
 28 | https://github.com/cjweeks/tensorflow-cmake
 29 | It greatly helped the progress of building this project.
 30 | Be careful when building the TensorFlow library, because this project also uses OpenCV. There still seems to be a problem when TensorFlow and OpenCV are used together:
 31 | it can leave OpenCV unable to read images.
 32 | Check out this issue: https://github.com/tensorflow/tensorflow/issues/14267
 33 | The answer by allenlavoie solved my problem, so I paste it here:
 34 | 
 35 | "In the meantime, as long as you're not using any custom ops you can build libtensorflow_cc.so with bazel build --config=monolithic, which will condense everything together into one shared object (no libtensorflow_framework dependence) and seal off non-TensorFlow symbols. That shared object will have protocol buffer symbols."
 36 | 
 37 | # Status
 38 | Currently two pretrained models are provided: one for scene text detection and one for scene text recognition.
 39 | More models will be provided.
 40 | Note that the current models are not very robust. You can easily swap in your own trained model.
 41 | The models will be continuously updated.
 42 | 
 43 | # build process
 44 | 
 45 | mkdir -p build && cd build
 46 | 
 47 | cmake ..
 48 | 
 49 | make
 50 | 
 51 | It will create an executable named **DetectText** in the bin folder.
 52 | 
 53 | # Usage:
 54 | The executable can be run in three modes:  (1) Detect  (2) Recognize  (3) Detect and Recognize
 55 | 
 56 | ## Detect
 57 | Download the pretrained detector model and put it in model/
 58 | 
 59 | ./DetectText --detector_graph='model/Detector_model.pb' \
 60 |    --image_filename='test_images/test_img1.jpg' --mode='detect' --output_filename='results/output_image.jpg'
 61 | 
 62 | ## Recognize
 63 | Download the pretrained recognizer model and put it in model/
 64 | Download the dictionary file and put it in model
 65 | 
 66 | 
 67 | ./DetectText --recognizer_graph='model/Recognizer_model.pb'  \
 68 |    --image_filename='test_images/recognize_image1.jpg' --mode='recognize' \
 69 |    --im_height=32  --im_width=128
 70 | 
 71 | ## Detect and Recognize
 72 | Download the pretrained detector and recognizer model and put it in model/ as described previously.
 73 | 
 74 | ./DetectText --recognizer_graph='model/Recognizer_model.pb' --detector_graph='model/Detector_model.pb' \
 75 |    --image_filename='test_images/test_img1.jpg' --mode='detect_and_read' --output_filename='results/output_image.jpg' 
 76 | 
 77 | # Model Description
 78 | ### *Detector*
 79 | 1. Faster RCNN Detector Model
 80 | The detector is trained with a modified TensorFlow [object detection API](https://github.com/tensorflow/models/tree/master/research/object_detection).
 81 | I modify it by changing the proposal scheme to regress to the 4 coordinates of the oriented bounding box rather than regular rectangular bounding box.
 82 | Check out this [repo](https://github.com/dafanghe/Tensorflow_SceneText_Oriented_Box_Predictor) for the training code.
 83 | Pretrained model: FasterRCNN_detector_model.pb
 84 | 
 85 | 2. R2CNN will be updated. See [R2CNN](https://arxiv.org/abs/1706.09579) for details.
 86 | The code is also based on a modified TensorFlow [object detection API](https://github.com/tensorflow/models/tree/master/research/object_detection).
 87 | The training code will be released soon.
 88 | 
 89 | 
 90 | ### *Recognizer*
 91 | 1. CTC scene text recognizer.
 92 | The recognizer model follows the famous scene text recognition [CRNN model](https://arxiv.org/abs/1507.05717)
 93 | 
 94 | 2. Spatial Attention OCR will be updated soon. It is based on [GoogleOCR](https://github.com/tensorflow/models/tree/master/research/attention_ocr)
 95 | 
 96 | ### *Detect and Recognize*
 97 | The whole scene text reading pipeline detects the text and rotate it horizontally and read it with recognizer.
 98 | The pipeline is here:
 99 | 
100 | <p align="center">
101 |   <img src="images/pipeline.jpg" width=1280 height=436>
102 | </p>
103 | 
104 | ### *Pretrained Models*
105 | You can play with the code with provided pretrained models. \
106 | They are not fully optimized yet, but could be used for being familiar with the code. \
107 | Check them out here: [models](https://drive.google.com/drive/folders/1Ao0ZrSVf0YjU6pnzGY0C3QJ2Qz0ljRIU?usp=sharing) 
108 | 
109 | You will find two detection models called: (1) **FasterRCNN_detector_model.pb** (2) **R2CNN_detector_model.pb** \
110 | Two recognition models with their charset: (1) **Recognizer_model.pb + charset_full.txt** and (2)**Recognizer_model_case_insen.pb + charset_case_insen.txt**. \
111 | Full charset means English letters + digit and case insen means case insensitive English letters + digit.
112 | Let me know if you have any problems using them.
113 | 
114 | 
115 | # Reference and Related Projects
116 | - [Faster RCNN](https://arxiv.org/abs/1506.01497) Faster RCNN paper.
117 | - [Tensorflow Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection).
118 | - [An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition](https://arxiv.org/abs/1507.05717), reference paper for CRNN model.
119 | - [tensorflow-cmake](https://github.com/cjweeks/tensorflow-cmake), Tutorial of Building Project with tensorflow using cmake.
120 | - [R2CNN](https://arxiv.org/abs/1706.09579) Reference paper for R2CNN.
121 | 
122 | # Contact:
123 | 
124 | * Dafang He. The Penn State University.  hdfcraig@gmail.com   http://personal.psu.edu/duh188/
125 | 


--------------------------------------------------------------------------------
/cmake/Modules/Eigen.cmake:
--------------------------------------------------------------------------------
 1 | include(ExternalProject)
 2 | include(Eigen_VERSION)
 3 | 
 4 | set(Eigen_INSTALL ${EXTERNAL_DIR}/include/eigen/${Eigen_DIR})
 5 | 
 6 | set(Eigen_INCLUDE_DIRS
 7 |         ${PROJECT_SOURCE_DIR}/external/include/eigen
 8 |         ${Eigen_INSTALL})
 9 | 
10 | ExternalProject_Add(Eigen
11 |         PREFIX ${PROJECT_SOURCE_DIR}/external/src/eigen
12 |         URL ${Eigen_URL}
13 |         URL_HASH ${Eigen_HASH}
14 |         DOWNLOAD_DIR ${DOWNLOAD_LOCATION}
15 | 
16 |         CMAKE_ARGS
17 |         -DCMAKE_BUILD_TYPE:STRING=Release
18 |         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
19 |         -DCMAKE_INSTALL_PREFIX:STRING=${Eigen_INSTALL}
20 |         -DINCLUDE_INSTALL_DIR:STRING=${Eigen_INSTALL})
21 | 
22 | include_directories(${Eigen_INCLUDE_DIRS})
23 | 


--------------------------------------------------------------------------------
/cmake/Modules/Eigen_VERSION.cmake:
--------------------------------------------------------------------------------
# Pinned Eigen snapshot consumed by Eigen.cmake.

# Revision identifier of the snapshot; it is also the archive file name.
set(Eigen_ARCHIVE_HASH 429aa5254200)

# Download location of the archive and the checksum used to verify it.
set(Eigen_URL
    http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/${Eigen_ARCHIVE_HASH}.tar.gz)
set(Eigen_HASH
    SHA256=61d8b6fc4279dd1dda986fb1677d15e3d641c07a3ea5abe255790b1f0c0c14e9)

# Sub-directory name used by Eigen.cmake when building the install path.
set(Eigen_DIR eigen3)

# NOTE(review): not referenced by Eigen.cmake; kept for compatibility.
set(Eigen_INSTALL_DIR /usr/local)
7 | 


--------------------------------------------------------------------------------
/cmake/Modules/FindTensorFlow.cmake:
--------------------------------------------------------------------------------
 1 | # Locates the tensorFlow library and include directories.
 2 | 
 3 | include(FindPackageHandleStandardArgs)
 4 | unset(TENSORFLOW_FOUND)
 5 | 
 6 | find_path(TensorFlow_INCLUDE_DIR
 7 |         NAMES
 8 |         tensorflow/core
 9 |         tensorflow/cc
10 |         third_party
11 |         HINTS
12 |         /usr/local/include/google/tensorflow
13 |         /usr/include/google/tensorflow)
14 | 
15 | find_library(TensorFlow_LIBRARY
16 |         NAMES
17 |         tensorflow_cc
18 |         HINTS
19 |         /usr/lib
20 |         /usr/local/lib)
21 | 
22 | #tensorflow_all
23 | # set TensorFlow_FOUND
24 | find_package_handle_standard_args(TensorFlow DEFAULT_MSG TensorFlow_INCLUDE_DIR TensorFlow_LIBRARY)
25 | 
26 | # set external variables for usage in CMakeLists.txt
27 | if(TENSORFLOW_FOUND)
28 |     set(TensorFlow_LIBRARIES ${TensorFlow_LIBRARY})
29 |     set(TensorFlow_INCLUDE_DIRS ${TensorFlow_INCLUDE_DIR})
30 | endif()
31 | 
32 | # hide locals from GUI
33 | mark_as_advanced(TensorFlow_INCLUDE_DIR TensorFlow_LIBRARY)
34 | 


--------------------------------------------------------------------------------
/ctc_scene_text_recognizer.cpp:
--------------------------------------------------------------------------------
  1 | #include "ctc_scene_text_recognizer.h"
  2 | 
  3 | 
  4 | CTCSceneTextRecognizer::CTCSceneTextRecognizer(){
  5 |   init_constant_vars(); 
  6 | }
  7 | 
  8 | 
  9 | CTCSceneTextRecognizer::CTCSceneTextRecognizer(std::string frozen_graph_filename, std::string dictionary_filename, int _im_height, int _im_width):Recognizer(frozen_graph_filename, dictionary_filename){
 10 |   init_constant_vars(_im_height, _im_width);
 11 | }
 12 | 
 13 | 
 14 | bool CTCSceneTextRecognizer::init(const std::string frozen_graph_filename, const std::string dictionary_filename){
 15 |   this->init_graph(frozen_graph_filename); 
 16 |   this->init_dictionary(dictionary_filename);
 17 |   return true;
 18 | }
 19 | 
 20 | 
 21 | void CTCSceneTextRecognizer::init_constant_vars(int _im_height, int _im_width){
 22 |   std::string input_layer_string = "input_images:0,input_seq_lens:0";
 23 |   std::string output_layer_string = "CTCBeamSearchDecoder:0,CTCBeamSearchDecoder:1,CTCBeamSearchDecoder:2";
 24 |   this->input_layers = str_util::Split(input_layer_string, ',');
 25 |   this->output_layers = str_util::Split(output_layer_string, ',');
 26 |   this->seq_len = 29;
 27 |   this->image_width = _im_width;  //input image width;
 28 |   this->image_height = _im_height;  //input image height
 29 |   this->width_scale_ratio = 1.2; //scale the width for better recognition
 30 | }
 31 | 
 32 |     
 33 | void CTCSceneTextRecognizer::preprocess_image(cv::Mat& input_image, cv::Mat& output_image){
 34 |   cv::Mat resized_image, padded_image;
 35 |   int new_width = int(this->width_scale_ratio * input_image.cols);
 36 |   cv::resize(input_image, input_image, cv::Size(new_width, input_image.rows));
 37 |   float ratio=0;
 38 |   resize_image_fix_height(input_image, resized_image, ratio, this->image_height);
 39 |   pad_image_width(resized_image, output_image, this->image_width);
 40 | }
 41 | 
 42 | 
 43 | string CTCSceneTextRecognizer::run_graph(const cv::Mat& image){
 44 |   int height = image.rows;
 45 |   int width = image.cols;
 46 |   Tensor input_img_tensor(DT_FLOAT, TensorShape({1, height, width, 3}));
 47 | 
 48 |   unsigned char *input_data = (unsigned char*)(image.data); 
 49 |   auto input_tensor_mapped = input_img_tensor.tensor<float, 4>();
 50 |   //(TODO) is there any other ways to copy the data into tensor?
 51 |   for (int y = 0;y < height; ++y) {
 52 |     for (int x = 0;x < width; ++x) {
 53 |       unsigned char b = input_data[image.step * y + x * image.channels()];
 54 |       unsigned char g = input_data[image.step * y + x * image.channels() + 1];
 55 |       unsigned char r = input_data[image.step * y + x * image.channels() + 2];
 56 |       input_tensor_mapped(0, y, x, 0) = float(r);
 57 |       input_tensor_mapped(0, y, x, 1) = float(g);
 58 |       input_tensor_mapped(0, y, x, 2) = float(b);
 59 |     }
 60 |   }
 61 |   //create the seq len tensor and assign fixed value
 62 |   Tensor input_seq_len_tensor(DT_INT32, TensorShape({1}));
 63 |   auto input_seq_len_mapped = input_seq_len_tensor.tensor<int, 1>();
 64 |   input_seq_len_mapped(0) = this->seq_len;
 65 | 
 66 |   //create the input to run
 67 |   std::vector<std::pair<string, Tensor> > inputs = {
 68 |     {this->input_layers[0], input_img_tensor}, 
 69 |     {this->input_layers[1], input_seq_len_tensor},
 70 |   };
 71 | 
 72 |   std::vector<Tensor> outputs;
 73 |   Status run_status = this->session->Run(inputs,
 74 |             this->output_layers, {}, &outputs);
 75 |   if (!run_status.ok()) {
 76 |     LOG(ERROR) << "Running model failed: " << run_status;
 77 |     return "";
 78 |   }
 79 |   LOG(INFO) <<"number of output:"<<outputs.size();
 80 |   
 81 |   auto indices = outputs[0].flat_outer_dims<long long>();
 82 |   auto values = outputs[1].flat_outer_dims<long long>();
 83 |   
 84 |   const Eigen::Tensor<float, indices.NumDimensions>::Dimensions& indices_dim = indices.dimensions();
 85 |   const Eigen::Tensor<float, values.NumDimensions>::Dimensions& values_dim = values.dimensions();
 86 |  
 87 |   LOG(INFO) << outputs[0].DebugString();
 88 |   LOG(INFO) << outputs[1].DebugString();
 89 |   std::vector<int> encoded_text;
 90 |   for(int i=0; i<values_dim[0]; i++){
 91 |     for(int j=0; j<values_dim[1]; j++){
 92 |       encoded_text.push_back(values(i,j));
 93 |     }
 94 |   }
 95 |   std::string decoded_text = decode_single_text(encoded_text); 
 96 |   return decoded_text;
 97 | }
 98 |     
 99 | 
100 | std::vector<cv::Mat> CTCSceneTextRecognizer::preprocess_images(std::vector<cv::Mat>& input_images){
101 |   std::vector<cv::Mat> processed_images(input_images.size());
102 |   for(int i=0; i<input_images.size(); i++){
103 |     cv::Mat preprocessed_image;
104 |     this->preprocess_image(input_images[i], preprocessed_image);
105 |     processed_images[i] = preprocessed_image;
106 |   }
107 |   return processed_images;
108 | }
109 |     
110 | 
111 | std::vector<std::string> CTCSceneTextRecognizer::run_graph(const std::vector<cv::Mat> images){
112 |   //the images must be preprocessd and has the same height and width!!
113 |   std::vector<std::string> res;
114 |   int num_word = images.size();
115 |   if(num_word == 0) return res;
116 | 
117 |   int height = this->image_height;
118 |   int width = this->image_width;
119 |   Tensor input_img_tensor(DT_FLOAT, TensorShape({num_word, height, width, 3}));
120 |   auto input_tensor_mapped = input_img_tensor.tensor<float, 4>();
121 |   //create the seq len tensor and assign fixed value for ctc
122 |   Tensor input_seq_len_tensor(DT_INT32, TensorShape({num_word}));
123 |   auto input_seq_len_mapped = input_seq_len_tensor.tensor<int, 1>();
124 | 
125 |   for(int i=0; i<num_word; i++){
126 |     const cv::Mat& image = images[i];
127 |     //std::cout<<"assign image to tensor"<<i<<" "<<image.rows<<" "<<image.cols<<std::endl;
128 |     assert (image.rows == height);
129 |     assert (image.cols == width);
130 |     const unsigned char *input_data = (const unsigned char*)(image.data); 
131 |     //(TODO) is there any other ways to copy the data into tensor?
132 |     for (int y = 0;y < height; ++y) {
133 |       for (int x = 0;x < width; ++x) {
134 |         const unsigned char b = input_data[image.step * y + x * image.channels()];
135 |         const unsigned char g = input_data[image.step * y + x * image.channels() + 1];
136 |         const unsigned char r = input_data[image.step * y + x * image.channels() + 2];
137 |         input_tensor_mapped(i, y, x, 0) = float(r);
138 |         input_tensor_mapped(i, y, x, 1) = float(g);
139 |         input_tensor_mapped(i, y, x, 2) = float(b);
140 |       }
141 |     }
142 |     input_seq_len_mapped(i) = this->seq_len;
143 |   }
144 |   //create the input to run
145 |   std::vector<std::pair<string, Tensor> > inputs = {
146 |     {this->input_layers[0], input_img_tensor}, 
147 |     {this->input_layers[1], input_seq_len_tensor},
148 |   };
149 | 
150 |   //std::cout<<"run recognition graph"<<std::endl;
151 |   std::vector<Tensor> outputs;
152 |   Status run_status = this->session->Run(inputs,
153 |             this->output_layers, {}, &outputs);
154 |   if (!run_status.ok()) {
155 |     LOG(ERROR) << "Running model failed: " << run_status;
156 |     return res;
157 |   }
158 |   LOG(INFO) <<"number of output:"<<outputs.size();
159 |  
160 |   //std::cout<<outputs[0].DebugString()<<std::endl;
161 |   //std::cout<<outputs[1].DebugString()<<std::endl;
162 |   auto indices_shape = outputs[0].shape();
163 | 
164 |   auto indices = outputs[0].tensor<long long, 2>();
165 |   auto values = outputs[1].tensor<long long, 1>();
166 |   
167 |   //const Eigen::Tensor<float, indices.NumDimensions>::Dimensions& indices_dim = indices.dimensions();
168 |   //const Eigen::Tensor<float, values.NumDimensions>::Dimensions& values_dim = values.dimensions();
169 | 
170 |   std::vector<std::vector<int> > encoded_texts(num_word);
171 |   for(int i=0; i<indices_shape.dim_size(0); i++){
172 |     encoded_texts[indices(i, 0)].push_back(values(i));
173 |   }
174 |   
175 |   for(int i=0; i<num_word; i++){
176 |     res.push_back(decode_single_text(encoded_texts[i]));
177 |   }
178 |   return res;
179 | }
180 | 


--------------------------------------------------------------------------------
/ctc_scene_text_recognizer.h:
--------------------------------------------------------------------------------
 1 | #ifndef CTC_Scene_Text_Recognizer_H
 2 | #define CTC_Scene_Text_Recognizer_H
 3 | 
 4 | #include <iostream>
 5 | #include <memory>
 6 | #include <string>
 7 | #include <vector>
 8 | #include <unordered_map>
 9 | 
10 | #include "tensorflow/core/framework/tensor_shape.pb.h"
11 | #include "tensorflow/core/framework/tensor.h"
12 | #include "tensorflow/core/graph/graph.h"
13 | #include "tensorflow/core/lib/strings/str_util.h"
14 | #include "tensorflow/core/platform/logging.h"
15 | #include "tensorflow/core/platform/platform.h"
16 | #include "tensorflow/core/platform/types.h"
17 | 
18 | //opencv
19 | #include <opencv2/core.hpp>
20 | #include "opencv2/opencv.hpp"
21 | 
22 | #include "utils.h"
23 | 
24 | #include "recognizer.h"
25 | 
26 | using namespace tensorflow;
27 | 
28 | 
29 | class CTCSceneTextRecognizer: public Recognizer{
30 |   public:
31 |     CTCSceneTextRecognizer();
32 | 
33 |     CTCSceneTextRecognizer(const std::string frozen_graph_filename, const std::string dictionary_filename,
34 |             int _im_height=32, int _im_width=128);
35 |     
36 |     bool init(const std::string frozen_graph_filename, const std::string);
37 |     void preprocess_image(cv::Mat& input_image, cv::Mat& output_image);
38 |     std::vector<cv::Mat> preprocess_images(std::vector<cv::Mat>& input_images);
39 |     std::string run_graph(const cv::Mat& image);
40 |     std::vector<std::string> run_graph(const std::vector<cv::Mat> input_images);
41 | 
42 |   private:
43 |     void init_constant_vars(int _im_height=32, int _im_width=128);
44 |     float width_scale_ratio;
45 |     int seq_len;
46 |     int image_width;
47 |     int image_height;
48 | };
49 | 
50 | #endif 
51 | 


--------------------------------------------------------------------------------
/detector.h:
--------------------------------------------------------------------------------
 1 | #ifndef Detector_H
 2 | #define Detector_H
 3 | 
 4 | #include <vector>
 5 | #include <string>
 6 | 
 7 | //opencv
 8 | #include <opencv2/core.hpp>
 9 | #include "opencv2/opencv.hpp"
10 | 
11 | //tensorflow
12 | #include "tensorflow/core/platform/logging.h"
13 | #include "tensorflow/core/public/session.h"
14 | #include "tensorflow/core/framework/graph.pb.h"
15 | #include "tensorflow/core/graph/graph.h"
16 | #include "tensorflow/core/public/session.h"
17 | #include "tensorflow/core/lib/strings/str_util.h"
18 | 
19 | #include "text_box.h"
20 | 
21 | 
22 | class Detector{
23 |   public:
24 |     Detector(){};
25 |     Detector(const std::string frozen_graph_filename){
26 |       init_graph(frozen_graph_filename);
27 |     }
28 |     bool init_graph(const std::string& frozen_graph_filename){
29 |       if (!ReadBinaryProto(tensorflow::Env::Default(), frozen_graph_filename, &graph_def).ok()) {
30 |         LOG(ERROR) << "error when reading proto" << frozen_graph_fliename;
31 |         return -1;
32 |       } 
33 |       
34 |       tensorflow::SessionOptions sess_opt;
35 |       sess_opt.config.mutable_gpu_options()->set_allow_growth(true);
36 |       (&session)->reset(tensorflow::NewSession(sess_opt));
37 |       if (!session->Create(graph_def).ok()) {
38 |         LOG(ERROR) << "error create graph";
39 |         return -1;
40 |       }
41 |     }
42 |     virtual int run_graph(const cv::Mat& image, std::vector<TextBox>& results) = 0;
43 | 
44 |     tensorflow::GraphDef graph_def;
45 |     std::string input_layer; //for detector, we assume there is only one input
46 |     std::unique_ptr<tensorflow::Session> session;
47 |     std::vector<std::string> output_layers;
48 | };
49 | 
50 | #endif
51 | 


--------------------------------------------------------------------------------
/faster_rcnn_text_detector.cpp:
--------------------------------------------------------------------------------
 1 | #include "faster_rcnn_text_detector.h"
 2 | 
 3 | 
 4 | FasterRCNNTextDetector::FasterRCNNTextDetector(const std::string frozen_graph_filename): Detector(frozen_graph_filename) {
 5 |   this->init_constants();
 6 | }
 7 | 
 8 | 
 9 | bool FasterRCNNTextDetector::init_constants(){
10 |   input_layer = "image_tensor:0";
11 |   output_layers = str_util::Split("detection_boxes:0,detection_scores:0,detection_classes:0,detection_oriented_boxes:0,num_detections:0", ',');
12 |   score_thresh = 0.6;
13 | }
14 | 
15 | 
16 | int FasterRCNNTextDetector::run_graph(const cv::Mat& image, std::vector<TextBox>& results){
17 |   cv::Mat resized_image;
18 |   float ratio_h=0, ratio_w=0;
19 |   resize_image_max_len(image, resized_image, ratio_h, ratio_w);
20 |   auto input_tensor = cv_mat_to_tensor(resized_image);
21 | 
22 |   std::vector<Tensor> outputs;
23 |   Status run_status = this->session->Run({{this->input_layer, input_tensor}},
24 |                                    this->output_layers, {}, &outputs);
25 |   if (!run_status.ok()) {
26 |     LOG(ERROR) << "Running model failed: " << run_status;
27 |     return -1;
28 |   }
29 |   LOG(INFO) <<"number of output:"<<outputs.size();
30 | 
31 |   auto detection_boxes = outputs[0].tensor<float, 3>();
32 |   auto detection_scores = outputs[1].tensor<float, 2>();
33 |   auto detection_classes = outputs[2].tensor<float, 2>();
34 |   auto detection_oriented_boxes = outputs[3].tensor<float, 4>();
35 | 
36 |   int num_box = detection_boxes.dimension(1);
37 |   for(int i=0;i<num_box;i++){
38 |     if(detection_scores(0, i) > this->score_thresh){
39 |       std::vector<cv::Point> points;
40 |       for(int j=0; j<4; j++){
41 |         cv::Point p;
42 |         p.x = int(detection_oriented_boxes(0, i, j, 1) * image.cols);
43 |         p.y = int(detection_oriented_boxes(0, i, j, 0) * image.rows);
44 |         points.push_back(p);
45 |       }
46 |       TextBox tb(points, "");
47 |       results.push_back(tb);
48 |     }
49 |   }
50 | }
51 | 
52 | 


--------------------------------------------------------------------------------
/faster_rcnn_text_detector.h:
--------------------------------------------------------------------------------
 1 | #ifndef FasterRCNN_Text_Detector_H
 2 | #define FasterRCNN_Text_Detector_H
 3 | 
 4 | #include <iostream>
 5 | #include <memory>
 6 | #include <string>
 7 | #include <vector>
 8 | 
 9 | //tensorflow
10 | #include "tensorflow/core/framework/graph.pb.h"
11 | #include "tensorflow/core/framework/tensor_shape.pb.h"
12 | #include "tensorflow/core/framework/tensor.h"
13 | #include "tensorflow/core/graph/graph.h"
14 | #include "tensorflow/core/lib/strings/str_util.h"
15 | #include "tensorflow/core/platform/logging.h"
16 | #include "tensorflow/core/platform/platform.h"
17 | #include "tensorflow/core/platform/types.h"
18 | #include "tensorflow/core/public/session.h"
19 | 
20 | //opencv
21 | #include <opencv2/core.hpp>
22 | #include "opencv2/opencv.hpp"
23 | 
24 | #include "detector.h"
25 | #include "text_box.h"
26 | #include "utils.h"
27 | 
28 | using namespace tensorflow;
29 | 
30 | 
31 | class FasterRCNNTextDetector: public Detector{
32 |   public:
33 |     FasterRCNNTextDetector(){};
34 | 
35 |     FasterRCNNTextDetector(const std::string frozen_graph_filename);
36 |     
37 |     bool init_constants();
38 |     int run_graph(const cv::Mat& image, std::vector<TextBox>& results);
39 | 
40 |   private:
41 |     float score_thresh;
42 | };
43 | 
44 | #endif 
45 | 


--------------------------------------------------------------------------------
/images/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/images/1.jpg


--------------------------------------------------------------------------------
/images/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/images/2.jpg


--------------------------------------------------------------------------------
/images/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/images/3.jpg


--------------------------------------------------------------------------------
/images/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/images/4.jpg


--------------------------------------------------------------------------------
/images/5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/images/5.jpg


--------------------------------------------------------------------------------
/images/pipeline.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/images/pipeline.jpg


--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <fstream>
  2 | #include <memory>
  3 | #include <string>
  4 | #include <vector>
  5 | #include <opencv2/core.hpp>
  6 | 
  7 | #include "tensorflow/core/platform/init_main.h"
  8 | #include "tensorflow/core/util/command_line_flags.h"
  9 | #include "faster_rcnn_text_detector.h"
 10 | #include "ctc_scene_text_recognizer.h"
 11 | #include "scene_text_reader.h"
 12 | #include "utils.h"
 13 | 
 14 | // Runs the Faster R-CNN text detector on one image and writes a copy with every detected box
 15 | // outlined. Returns 0 on success and -1 when the image cannot be read or the result written.
 16 | int detect_text(string& detector_graph_filename, string& image_filename, string& output_filename) {
 17 |   LOG(INFO)<<"start text detection:";
 18 |   FasterRCNNTextDetector detector(detector_graph_filename);
 19 |   cv::Mat image = cv::imread(image_filename);
 20 |   if(image.empty()){  // imread hands back an empty Mat when the file is missing or unreadable
 21 |     LOG(ERROR) <<  "Could not open or find the image " << image_filename;
 22 |     return -1;
 23 |   }
 24 |   const std::vector<cv::Scalar> colors={cv::Scalar(0,0,255), cv::Scalar(0,255,0),
 25 |     cv::Scalar(255,0,0), cv::Scalar(255,255,0), cv::Scalar(0,255,255), cv::Scalar(255,0,255)};
 26 |   std::vector<TextBox> res;
 27 |   detector.run_graph(image, res);
 28 |   for(size_t i=0; i<res.size(); i++){
 29 |     const std::vector<cv::Point>& points = res[i].get_points();
 30 |     if(points.size() < 4) continue;  // the edge loop below indexes four corners
 31 |     for(int j=0; j<4; j++)
 32 |       cv::line(image, points[j], points[(j+1)%4], colors[j%4], 3);  // one colour per edge
 33 |   }
 34 | 
 35 |   if(!cv::imwrite(output_filename, image)){  // a failed write used to be reported as success
 36 |     LOG(ERROR) << "Could not write the output image " << output_filename;
 37 |     return -1;
 38 |   }
 39 |   return 0;
 40 | }
 41 | 
 42 | 
 43 | // Recognises the text in a single word image and logs the prediction.
 44 | // Returns 0 on success and -1 when the image cannot be loaded.
 45 | int recognize_text(string& recognizer_graph_filename, string& dictionary_filename,
 46 |     string& image_filename, int im_height=32, int im_width=128) {
 47 |   LOG(INFO) <<"start text recognition: "<<recognizer_graph_filename;
 48 |   CTCSceneTextRecognizer recognizer(recognizer_graph_filename, dictionary_filename, im_height, im_width);
 49 |   cv::Mat word_image = cv::imread(image_filename);
 50 |   if(word_image.data == nullptr){  // no pixel buffer means the read failed
 51 |     LOG(ERROR) <<  "Could not open or find the image " << image_filename;
 52 |     return -1;
 53 |   }
 54 |   LOG(INFO)<<" read text image: "<<word_image.rows<<" "<<word_image.cols;
 55 |   cv::Mat network_input;  // filled in by preprocess_image
 56 |   recognizer.preprocess_image(word_image, network_input);
 57 |   const string prediction = recognizer.run_graph(network_input);
 58 |   LOG(INFO)<<"prediction : "<<prediction;
 59 |   return 0;
 60 | }
 61 | 
 62 | 
 63 | // Detects and recognises all text in the image, prints each box and writes an annotated copy.
 64 | // Returns 0 on success, -1 when the image cannot be read or the annotated copy cannot be written.
 65 | int end_to_end_reading(string& detector_graph_filename, string& recognizer_graph_filename,
 66 |     string& dictionary_filename, string& image_filename, string& output_filename) {
 67 |   scene_text_reader::SceneTextReader reader(detector_graph_filename,
 68 |       recognizer_graph_filename, dictionary_filename);
 69 |   cv::Mat image = cv::imread(image_filename);
 70 |   if(image.empty()){  // imread hands back an empty Mat when the file is missing or unreadable
 71 |     LOG(ERROR) <<  "Could not open or find the image " << image_filename;
 72 |     return -1;
 73 |   }
 74 |   std::vector<TextBox> res;
 75 |   reader.read_text(image, res);
 76 |   for(size_t i=0; i<res.size(); i++){
 77 |     std::cout<<res[i];
 78 |     draw_text_box(image, res[i]);
 79 |   }
 80 |   const bool written = cv::imwrite(output_filename, image);
 81 |   if(!written) LOG(ERROR) << "Could not write the output image " << output_filename;
 82 |   return written ? 0 : -1;  // the function used to end without returning a value
 83 | }
 84 | 
 85 | 
 86 | // Entry point: parses the command-line flags and dispatches to the requested mode.
 87 | // The exit status is the status of the selected mode, or -1 on any usage error.
 88 | int main(int argc, char** argv) {
 89 |   string detector_graph = "";
 90 |   string recognizer_graph = "";
 91 |   string dictionary_filename = "";
 92 |   string image_filename = "";
 93 |   string output_filename = "";
 94 |   // Same defaults as recognize_text(); without them an omitted flag passed indeterminate values on.
 95 |   int im_height = 32, im_width = 128;
 96 |   string mode = "";
 97 |   std::vector<Flag> flag_list = {
 98 |     Flag("detector_graph", &detector_graph, "detector graph file name"),
 99 |     Flag("recognizer_graph", &recognizer_graph, "recognizer graph file name"),
100 |     Flag("im_height", &im_height, "image height for recognition model"),
101 |     Flag("im_width", &im_width, "image width for recognition model"),
102 |     Flag("dictionary_filename", &dictionary_filename, "dictionary filename for decode the recognition results"),
103 |     Flag("image_filename", &image_filename, "the filename to be tested."),
104 |     Flag("output_filename", &output_filename, "the output filename"),
105 |     Flag("mode", &mode, "the mode, must be within the three categories: detect, recognize, detect_and_read"),
106 |   };
107 |   string usage = Flags::Usage(argv[0], flag_list);
108 |   const bool parse_result = Flags::Parse(&argc, argv, flag_list);
109 |   if (!parse_result) {
110 |     LOG(ERROR) << usage;
111 |     return -1;
112 |   }
113 | 
114 |   ::tensorflow::port::InitMain(argv[0], &argc, &argv);
115 |   if (argc > 1) {  // anything still left in argv is an argument no flag consumed
116 |     LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
117 |     return -1;
118 |   }
119 | 
120 |   // Propagate each mode's status instead of always exiting with 0.
121 |   if(mode == "detect"){
122 |     return detect_text(detector_graph, image_filename, output_filename);
123 |   }else if(mode == "recognize"){
124 |     return recognize_text(recognizer_graph, dictionary_filename, image_filename, im_height, im_width);
125 |   }else if(mode == "detect_and_read"){
126 |     return end_to_end_reading(detector_graph, recognizer_graph,
127 |       dictionary_filename, image_filename, output_filename);
128 |   }
129 |   LOG(ERROR) << "mode should be within: detect, recognize, detect_and_read";
130 |   return -1; }
131 | 


--------------------------------------------------------------------------------
/recognizer.h:
--------------------------------------------------------------------------------
 1 | #ifndef Recognizer_H
 2 | #define Recognizer_H
 3 | 
 4 | #include <vector>
 5 | #include <unordered_map>
 6 | #include <string>
 7 | 
 8 | //opencv
 9 | #include <opencv2/core.hpp>
10 | #include "opencv2/opencv.hpp"
11 | 
12 | //tensorflow
13 | #include "tensorflow/core/platform/logging.h"
14 | #include "tensorflow/core/public/session.h"
15 | #include "tensorflow/core/framework/graph.pb.h"
16 | #include "tensorflow/core/graph/graph.h"
17 | #include "tensorflow/core/public/session.h"
18 | #include "tensorflow/core/lib/strings/str_util.h"
19 | 
20 | 
21 | // Abstract base for text recognizers. It owns the TensorFlow session and the index -> character
22 | // dictionary; subclasses provide the preprocessing and the two run_graph overloads.
23 | class Recognizer{
24 |   public:
25 |     Recognizer(){};
26 |     // Loads the dictionary, then the frozen graph; the result of init_graph is not checked here.
27 |     Recognizer(const std::string& recognizer_graph_filename, const std::string& dictionary_filename){
28 |       init_dictionary(dictionary_filename);
29 |       init_graph(recognizer_graph_filename);
30 |     };
31 |     // Reads a frozen GraphDef and creates a session for it. Returns true on success, false on failure.
32 |     bool init_graph(const std::string& frozen_graph_filename){
33 |       if (!ReadBinaryProto(tensorflow::Env::Default(), frozen_graph_filename, &graph_def).ok()) {
34 |         LOG(ERROR) << "Read proto";
35 |         return false;  // the former `return -1` converted to true and reported success
36 |       }
37 |       tensorflow::SessionOptions sess_opt;
38 |       sess_opt.config.mutable_gpu_options()->set_allow_growth(true);
39 |       session.reset(tensorflow::NewSession(sess_opt));
40 |       if (!session || !session->Create(graph_def).ok()) {
41 |         LOG(ERROR) << "Create graph";
42 |         return false;
43 |       }
44 |       return true;  // previously control fell off the end of a non-void function
45 |     }
46 |     // Parses "index,character" tokens into `mapping`. Exits the process if the file cannot be opened.
47 |     bool init_dictionary(const std::string& filename){
48 |       std::ifstream inf(filename, std::ios::in);
49 |       if(!inf.is_open())
50 |       { LOG(ERROR)<<"Error dictionary opening file "<<filename; std::exit(1); }
51 |       LOG(INFO) <<"read dictionary file "<<filename;
52 |       std::string line;
53 |       while(inf>>line){  // looping on the extraction stops cleanly at EOF (no stale last token)
54 |         const std::string::size_type comma = line.find(',');
55 |         if(comma == std::string::npos || comma == 0 || comma + 1 >= line.size()) continue;  // malformed
56 |         this->mapping[std::stoi(line.substr(0, comma))] = line[comma + 1];
57 |       }
58 |       return true;
59 |     }
60 |     // Interface implemented by concrete recognizers.
61 |     virtual void preprocess_image(cv::Mat& input_image, cv::Mat& output_image) = 0;
62 |     virtual std::vector<cv::Mat> preprocess_images(std::vector<cv::Mat>& input_images) = 0;
63 |     virtual std::string run_graph(const cv::Mat& image) = 0;
64 |     virtual std::vector<std::string> run_graph(const std::vector<cv::Mat> input_images) = 0;
65 |     // Maps each index in `vec` to its dictionary character; operator[] yields '\0' for unknown indices.
66 |     std::string decode_single_text(std::vector<int>& vec){
67 |       std::string res;
68 |       for(size_t i=0; i<vec.size(); i++) res.push_back(this->mapping[vec[i]]);
69 |       return res;
70 |     }
71 |     // Public data members.
72 |     std::unordered_map<int, char> mapping;
73 |     tensorflow::GraphDef graph_def;
74 |     std::unique_ptr<tensorflow::Session> session;
75 |     std::vector<std::string> input_layers;
76 |     std::vector<std::string> output_layers;
77 | };
78 | 
79 | #endif
80 | 


--------------------------------------------------------------------------------
/scene_text_reader.cpp:
--------------------------------------------------------------------------------
 1 | #include "scene_text_reader.h"
 2 | 
 3 | namespace scene_text_reader{
 4 | 
 5 |   // Creates a reader with no models attached; both pointers are nulled rather than left indeterminate.
 6 |   SceneTextReader::SceneTextReader() : detector(nullptr), recognizer(nullptr){
 7 |   }
 8 | 
 9 |   // The declaration puts its defaults on the 4th and 5th parameters and callers pass the dictionary
10 |   // third, so the names here must follow that order (the dictionary used to be bound as detector_model).
11 |   SceneTextReader::SceneTextReader(const std::string& detector_graph_filename, const string& recognizer_graph_filename,
12 |      const std::string& dictionary_filename, const std::string& detector_model, const std::string& recognizer_model)
13 |     : detector(nullptr), recognizer(nullptr)  // stay null when a model kind is not supported
14 |   {
15 |     if(detector_model == "FasterRCNN"){
16 |       detector = new FasterRCNNTextDetector(detector_graph_filename);
17 |     }else{ LOG(ERROR) <<detector_model + " not implemented yet"; }
18 |     if(recognizer_model == "CTC"){
19 |       recognizer = new CTCSceneTextRecognizer(recognizer_graph_filename, dictionary_filename);
20 |     }else{ LOG(ERROR) <<recognizer_model + " not implemented yet"; }
21 |   }
22 |   
23 |   
24 |   // For every box: crop an enlarged neighbourhood, rotate it by the box angle, and append the
25 |   // resulting image to word_regions (one entry per box, in box order).
26 |   void SceneTextReader::extract_word_regions(cv::Mat& image, std::vector<TextBox>& boxes, std::vector<cv::Mat>& word_regions){
27 |     const int box_count = boxes.size();
28 |     for(int idx = 0; idx < box_count; ++idx){
29 |       TextBox& box = boxes[idx];
30 |       const float angle = get_angle(box);
31 |       cv::Mat cropped;
32 |       std::vector<cv::Point> cropped_corners;
33 |       get_cropped_extend_image(image, box, cropped, cropped_corners);
34 | 
35 |       cv::Mat rotated;
36 |       std::vector<cv::Point> rotated_corners;  // filled by the helper, not used afterwards
37 |       rotate_image_and_points(cropped, cropped_corners, angle, rotated, rotated_corners);
38 |       word_regions.push_back(rotated);
39 |     }
40 |   }
41 | 
42 |   // Detects text boxes in `image`, recognises each extracted word region and stores the text in its box.
43 |   void SceneTextReader::read_text(cv::Mat& image, std::vector<TextBox>& res){
44 |     detector->run_graph(image, res);
45 |     std::cout<<"found "<<res.size()<<" number of text"<<std::endl;
46 |     if(res.empty()) return;  // nothing to recognise; element 0 was read unconditionally before
47 |     std::vector<cv::Mat> word_regions;
48 |     extract_word_regions(image, res, word_regions);
49 |     std::vector<cv::Mat> preprocessed_images = recognizer->preprocess_images(word_regions);
50 |     if(!preprocessed_images.empty()) std::cout<<preprocessed_images[0].rows<<" "<<preprocessed_images[0].cols<<std::endl;
51 |     std::vector<string> output_texts = recognizer->run_graph(preprocessed_images);
52 |     const size_t count = res.size() < output_texts.size() ? res.size() : output_texts.size();  // stay in range of both
53 |     for(size_t i=0; i<count; i++) res[i].set_text(output_texts[i]);
54 |   }
55 | }
56 | 


--------------------------------------------------------------------------------
/scene_text_reader.h:
--------------------------------------------------------------------------------
 1 | #ifndef Scene_Text_Reader_H
 2 | #define Scene_Text_Reader_H
 3 | 
 4 | #include <iostream>
 5 | #include <memory>
 6 | #include <string>
 7 | #include <vector>
 8 | #include <unordered_map>
 9 | #include <assert.h>
10 | 
11 | #include "tensorflow/core/framework/graph.pb.h"
12 | #include "tensorflow/core/framework/tensor_shape.pb.h"
13 | #include "tensorflow/core/framework/tensor.h"
14 | #include "tensorflow/core/graph/graph.h"
15 | #include "tensorflow/core/lib/strings/str_util.h"
16 | #include "tensorflow/core/platform/init_main.h"
17 | #include "tensorflow/core/platform/logging.h"
18 | #include "tensorflow/core/platform/platform.h"
19 | #include "tensorflow/core/platform/types.h"
20 | #include "tensorflow/core/public/session.h"
21 | 
22 | //opencv
23 | #include <opencv2/core.hpp>
24 | #include "opencv2/opencv.hpp"
25 | 
26 | #include "utils.h"
27 | //recognizer
28 | #include "ctc_scene_text_recognizer.h"
29 | #include "recognizer.h"
30 | //detector
31 | #include "faster_rcnn_text_detector.h"
32 | #include "detector.h"
33 | 
34 | #include "text_box.h"
35 | 
36 | using namespace tensorflow;
37 | 
38 | namespace scene_text_reader{
39 | 
40 |   // Couples a text Detector with a text Recognizer to read the text found in an image.
41 |   class SceneTextReader{
42 |     public:
43 |       SceneTextReader();
44 |       // Callers pass: detector graph file, recognizer graph file, dictionary file, then the model kinds.
45 |       SceneTextReader(const std::string& detector_graph_filename, const std::string& recognizer_graph_filename,
46 |                       const std::string& dictionary_filename,
47 |                       const std::string& detector_model=std::string("FasterRCNN"),
48 |                       const std::string& recognizer_model=std::string("CTC"));
49 |       // Fills `res` with the detected boxes and sets the recognised text on them.
50 |       void read_text(cv::Mat& image, std::vector<TextBox>& res);
51 |       // Appends one rotated crop per box to `word_regions`.
52 |       void extract_word_regions(cv::Mat& image, std::vector<TextBox>& boxes, std::vector<cv::Mat>& word_regions);
53 |     private:
54 |       Detector *detector = nullptr;      // null until a constructor assigns a model; no destructor frees it
55 |       Recognizer *recognizer = nullptr;  // same ownership caveat as `detector`
56 |   };
57 | 
58 | }
59 | #endif 
60 | 


--------------------------------------------------------------------------------
/test_images/img_108.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/test_images/img_108.jpg


--------------------------------------------------------------------------------
/test_images/word_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dafanghe/DeepSceneTextReader/ade507dd988a18ce3fbb12f2f518bac6b6b683fa/test_images/word_2.png


--------------------------------------------------------------------------------
/text_box.cpp:
--------------------------------------------------------------------------------
 1 | #include "text_box.h"
 2 | 
 3 | // Copies the points and text into the box; the score starts at 0 instead of being left unset.
 4 | TextBox::TextBox(std::vector<cv::Point>& points, std::string text)
 5 |   : points(points), text(text), score(0.f) {
 6 | }
 7 | 
 8 | // Axis-aligned bounds of the stored points: p1 receives the minimum x/y, p2 the maximum x/y.
 9 | // Minima are seeded with 100000 and maxima with 0, so those values come back for a box without points.
10 | void TextBox::get_rectangle_box(cv::Point& p1, cv::Point& p2){
11 |   cv::Point lowest(100000, 100000);
12 |   cv::Point highest(0, 0);
13 |   for(size_t k = 0; k < this->points.size(); ++k){
14 |     const cv::Point& corner = this->points[k];
15 |     lowest.x = std::min(lowest.x, corner.x);   lowest.y = std::min(lowest.y, corner.y);
16 |     highest.x = std::max(highest.x, corner.x); highest.y = std::max(highest.y, corner.y);
17 |   }
18 |   p1 = lowest;
19 |   p2 = highest;
20 | }
21 | 
22 | // Streams "oriented box: " followed by every point, then " text: " and the box text, ending the line.
23 | std::ostream &operator<<(std::ostream &os, TextBox &m) {
24 |   os<<"oriented box: ";
25 |   for(const cv::Point& corner : m.get_points()){
26 |     os<<corner<<" ";
27 |   }
28 |   os<<" text: "<<m.get_text()<<std::endl;
29 |   return os;
30 | }
31 | 
32 | 


--------------------------------------------------------------------------------
/text_box.h:
--------------------------------------------------------------------------------
 1 | #ifndef Text_Box_H
 2 | #define Text_Box_H
 3 | 
 4 | #include <iostream>
 5 | #include <memory>
 6 | #include <string>
 7 | #include <vector>
 8 | 
 9 | //opencv
10 | #include <opencv2/core.hpp>
11 | #include "opencv2/opencv.hpp"
12 | 
13 | class TextBox{  // a text region: its points, its text and a score (presumably a confidence - confirm)
14 |   public:
15 |     TextBox(std::vector<cv::Point>& points, std::string text);
16 |     void get_rectangle_box(cv::Point& p1, cv::Point& p2);  // axis-aligned bounds: p1 = min, p2 = max
17 |     std::vector<cv::Point>& get_points() {return points;}
18 |     std::string& get_text() {return text;}
19 |     float& get_score() {return score;}  // mutable reference; the class has no other setter for it
20 |     void set_text(std::string s) {text=s;}
21 |   private:
22 |     std::vector<cv::Point> points;
23 |     std::string text;
24 |     float score = 0.f;  // the constructor takes no score, so give get_score() a defined value
25 | };
26 | 
27 | std::ostream &operator<<(std::ostream &os, TextBox &m);  // writes the box points and its text to `os`
28 | 
29 | #endif 
30 | 


--------------------------------------------------------------------------------
/utils.cpp:
--------------------------------------------------------------------------------
  1 | #include "utils.h"
  2 | 
  3 | #define PI 3.14159265358979323846
  4 | 
  5 | // Shrinks the image so its longer side fits max_side_len, snaps both sides to a multiple of 32, and reports the per-axis scales.
  6 | void resize_image_max_len(const cv::Mat& image, cv::Mat& resized_image, float& ratio_h, float& ratio_w, int max_side_len){
  7 |   int height = image.rows;
  8 |   int width = image.cols;
  9 |   float ratio = 1;
 10 |   if(std::max(height, width) > max_side_len)
 11 |     ratio = height > width ? float(max_side_len)/height: float(max_side_len)/width;
 12 |   int resize_h = int(height * ratio);
 13 |   int resize_w = int(width * ratio);
 14 |   // Clamp to 32: for sides below 64 (other than 32) the snap yields 0 or a negative size, which cv::resize rejects.
 15 |   resize_h = std::max(32, resize_h%32 == 0? resize_h : (resize_h/32 - 1) * 32);
 16 |   resize_w = std::max(32, resize_w%32 == 0? resize_w : (resize_w/32 - 1) * 32);
 17 |   cv::resize(image, resized_image, cv::Size(resize_w, resize_h));
 18 |   ratio_h = float(resize_h)/height;  ratio_w = float(resize_w)/width;  // actual scale applied on each axis
 19 | }
 20 | 
 21 | // Resizes the image to exactly fixed_height rows, scaling the width by the same factor (written to `ratio`).
 22 | void resize_image_fix_height(const cv::Mat& image, cv::Mat& resized_image, float& ratio, int fixed_height){
 23 |   const int source_height = image.rows;
 24 |   ratio = float(fixed_height)/source_height;
 25 |   const int scaled_width = int(image.cols * ratio);
 26 |   const cv::Size target(scaled_width, fixed_height);
 27 |   cv::resize(image, resized_image, target);
 28 | }
 29 | 
 30 | // Brings the image to target_width columns: wider images are resized horizontally, narrower
 31 | // ones are padded on the right with black, and exact matches are cloned.
 32 | void pad_image_width(const cv::Mat& image, cv::Mat& padded_image, int target_width){
 33 |   const int missing_cols = target_width - image.cols;
 34 |   if(missing_cols == 0){
 35 |     padded_image = image.clone();
 36 |   }else if(missing_cols < 0){
 37 |     cv::resize(image, padded_image, cv::Size(target_width, image.rows));
 38 |   }else{
 39 |     copyMakeBorder(image, padded_image, 0, 0, 0, missing_cols, cv::BORDER_CONSTANT, cv::Scalar(0,0,0));
 40 |   }
 41 | }
 42 | 
 43 | // Copies an 8-bit image into a 1 x H x W x 3 uint8 tensor, reversing the first three channels
 44 | // of every pixel (assumes BGR input, as the original b/g/r naming suggests - TODO confirm).
 45 | tensorflow::Tensor cv_mat_to_tensor(const cv::Mat& image){
 46 |   const int rows = image.rows;
 47 |   const int cols = image.cols;
 48 |   tensorflow::Tensor res_tensor(tensorflow::DT_UINT8, tensorflow::TensorShape({1, rows, cols, 3}));
 49 | 
 50 |   // The pixel buffer is read as raw bytes, i.e. the image is assumed to hold unsigned chars.
 51 |   const unsigned char *pixels = (unsigned char*)(image.data);
 52 |   auto out = res_tensor.tensor<unsigned char, 4>();
 53 |   for (int y = 0; y < rows; ++y) {
 54 |     for (int x = 0; x < cols; ++x) {
 55 |       // image.step is the byte stride of one row; channels() the byte stride of one pixel
 56 |       const unsigned char *px = pixels + (image.step * y + x * image.channels());
 57 |       const unsigned char first = px[0], second = px[1], third = px[2];
 58 |       out(0, y, x, 0) = third;
 59 |       out(0, y, x, 1) = second;
 60 |       out(0, y, x, 2) = first;
 61 |     }
 62 |   }
 63 |   return res_tensor;
 64 | }
 65 | 
 66 | // Builds a single-channel float image from the first height*width floats of the tensor, each
 67 | // multiplied by 255, and prints the dimensions and the smallest/largest value seen.
 68 | cv::Mat tensor_to_cv_mat(const tensorflow::Tensor tensor){
 69 |   auto values = tensor.flat<float>();
 70 |   // Dimensions 1 and 2 are taken as height and width, i.e. the tensor is assumed to be 4-D.
 71 |   const int height = tensor.shape().dim_size(1);
 72 |   const int width = tensor.shape().dim_size(2);
 73 |   std::cout<<" height "<<height << " width "<< width<<std::endl;
 74 | 
 75 |   cv::Mat res_mat(height, width, CV_32FC1, cv::Scalar(0));
 76 |   float *pixels = (float*)(res_mat.data);
 77 |   float lowest = 100000, highest = 0;  // running extremes, seeded exactly as before
 78 |   const int pixel_count = height * width;
 79 |   for (int idx = 0; idx < pixel_count; ++idx) {  // row-major walk, same order as a y/x double loop
 80 |     const float value = values(idx);
 81 |     pixels[idx] = value * 255;
 82 |     lowest = std::min(lowest, value);
 83 |     highest = std::max(highest, value);
 84 |   }
 85 |   std::cout<<"min max tensor value: "<<lowest<<" "<<highest<<std::endl;
 86 |   return res_mat;
 87 | }
 88 | 
 89 | // atan2 angle (radians) of the edge that runs from point 0 to point 1 of the box.
 90 | float get_angle(TextBox& text_box){
 91 |   const std::vector<cv::Point>& corners = text_box.get_points();
 92 |   const float rise = corners[1].y - corners[0].y;
 93 |   const float run = corners[1].x - corners[0].x;
 94 |   return atan2(rise, run); }
 95 | 
 96 | // Crops the box's bounding rectangle, enlarged on every side by max(box height, box width) and
 97 | // clipped to the image, and returns the box points translated into the crop's coordinates.
 98 | void get_cropped_extend_image(cv::Mat& image, TextBox& box, cv::Mat& cropped, std::vector<cv::Point>& new_points){
 99 |   cv::Point p1, p2;
100 |   box.get_rectangle_box(p1, p2);
101 |   int height = p2.y - p1.y;
102 |   int width = p2.x - p1.x;  // was p1.y - p1.x, which mixed the two axes
103 | 
104 |   int extend_len = std::max(height, width);
105 |   int minx = std::max(0, p1.x - extend_len);
106 |   int miny = std::max(0, p1.y - extend_len);
107 |   int maxx = std::min(image.cols, p2.x + extend_len);
108 |   int maxy = std::min(image.rows, p2.y + extend_len);
109 | 
110 |   std::vector<cv::Point> points = box.get_points();
111 |   new_points.resize(points.size());
112 |   for(size_t i=0; i<points.size(); i++){
113 |     new_points[i].x = points[i].x - minx;
114 |     new_points[i].y = points[i].y - miny;
115 |   }
116 |   cropped = image(cv::Rect(minx, miny, maxx - minx, maxy - miny));  // region of interest taken from `image`
117 | }
118 | 
119 | cv::Point rotate_point(cv::Point& point, float angle, cv::Point& center){  // rotates `point` about `center` by `angle` radians
120 |   float new_x = (point.x - center.x) * cos(angle) - (point.y - center.y) * sin(angle) + center.x;  // x' = dx*cos - dy*sin + cx
121 |   float new_y = (point.x - center.x) * sin(angle) + (point.y - center.y) * cos(angle) + center.y;  // y' = dx*sin + dy*cos + cy
122 |   return cv::Point(new_x, new_y);  // the float coordinates are converted to the integer point here
123 | }
124 | 
125 | void rotate_image_and_points(cv::Mat& cropped, std::vector<cv::Point>& points,
126 |       float angle, cv::Mat& rotated_image, std::vector<cv::Point>& rotated_points){  // rotate crop + points, then crop around the points
127 |   int height = cropped.rows, width = cropped.cols;
128 |   cv::Point center(width/2, height/2);
129 |   int min_side = std::min(height, width);
130 |   auto M = cv::getRotationMatrix2D(center, angle * 180./PI, 1.0);  // `angle` is in radians; converted to degrees for this call
131 |   cv::warpAffine(cropped, rotated_image, M, cv::Size(cropped.cols*2, cropped.rows*2));  // output canvas is twice the crop size
132 |   // Rotate the points about the same centre, with the angle negated.
133 |   // NOTE(review): presumably the negation makes the points follow the warp above - confirm.
134 |   rotated_points.resize(points.size()); 
135 |   for(int i=0; i<rotated_points.size(); i++){
136 |     rotated_points[i] = rotate_point(points[i], -angle, center);
137 |   }
138 |   // Second stage: cut the rotated canvas down to the bounding rectangle of the rotated points,
139 |   // enlarged by a margin so that some background is kept around them.
140 |   
141 |   // Margins are fractions of the shorter side of the incoming crop: 5% horizontally, 10% vertically.
142 |   float extend_ratio_x = 0.05;
143 |   float extend_ratio_y = 0.1;
144 |   int minx = 10000, miny = 10000, maxx = 0, maxy = 0;  // sentinels for the min/max scan below
145 | 
146 |   for(auto &point: rotated_points){
147 |     minx = std::min(minx, point.x); 
148 |     miny = std::min(miny, point.y); 
149 |     maxx = std::max(maxx, point.x);
150 |     maxy = std::max(maxy, point.y);
151 |   }
152 |   // Grow the bounds by the margins and clamp them to the rotated canvas.
153 |   minx = std::max(minx - int(extend_ratio_x * min_side), 0);
154 |   miny = std::max(miny - int(extend_ratio_y * min_side), 0);
155 |   maxx = std::min(maxx + int(extend_ratio_x * min_side), rotated_image.cols);
156 |   maxy = std::min(maxy + int(extend_ratio_y * min_side), rotated_image.rows);
157 | 
158 |   // Replace the canvas by the selected rectangle.
159 |   rotated_image = rotated_image(cv::Rect(minx, miny, maxx-minx, maxy-miny));
160 |   for(auto & ele: rotated_points){  // express the points relative to the new top-left corner
161 |     ele.x -= minx;
162 |     ele.y -= miny;
163 |   }
164 |   
165 |   // On return rotated_points are in the coordinates of the final rotated_image.
166 | }
167 | 
168 | // Draws the four edges joining points[0..3] on `image`, each edge in its own colour, 3 px thick.
169 | void draw_polygon(cv::Mat& image, std::vector<cv::Point>& points){
170 |   const cv::Scalar edge_colors[4] = {cv::Scalar(0,0,255), cv::Scalar(0,255,0), cv::Scalar(255,0,0), cv::Scalar(255,255,0)};
171 |   for(int edge = 0; edge < 4; ++edge){
172 |     cv::line(image, points[edge], points[(edge + 1) % 4], edge_colors[edge], 3);
173 |   }
174 | }
175 | 
176 | // Draws the box outline and writes the box text next to the top-left corner of its bounding
177 | // rectangle. (An unused local colour table was dropped; draw_polygon has its own.)
178 | void draw_text_box(cv::Mat& image, TextBox& text_box){
179 |   draw_polygon(image, text_box.get_points());
180 | 
181 |   // Anchor the label 10 px up and left of the bounding rectangle's top-left corner, clamped at 0.
182 |   cv::Point p1, p2;
183 |   text_box.get_rectangle_box(p1, p2);
184 |   const cv::Point draw_loc(std::max(0, p1.x - 10), std::max(0, p1.y - 10));
185 |   cv::putText(image, text_box.get_text(), draw_loc, cv::FONT_HERSHEY_PLAIN, 1.3,  cv::Scalar(0,255,255));
186 | }
187 | 


--------------------------------------------------------------------------------
/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef Scene_Text_Utils_H
 2 | #define Scene_Text_Utils_H
 3 | 
 4 | #include "tensorflow/core/framework/tensor_shape.pb.h"
 5 | #include "tensorflow/core/framework/tensor.h"
 6 | #include <math.h>
 7 | 
 8 | //opencv
 9 | #include <opencv2/core.hpp>
10 | #include "opencv2/opencv.hpp"
11 | #include "text_box.h"
12 | #include <iostream>
13 | // Shrinks so the longer side fits max_side_len, snaps both sides to a multiple of 32, writes the per-axis scale factors.
14 | void resize_image_max_len(const cv::Mat& image, cv::Mat& resized_image, float& ratio_h, float& ratio_w, int max_side_len=800);
15 | // Resizes to fixed_height rows, scaling the width by the same factor, which is written to `ratio`.
16 | void resize_image_fix_height(const cv::Mat& image, cv::Mat& resized_image, float& ratio, int fixed_height=32);
17 | // Brings the image to target_width columns by resizing (if wider) or right-padding with black (if narrower).
18 | void pad_image_width(const cv::Mat& image, cv::Mat& padded_image, int target_width=128);
19 | // Copies an 8-bit image into a 1xHxWx3 uint8 tensor, reversing the first three channels of each pixel.
20 | tensorflow::Tensor cv_mat_to_tensor(const cv::Mat& image);
21 | // Builds a CV_32FC1 image from a float tensor (dims 1 and 2 taken as height/width), values multiplied by 255.
22 | cv::Mat tensor_to_cv_mat(const tensorflow::Tensor tensor);
23 | // atan2 angle (radians) of the edge from point 0 to point 1 of the box.
24 | float get_angle(TextBox& text_box);
25 | // Crops an enlarged, image-clipped region around the box and reports the box points relative to that crop.
26 | void get_cropped_extend_image(cv::Mat& image, TextBox& box, cv::Mat& cropped, std::vector<cv::Point>& new_points);
27 | // Rotates the crop and its points by `angle` (radians), then crops around the rotated points with a margin.
28 | void rotate_image_and_points(cv::Mat& cropped, std::vector<cv::Point>& points, float angle, cv::Mat& rotated_image, std::vector<cv::Point>& rotated_points);
29 | // Rotates `point` about `center` by `angle` radians.
30 | cv::Point rotate_point(cv::Point& point, float angle, cv::Point& center);
31 | // Draws the four edges joining points[0..3].
32 | void draw_polygon(cv::Mat& image, std::vector<cv::Point>& points);
33 | // Draws the box polygon plus its text near the top-left corner of its bounding rectangle.
34 | void draw_text_box(cv::Mat& image, TextBox& text_box);
35 | #endif 
36 | 


--------------------------------------------------------------------------------