├── .gitignore ├── .gitmodules ├── README.md ├── datasets ├── coco-text │ └── .gitignore ├── hust-tr400 │ └── .gitignore ├── icdar-2013 │ └── .gitignore ├── icdar-2015 │ └── .gitignore ├── msra-td500 │ └── .gitignore ├── script │ ├── ann2voc2007.m │ ├── ann2voc2007.sh │ ├── fetch_dataset.sh │ ├── format_annotation.py │ └── rm_headline.sh ├── svt │ └── .gitignore └── test │ ├── img_1.jpg │ ├── img_5.jpg │ └── img_6.jpg ├── models └── deploy.prototxt └── script └── text_detect_demo.sh /.gitignore: -------------------------------------------------------------------------------- 1 | **.o 2 | **.tar 3 | **.tar.gz 4 | **.zip 5 | **.jpg 6 | **.json 7 | *.pyc 8 | **.xml 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "py-faster-rcnn"] 2 | path = py-faster-rcnn 3 | url = https://github.com/jugg1024/py-faster-rcnn.git 4 | [submodule "datasets/coco-text/coco-text"] 5 | path = datasets/coco-text/coco-text 6 | url = https://github.com/jugg1024/coco-text.git 7 | [submodule "datasets/coco-text/coco-text-tool"] 8 | path = datasets/coco-text/coco-text-tool 9 | url = https://github.com/jugg1024/coco-text.git 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Text Detection Using [py-faster-rcnn](https://github.com/rbgirshick/py-faster-rcnn/blob/master/README.md). 2 | 3 | # image # 4 | 5 | ### Introduction 6 | 7 | This repository is aimed at provide an example of training text-detection models using *faster-rcnn* 8 | 9 | ### Download repo 10 | 11 | + Clone the repository 12 | 13 | ```Shell 14 | # Make sure to clone with --recursive 15 | git clone --recursive https://github.com/jugg1024/Text-Detection-with-FRCN.git 16 | ``` 17 | 18 | ### Compile 19 | 20 | + Compile py-faster-rcnn 21 | 22 | 2.1 change the branch of py-faster-rcnn to text-detection-demo 23 | ```Shell 24 | cd $Text-Detection-with-FRCN/py-faster-rcnn 25 | git checkout text-detection 26 | ``` 27 | 28 | 2.2 Build Caffe and pycaffe. 29 | 30 | ```Shell 31 | # ensure your enviroment support the training of caffeensure your enviroment support the training of caffe 32 | cd $Text-Detection-with-FRCN/py-faster-rcnn/caffe-fast-rcnn 33 | cp Makefile.config.example Makefile.config 34 | # adjust the Makefile.config 35 | make -j16 && make pycaffe # here only python api is used. 36 | # test if caffe python api is ok. 37 | cd python 38 | python 39 | >>> import caffe 40 | >>> caffe.__version__ 41 | '1.0.0-rc3' 42 | ``` 43 | 44 | 2.3 Build the Cython modules. 45 | 46 | ```Shell 47 | cd $Text-Detection-with-FRCN/py-faster-rcnn/lib 48 | make 49 | ``` 50 | 51 | ### Run demo 52 | 53 | + Run text detection demo 54 | 55 | 3.1 download pre-trained model 56 | 57 | URL: http://pan.baidu.com/s/1dE2Ori5 Extract Code: phxk 58 | 59 | LINK FROM HUBIC: https://ovh.to/SivaG2 60 | ```Shell 61 | ln -s $DOWNLOAD_MODEL_PATH $Text-Detection-with-FRCN/model/vgg16_faster_rcnn_fine_tune_on_coco.caffemodel 62 | ``` 63 | 3.2 run demo 64 | 65 | ```Shell 66 | cd $Text-Detection-with-FRCN/ 67 | ./script/text_detect_demo.sh 68 | ``` 69 | Results are on output_img 70 | 71 | 72 | ### Further 73 | 74 | if you think the model is not ok, then you can trainning with your own dataset, take coco-text for example. 75 | 76 | + training 77 | 78 | 4.1 download coco-text dataset 79 | 80 | ```Shell 81 | cd $Text-Detection-with-FRCN/datasets/script 82 | ./fetch_dataset.sh coco-text 83 | # download it takes long! 84 | # ensure you have both data and label 85 | # for coco-text label is in COCO-text.json, and data is in train2014.zip 86 | ``` 87 | 88 | 4.2 download pre-train model 89 | 90 | ```Shell 91 | # finetune on this model, you can also use one model you train before 92 | cd $Text-Detection-with-FRCN/py-faster-rcnn 93 | ./data/scripts/fetch_imagenet_models.sh 94 | # download it takes long! 95 | ``` 96 | 97 | 4.3 format the data(you should write your code here) 98 | 99 | ```Shell 100 | # format the raw image and label into the type of pascal_voc 101 | # follow the code in $Text-Detection-with-FRCN/datasets/script/format_annotation.py 102 | cd $Text-Detection-with-FRCN/datasets/script 103 | ./format_annotation.py --dataset coco-text 104 | ``` 105 | 106 | 4.4 create a softlink the formatted data to working directorry 107 | 108 | ```Shell 109 | # link your data folder to train_data 110 | cd $Text-Detection-with-FRCN/datasets/ 111 | ln -s train_data coco-text # $YOUR_DATA 112 | ``` 113 | 114 | 4.5 training 115 | 116 | ```Shell 117 | cd $Text-Detection-with-FRCN/py-faster-rcnn/ 118 | ./experiments/scripts/faster_rcnn_end2end.sh 0 VGG16 pascal_voc 119 | ``` 120 | -------------------------------------------------------------------------------- /datasets/coco-text/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | !coco-text-tool 6 | -------------------------------------------------------------------------------- /datasets/hust-tr400/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /datasets/icdar-2013/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /datasets/icdar-2015/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /datasets/msra-td500/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /datasets/script/ann2voc2007.m: -------------------------------------------------------------------------------- 1 | function ann2voc2007(input_dir) 2 | curpath = mfilename('fullpath'); 3 | [pathstr,~,~] = fileparts(curpath) 4 | if input_dir(end) == '/' 5 | input_dir = input_dir(1:end-1); 6 | end 7 | [~,input_dir,~] = fileparts(input_dir); 8 | input_dir = [pathstr '/../' input_dir '/formatted_dataset'] 9 | imgpath = [input_dir '/JPEGImages/'] 10 | txtpath = [input_dir '/images.annotations'] 11 | xmlpath_new = [input_dir '/Annotations/']; 12 | foldername = 'VOC2007'; 13 | coco = containers.Map(); 14 | fidin = fopen(txtpath, 'r'); 15 | cnt = 0; 16 | while ~feof(fidin) 17 | tline = fgetl(fidin); 18 | str = regexp(tline, ' ', 'split'); 19 | xmlname = strrep(str{1},'.jpg','.xml'); 20 | info = imfinfo([imgpath '/' str{1}]); 21 | str{3} = max(str2double(str{3}), 1); 22 | str{4} = max(str2double(str{4}), 1); 23 | str{5} = min(str2double(str{5}), info.Width); 24 | str{6} = min(str2double(str{6}), info.Height); 25 | if str{3} >= str{5} || str{4} >= str{6} || str{3} <= 0 || str{4} <= 0 || str{5} > info.Width... 26 | str{6} > info.Height 27 | continue; 28 | end 29 | cnt = cnt + 1 30 | if exist([imgpath '/' str{1}]) 31 | if isKey(coco,xmlname) 32 | Createnode = coco(xmlname); 33 | object_node = Createnode.createElement('object'); 34 | Root = Createnode.getDocumentElement; 35 | Root.appendChild(object_node); 36 | node=Createnode.createElement('name'); 37 | node.appendChild(Createnode.createTextNode(str{2})); 38 | object_node.appendChild(node); 39 | node=Createnode.createElement('pose'); 40 | node.appendChild(Createnode.createTextNode('Unspecified')); 41 | object_node.appendChild(node); 42 | node=Createnode.createElement('truncated'); 43 | node.appendChild(Createnode.createTextNode('0')); 44 | object_node.appendChild(node); 45 | node=Createnode.createElement('difficult'); 46 | node.appendChild(Createnode.createTextNode('0')); 47 | object_node.appendChild(node); 48 | bndbox_node=Createnode.createElement('bndbox'); 49 | object_node.appendChild(bndbox_node); 50 | node=Createnode.createElement('xmin'); 51 | node.appendChild(Createnode.createTextNode(num2str(str{3}))); 52 | bndbox_node.appendChild(node); 53 | node=Createnode.createElement('ymin'); 54 | node.appendChild(Createnode.createTextNode(num2str(str{4}))); 55 | bndbox_node.appendChild(node); 56 | node=Createnode.createElement('xmax'); 57 | node.appendChild(Createnode.createTextNode(num2str(str{5}))); 58 | bndbox_node.appendChild(node); 59 | node=Createnode.createElement('ymax'); 60 | node.appendChild(Createnode.createTextNode(num2str(str{6}))); 61 | bndbox_node.appendChild(node); 62 | else 63 | Createnode = com.mathworks.xml.XMLUtils.createDocument('annotation'); 64 | Root = Createnode.getDocumentElement; 65 | node = Createnode.createElement('folder'); 66 | node.appendChild(Createnode.createTextNode(foldername)); 67 | Root.appendChild(node); 68 | node = Createnode.createElement('filename'); 69 | node.appendChild(Createnode.createTextNode(str{1})); 70 | Root.appendChild(node); 71 | source_node = Createnode.createElement('source'); 72 | Root.appendChild(source_node); 73 | node = Createnode.createElement('database'); 74 | node.appendChild(Createnode.createTextNode('MS COCO-Text')); 75 | source_node.appendChild(node); 76 | node = Createnode.createElement('annotation'); 77 | node.appendChild(Createnode.createTextNode('MS COCO-Text 2014')); 78 | source_node.appendChild(node); 79 | node=Createnode.createElement('image'); 80 | node.appendChild(Createnode.createTextNode('NULL')); 81 | source_node.appendChild(node); 82 | node=Createnode.createElement('flickrid'); 83 | node.appendChild(Createnode.createTextNode('NULL')); 84 | source_node.appendChild(node); 85 | owner_node=Createnode.createElement('owner'); 86 | Root.appendChild(owner_node); 87 | node=Createnode.createElement('flickrid'); 88 | node.appendChild(Createnode.createTextNode('NULL')); 89 | owner_node.appendChild(node); 90 | node=Createnode.createElement('name'); 91 | node.appendChild(Createnode.createTextNode('ligen')); 92 | owner_node.appendChild(node); 93 | size_node=Createnode.createElement('size'); 94 | Root.appendChild(size_node); 95 | node=Createnode.createElement('width'); 96 | node.appendChild(Createnode.createTextNode(num2str(info.Width))); 97 | size_node.appendChild(node); 98 | node=Createnode.createElement('height'); 99 | node.appendChild(Createnode.createTextNode(num2str(info.Height))); 100 | size_node.appendChild(node); 101 | node=Createnode.createElement('depth'); 102 | node.appendChild(Createnode.createTextNode(num2str(info.BitDepth / 8))); 103 | size_node.appendChild(node); 104 | node=Createnode.createElement('segmented'); 105 | node.appendChild(Createnode.createTextNode('0')); 106 | Root.appendChild(node); 107 | object_node=Createnode.createElement('object'); 108 | Root.appendChild(object_node); 109 | node=Createnode.createElement('name'); 110 | node.appendChild(Createnode.createTextNode(str{2})); 111 | object_node.appendChild(node); 112 | node=Createnode.createElement('pose'); 113 | node.appendChild(Createnode.createTextNode('Unspecified')); 114 | object_node.appendChild(node); 115 | node=Createnode.createElement('truncated'); 116 | node.appendChild(Createnode.createTextNode('0')); 117 | object_node.appendChild(node); 118 | node=Createnode.createElement('difficult'); 119 | node.appendChild(Createnode.createTextNode('0')); 120 | object_node.appendChild(node); 121 | bndbox_node=Createnode.createElement('bndbox'); 122 | object_node.appendChild(bndbox_node); 123 | node=Createnode.createElement('xmin'); 124 | node.appendChild(Createnode.createTextNode(num2str(str{3}))); 125 | bndbox_node.appendChild(node); 126 | node=Createnode.createElement('ymin'); 127 | node.appendChild(Createnode.createTextNode(num2str(str{4}))); 128 | bndbox_node.appendChild(node); 129 | node=Createnode.createElement('xmax'); 130 | node.appendChild(Createnode.createTextNode(num2str(str{5}))); 131 | bndbox_node.appendChild(node); 132 | node=Createnode.createElement('ymax'); 133 | node.appendChild(Createnode.createTextNode(num2str(str{6}))); 134 | bndbox_node.appendChild(node); 135 | coco(xmlname) = Createnode; 136 | end 137 | end 138 | end 139 | fclose(fidin); 140 | keyss = keys(coco); 141 | for i = 1:length(keyss) 142 | xmlwrite([xmlpath_new '/' keyss{i}], coco(keyss{i})); 143 | end 144 | end 145 | -------------------------------------------------------------------------------- /datasets/script/ann2voc2007.sh: -------------------------------------------------------------------------------- 1 | matlab -nodisplay -nodesktop -r "ann2voc2007('$1'); quit" 2 | -------------------------------------------------------------------------------- /datasets/script/fetch_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # example Usage: ./fetch_dataset.sh coco-text 4 | 5 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )" 6 | cd $DIR 7 | 8 | #checking md5sum 9 | checkmd5sum() { 10 | os=`uname -s` 11 | if [ "$os" = "Linux" ]; then 12 | checksum=`md5sum $DDIR/$FILE | awk '{ print $1 }'` 13 | elif [ "$os" = "Darwin" ]; then 14 | checksum=`cat $DDIR/$FILE | md5` 15 | fi 16 | if [ "$checksum" = "$CHECKSUM" ]; then 17 | echo "Checksum is correct. No need to download." 18 | DOWNLOAD="no" 19 | else 20 | echo "Checksum is incorrect. Need to download again." 21 | DOWNLOAD="yes" 22 | fi 23 | } 24 | 25 | #checking file exist 26 | checkfile() { 27 | if [ -f $DDIR/$FILE ]; then 28 | echo "File already exists." 29 | if [ "$CHECKSUM" = "nocheck" ]; then 30 | echo "File is too large. No check is applyed." 31 | DOWNLOAD="no" 32 | else 33 | echo "Checking md5..." 34 | checkmd5sum 35 | fi 36 | fi 37 | } 38 | 39 | #download and unzip file 40 | download_file() { 41 | DDIR=$1 42 | FILE=$2 43 | URL=$3 44 | CHECKSUM=$4 45 | TYPE=$5 46 | DOWNLOAD="yes" 47 | checkfile 48 | if [ "$DOWNLOAD" = "yes" ]; then 49 | echo "Downloading $FILE..." 50 | wget $URL -O $DDIR/$FILE 51 | echo "Unzipping..." 52 | cd $DDIR 53 | if [ "$TYPE" = "zip" ]; then 54 | unzip $FILE 55 | elif [ "$TYPE" = "tar" ]; then 56 | tar zxvf $FILE 57 | fi 58 | cd .. 59 | echo "Done. Please run this command again to verify that checksum = $CHECKSUM." 60 | fi 61 | } 62 | 63 | if [ "$1" = "coco-text" ]; then 64 | download_file $1 COCO_Text.zip https://s3.amazonaws.com/cocotext/COCO_Text.zip 5cecfc1081b2ae7fdea75e6c9a9dec3b zip 65 | download_file $1 train2014.zip http://msvocds.blob.core.windows.net/coco2014/train2014.zip nocheck zip 66 | elif [ "$1" = "hust-tr400" ]; then 67 | download_file $1 HUST-TR400.zip http://mc.eistar.net/UpLoadFiles/dataset/HUST-TR400.zip f11d974da7f39c7d09addb750baa4e1a zip 68 | fi -------------------------------------------------------------------------------- /datasets/script/format_annotation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | import pprint 5 | import sys 6 | import time 7 | from os.path import isfile, join 8 | from os import listdir 9 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../coco-text/coco-text-tool')) 10 | import coco_text 11 | 12 | def generate_xml_from_annotation(): 13 | # to be implement 14 | print 'to be implement' 15 | 16 | def format_coco_text(): 17 | print 'format coco_text dataset: 80percent training, 10percent valing, 10percent testing' 18 | # read annotations 19 | # in : annotate_id imagename bbox(xmin,ymin,w,h); 20 | # out: imgprefix label(text) bbox1(xmin,ymin,xmax,ymax) 21 | # imgprefix label(text) bbox2 22 | # import the annotations of coco-text 23 | if not os.path.exists('train2014'): 24 | print 'train2014/ not found, please unzipping' 25 | return -1; 26 | if not os.path.exists('COCO_Text.json'): 27 | print 'COCO_Text.json not found, please unzipping' 28 | return -1; 29 | 30 | train_file = open('formatted_dataset/ImageSets/Main/train.txt','w') 31 | trainval_file = open('formatted_dataset/ImageSets/Main/trainval.txt','w') 32 | test_file = open('formatted_dataset/ImageSets/Main/test.txt','w') 33 | val_file = open('formatted_dataset/ImageSets/Main/val.txt','w') 34 | 35 | annotation_in = coco_text.COCO_Text('COCO_Text.json') 36 | annotation_out = open('formatted_dataset/images.annotations', 'w') 37 | 38 | # select training image 39 | ann_ids = annotation_in.getAnnIds(imgIds=annotation_in.train, 40 | catIds=[('legibility','legible'),('class','machine printed')]) 41 | print 'train annotations:' + str(len(ann_ids)) 42 | anns = annotation_in.loadAnns(ann_ids) 43 | imgid_set = set() 44 | for ann in anns: 45 | im_id_str = str(ann['image_id']) 46 | imgprefix = im_id_str 47 | for i in xrange(0, 12 - len(im_id_str)): 48 | imgprefix = '0' + imgprefix 49 | imgprefix = 'COCO_train2014_' + imgprefix 50 | img_name = imgprefix + '.jpg' 51 | # images.annotations 52 | bbox = ann['bbox'] 53 | xmin = int(round(bbox[0])) 54 | ymin = int(round(bbox[1])) 55 | xmax = int(round(bbox[0] + bbox[2])) 56 | ymax = int(round(bbox[1] + bbox[3])) 57 | annotation_out.write(img_name + ' text ' + str(xmin) + ' ' + str(ymin) + ' ' + str(xmax) + ' ' + str(ymax) + '\n') 58 | if not ann['image_id'] in imgid_set: 59 | # ImageSets train 60 | train_file.write(imgprefix + '\n') 61 | trainval_file.write(imgprefix + '\n') 62 | # JPEGImages train 63 | if not os.path.isfile('formatted_dataset/JPEGImages/' + img_name): 64 | os.system('mv train2014/' + img_name + ' formatted_dataset/JPEGImages') 65 | imgid_set.add(ann['image_id']) 66 | 67 | # select valing and testing image 68 | ann_ids = annotation_in.getAnnIds(imgIds=annotation_in.val, 69 | catIds=[('legibility','legible'),('class','machine printed')]) 70 | print 'val annotations:' + str(len(ann_ids)) 71 | anns = annotation_in.loadAnns(ann_ids) 72 | imgid_set = set() 73 | cnt = 0 74 | for ann in anns: 75 | cnt += 1 76 | im_id_str = str(ann['image_id']) 77 | imgprefix = im_id_str 78 | for i in xrange(0, 12 - len(im_id_str)): 79 | imgprefix = '0' + imgprefix 80 | imgprefix = 'COCO_train2014_' + imgprefix 81 | img_name = imgprefix + '.jpg' 82 | # images.annotations 83 | bbox = ann['bbox'] 84 | xmin = int(round(bbox[0])) 85 | ymin = int(round(bbox[1])) 86 | xmax = int(round(bbox[0] + bbox[2])) 87 | ymax = int(round(bbox[1] + bbox[3])) 88 | annotation_out.write(img_name + ' text ' + str(xmin) + ' ' + str(ymin) + ' ' + str(xmax) + ' ' + str(ymax) + '\n') 89 | if not ann['image_id'] in imgid_set: 90 | # ImageSets train or test 91 | if cnt % 4 == 1: 92 | test_file.write(imgprefix + '\n') 93 | else: 94 | val_file.write(imgprefix + '\n') 95 | trainval_file.write(imgprefix + '\n') 96 | # JPEGImages val or test 97 | if not os.path.isfile('formatted_dataset/JPEGImages/' + img_name): 98 | os.system('mv train2014/' + img_name + ' formatted_dataset/JPEGImages') 99 | imgid_set.add(ann['image_id']) 100 | 101 | 102 | 103 | 104 | def format_byted_chi(): 105 | print 'format byted_chi dataset: 80percent training, 10percent valing, 10percent testing' 106 | # read annotations 107 | # in : imgpath bbox1(xmin,ymin,xmax,ymax);bbox2;bbox3 108 | # out: imgprefix label(text) bbox1(xmin,ymin,xmax,ymax) 109 | # imgprefix label(text) bbox2 110 | if not os.path.exists('chinese_text_detection'): 111 | print 'chinese_text_detection/ not found, please unzipping' 112 | return -1; 113 | 114 | annotation_in = open('chinese_text_detection/image_to_rois.txt', 'r') 115 | annotation_out = open('formatted_dataset/images.annotations', 'w') 116 | cnt = 0 117 | for line in annotation_in: 118 | strs = line.split() 119 | assert len(strs) == 2, 'Not regular byted_chi line' 120 | image_path = strs[0] # the first item 121 | image_name = image_path.split('/')[-1] # the last item 122 | new_img_name = 'byted_chi_' + str(cnt) + '.jpg'; 123 | # JPEGImages 124 | os.system('mv chinese_text_detection/' + image_path 125 | + ' formatted_dataset/JPEGImages/' + new_img_name) 126 | bboxes = strs[1].split(';') 127 | # images.annotations 128 | for bbox in bboxes: 129 | box = bbox.split(',') 130 | assert len(box) == 4, 'Not regular byted_chi bbox' 131 | annotation_out.write(new_img_name + ' text ' + 132 | box[0] + ' ' + box[1] + ' ' + box[2] + ' ' + box[3] + '\n') 133 | cnt += 1 134 | folder_num = 5 # cross validation of folder_num folds 135 | for fold in xrange(0, folder_num): 136 | folder_dir = 'formatted_dataset/ImageSets/folder_num_' + str(fold) 137 | # ImageSets 138 | if not os.path.exists(folder_dir): 139 | os.makedirs(folder_dir) 140 | train_file = open(folder_dir + '/train.txt','w') 141 | trainval_file = open(folder_dir + '/trainval.txt','w') 142 | test_file = open(folder_dir + '/test.txt','w') 143 | val_file = open(folder_dir + '/val.txt','w') 144 | if not os.path.exists(folder_dir): 145 | os.makedirs(folder_dir) 146 | annotation_in = open('chinese_text_detection/image_to_rois.txt', 'r') 147 | cnt = 0 148 | for line in annotation_in: 149 | new_img_pre = 'byted_chi_' + str(cnt); 150 | if cnt % (2 * folder_num) == fold: 151 | test_file.write(new_img_pre + '\n') 152 | elif cnt % (2 * folder_num) == fold + 1: 153 | val_file.write(new_img_pre + '\n') 154 | trainval_file.write(new_img_pre + '\n') 155 | else: 156 | train_file.write(new_img_pre + '\n') 157 | trainval_file.write(new_img_pre + '\n') 158 | cnt += 1 159 | 160 | def main(raw_args): 161 | parser = argparse.ArgumentParser( 162 | formatter_class = argparse.RawTextHelpFormatter, 163 | description = 164 | ''' 165 | 1. Format the dataset annotation into the form of pascal voc: 166 | ./format_annotation.py --dataset byted-chi 167 | ''') 168 | # arguments command 1,2,3 169 | parser.add_argument('--dataset', 170 | choices = ['coco-text', 'byted-chi'], 171 | help = 'Which dataset to format') 172 | args = parser.parse_args(raw_args) 173 | working_dir = os.path.join(os.path.dirname(__file__), '../' + args.dataset) 174 | assert os.path.exists(working_dir), 'Not exists: ' + working_dir 175 | assert os.path.isdir(working_dir), 'Not a dir: ' + working_dir 176 | os.chdir(working_dir) 177 | if not os.path.exists('formatted_dataset/Annotations'): 178 | os.makedirs('formatted_dataset/Annotations') 179 | if not os.path.exists('formatted_dataset/ImageSets/Main'): 180 | os.makedirs('formatted_dataset/ImageSets/Main') 181 | if not os.path.exists('formatted_dataset/JPEGImages'): 182 | os.makedirs('formatted_dataset/JPEGImages') 183 | print 'formating ' + args.dataset 184 | if args.dataset == "byted-chi": 185 | print 'remove chinese_text_detection' 186 | os.system('rm -rf chinese_text_detection') 187 | print 'unzip chinese_text_detection' 188 | os.system('tar zxf chinese_text_detection.tar.gz') 189 | print 'formating ...' 190 | format_byted_chi() 191 | os.system('rm -rf formatted_dataset/ImageSets/Main/') 192 | os.system('ln -s folder_num_0/ formatted_dataset/ImageSets/Main') 193 | elif args.dataset == "coco-text": 194 | print 'remove COCO_Text.json' 195 | os.system('rm COCO_Text.json') 196 | print 'remove train2014' 197 | os.system('rm -rf train2014') 198 | print 'unzip COCO_Text.zip' 199 | os.system('unzip COCO_Text.zip') 200 | print 'unzip train2014.zip' 201 | os.system('unzip train2014.zip') 202 | print 'formating ...' 203 | format_coco_text() 204 | else: 205 | print "not support dataset, to be implemented" 206 | os.chdir('../script/') 207 | os.system('./ann2voc2007.sh ' + args.dataset) 208 | os.system('./rm_headline.sh ../' + args.dataset) 209 | 210 | if __name__ == "__main__": 211 | main(sys.argv[1:]) 212 | -------------------------------------------------------------------------------- /datasets/script/rm_headline.sh: -------------------------------------------------------------------------------- 1 | for entry in $1/formatted_dataset/Annotations/* 2 | do 3 | tail -n +2 "$entry" > "$entry.tmp" && mv "$entry.tmp" "$entry" 4 | done 5 | 6 | -------------------------------------------------------------------------------- /datasets/svt/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except this file 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /datasets/test/img_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jugg1024/Text-Detection-with-FRCN/e54749f111f26f712d0f4d55e99c675c3ad79312/datasets/test/img_1.jpg -------------------------------------------------------------------------------- /datasets/test/img_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jugg1024/Text-Detection-with-FRCN/e54749f111f26f712d0f4d55e99c675c3ad79312/datasets/test/img_5.jpg -------------------------------------------------------------------------------- /datasets/test/img_6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jugg1024/Text-Detection-with-FRCN/e54749f111f26f712d0f4d55e99c675c3ad79312/datasets/test/img_6.jpg -------------------------------------------------------------------------------- /models/deploy.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_ILSVRC_16_layers" 2 | 3 | input: "data" 4 | input_shape { 5 | dim: 1 6 | dim: 3 7 | dim: 224 8 | dim: 224 9 | } 10 | 11 | input: "im_info" 12 | input_shape { 13 | dim: 1 14 | dim: 3 15 | } 16 | 17 | layer { 18 | name: "conv1_1" 19 | type: "Convolution" 20 | bottom: "data" 21 | top: "conv1_1" 22 | param { 23 | lr_mult: 0 24 | decay_mult: 0 25 | } 26 | param { 27 | lr_mult: 0 28 | decay_mult: 0 29 | } 30 | convolution_param { 31 | num_output: 64 32 | pad: 1 33 | kernel_size: 3 34 | } 35 | } 36 | layer { 37 | name: "relu1_1" 38 | type: "ReLU" 39 | bottom: "conv1_1" 40 | top: "conv1_1" 41 | } 42 | layer { 43 | name: "conv1_2" 44 | type: "Convolution" 45 | bottom: "conv1_1" 46 | top: "conv1_2" 47 | param { 48 | lr_mult: 0 49 | decay_mult: 0 50 | } 51 | param { 52 | lr_mult: 0 53 | decay_mult: 0 54 | } 55 | convolution_param { 56 | num_output: 64 57 | pad: 1 58 | kernel_size: 3 59 | } 60 | } 61 | layer { 62 | name: "relu1_2" 63 | type: "ReLU" 64 | bottom: "conv1_2" 65 | top: "conv1_2" 66 | } 67 | layer { 68 | name: "pool1" 69 | type: "Pooling" 70 | bottom: "conv1_2" 71 | top: "pool1" 72 | pooling_param { 73 | pool: MAX 74 | kernel_size: 2 75 | stride: 2 76 | } 77 | } 78 | layer { 79 | name: "conv2_1" 80 | type: "Convolution" 81 | bottom: "pool1" 82 | top: "conv2_1" 83 | param { 84 | lr_mult: 0 85 | decay_mult: 0 86 | } 87 | param { 88 | lr_mult: 0 89 | decay_mult: 0 90 | } 91 | convolution_param { 92 | num_output: 128 93 | pad: 1 94 | kernel_size: 3 95 | } 96 | } 97 | layer { 98 | name: "relu2_1" 99 | type: "ReLU" 100 | bottom: "conv2_1" 101 | top: "conv2_1" 102 | } 103 | layer { 104 | name: "conv2_2" 105 | type: "Convolution" 106 | bottom: "conv2_1" 107 | top: "conv2_2" 108 | param { 109 | lr_mult: 0 110 | decay_mult: 0 111 | } 112 | param { 113 | lr_mult: 0 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 128 118 | pad: 1 119 | kernel_size: 3 120 | } 121 | } 122 | layer { 123 | name: "relu2_2" 124 | type: "ReLU" 125 | bottom: "conv2_2" 126 | top: "conv2_2" 127 | } 128 | layer { 129 | name: "pool2" 130 | type: "Pooling" 131 | bottom: "conv2_2" 132 | top: "pool2" 133 | pooling_param { 134 | pool: MAX 135 | kernel_size: 2 136 | stride: 2 137 | } 138 | } 139 | layer { 140 | name: "conv3_1" 141 | type: "Convolution" 142 | bottom: "pool2" 143 | top: "conv3_1" 144 | param { 145 | lr_mult: 1 146 | decay_mult: 1 147 | } 148 | param { 149 | lr_mult: 2 150 | decay_mult: 0 151 | } 152 | convolution_param { 153 | num_output: 256 154 | pad: 1 155 | kernel_size: 3 156 | } 157 | } 158 | layer { 159 | name: "relu3_1" 160 | type: "ReLU" 161 | bottom: "conv3_1" 162 | top: "conv3_1" 163 | } 164 | layer { 165 | name: "conv3_2" 166 | type: "Convolution" 167 | bottom: "conv3_1" 168 | top: "conv3_2" 169 | param { 170 | lr_mult: 1 171 | decay_mult: 1 172 | } 173 | param { 174 | lr_mult: 2 175 | decay_mult: 0 176 | } 177 | convolution_param { 178 | num_output: 256 179 | pad: 1 180 | kernel_size: 3 181 | } 182 | } 183 | layer { 184 | name: "relu3_2" 185 | type: "ReLU" 186 | bottom: "conv3_2" 187 | top: "conv3_2" 188 | } 189 | layer { 190 | name: "conv3_3" 191 | type: "Convolution" 192 | bottom: "conv3_2" 193 | top: "conv3_3" 194 | param { 195 | lr_mult: 1 196 | decay_mult: 1 197 | } 198 | param { 199 | lr_mult: 2 200 | decay_mult: 0 201 | } 202 | convolution_param { 203 | num_output: 256 204 | pad: 1 205 | kernel_size: 3 206 | } 207 | } 208 | layer { 209 | name: "relu3_3" 210 | type: "ReLU" 211 | bottom: "conv3_3" 212 | top: "conv3_3" 213 | } 214 | layer { 215 | name: "pool3" 216 | type: "Pooling" 217 | bottom: "conv3_3" 218 | top: "pool3" 219 | pooling_param { 220 | pool: MAX 221 | kernel_size: 2 222 | stride: 2 223 | } 224 | } 225 | layer { 226 | name: "conv4_1" 227 | type: "Convolution" 228 | bottom: "pool3" 229 | top: "conv4_1" 230 | param { 231 | lr_mult: 1 232 | decay_mult: 1 233 | } 234 | param { 235 | lr_mult: 2 236 | decay_mult: 0 237 | } 238 | convolution_param { 239 | num_output: 512 240 | pad: 1 241 | kernel_size: 3 242 | } 243 | } 244 | layer { 245 | name: "relu4_1" 246 | type: "ReLU" 247 | bottom: "conv4_1" 248 | top: "conv4_1" 249 | } 250 | layer { 251 | name: "conv4_2" 252 | type: "Convolution" 253 | bottom: "conv4_1" 254 | top: "conv4_2" 255 | param { 256 | lr_mult: 1 257 | decay_mult: 1 258 | } 259 | param { 260 | lr_mult: 2 261 | decay_mult: 0 262 | } 263 | convolution_param { 264 | num_output: 512 265 | pad: 1 266 | kernel_size: 3 267 | } 268 | } 269 | layer { 270 | name: "relu4_2" 271 | type: "ReLU" 272 | bottom: "conv4_2" 273 | top: "conv4_2" 274 | } 275 | layer { 276 | name: "conv4_3" 277 | type: "Convolution" 278 | bottom: "conv4_2" 279 | top: "conv4_3" 280 | param { 281 | lr_mult: 1 282 | decay_mult: 1 283 | } 284 | param { 285 | lr_mult: 2 286 | decay_mult: 0 287 | } 288 | convolution_param { 289 | num_output: 512 290 | pad: 1 291 | kernel_size: 3 292 | } 293 | } 294 | layer { 295 | name: "relu4_3" 296 | type: "ReLU" 297 | bottom: "conv4_3" 298 | top: "conv4_3" 299 | } 300 | layer { 301 | name: "pool4" 302 | type: "Pooling" 303 | bottom: "conv4_3" 304 | top: "pool4" 305 | pooling_param { 306 | pool: MAX 307 | kernel_size: 2 308 | stride: 2 309 | } 310 | } 311 | layer { 312 | name: "conv5_1" 313 | type: "Convolution" 314 | bottom: "pool4" 315 | top: "conv5_1" 316 | param { 317 | lr_mult: 1 318 | decay_mult: 1 319 | } 320 | param { 321 | lr_mult: 2 322 | decay_mult: 0 323 | } 324 | convolution_param { 325 | num_output: 512 326 | pad: 1 327 | kernel_size: 3 328 | } 329 | } 330 | layer { 331 | name: "relu5_1" 332 | type: "ReLU" 333 | bottom: "conv5_1" 334 | top: "conv5_1" 335 | } 336 | layer { 337 | name: "conv5_2" 338 | type: "Convolution" 339 | bottom: "conv5_1" 340 | top: "conv5_2" 341 | param { 342 | lr_mult: 1 343 | decay_mult: 1 344 | } 345 | param { 346 | lr_mult: 2 347 | decay_mult: 0 348 | } 349 | convolution_param { 350 | num_output: 512 351 | pad: 1 352 | kernel_size: 3 353 | } 354 | } 355 | layer { 356 | name: "relu5_2" 357 | type: "ReLU" 358 | bottom: "conv5_2" 359 | top: "conv5_2" 360 | } 361 | layer { 362 | name: "conv5_3" 363 | type: "Convolution" 364 | bottom: "conv5_2" 365 | top: "conv5_3" 366 | param { 367 | lr_mult: 1 368 | decay_mult: 1 369 | } 370 | param { 371 | lr_mult: 2 372 | decay_mult: 0 373 | } 374 | convolution_param { 375 | num_output: 512 376 | pad: 1 377 | kernel_size: 3 378 | } 379 | } 380 | layer { 381 | name: "relu5_3" 382 | type: "ReLU" 383 | bottom: "conv5_3" 384 | top: "conv5_3" 385 | } 386 | 387 | #========= RPN ============ 388 | 389 | layer { 390 | name: "rpn_conv/3x3" 391 | type: "Convolution" 392 | bottom: "conv5_3" 393 | top: "rpn/output" 394 | param { lr_mult: 1.0 decay_mult: 1.0 } 395 | param { lr_mult: 2.0 decay_mult: 0 } 396 | convolution_param { 397 | num_output: 512 398 | kernel_size: 3 pad: 1 stride: 1 399 | weight_filler { type: "gaussian" std: 0.01 } 400 | bias_filler { type: "constant" value: 0 } 401 | } 402 | } 403 | layer { 404 | name: "rpn_relu/3x3" 405 | type: "ReLU" 406 | bottom: "rpn/output" 407 | top: "rpn/output" 408 | } 409 | 410 | layer { 411 | name: "rpn_cls_score" 412 | type: "Convolution" 413 | bottom: "rpn/output" 414 | top: "rpn_cls_score" 415 | param { lr_mult: 1.0 decay_mult: 1.0 } 416 | param { lr_mult: 2.0 decay_mult: 0 } 417 | convolution_param { 418 | num_output: 18 # 2(bg/fg) * 9(anchors) 419 | kernel_size: 1 pad: 0 stride: 1 420 | weight_filler { type: "gaussian" std: 0.01 } 421 | bias_filler { type: "constant" value: 0 } 422 | } 423 | } 424 | layer { 425 | name: "rpn_bbox_pred" 426 | type: "Convolution" 427 | bottom: "rpn/output" 428 | top: "rpn_bbox_pred" 429 | param { lr_mult: 1.0 decay_mult: 1.0 } 430 | param { lr_mult: 2.0 decay_mult: 0 } 431 | convolution_param { 432 | num_output: 36 # 4 * 9(anchors) 433 | kernel_size: 1 pad: 0 stride: 1 434 | weight_filler { type: "gaussian" std: 0.01 } 435 | bias_filler { type: "constant" value: 0 } 436 | } 437 | } 438 | layer { 439 | bottom: "rpn_cls_score" 440 | top: "rpn_cls_score_reshape" 441 | name: "rpn_cls_score_reshape" 442 | type: "Reshape" 443 | reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } 444 | } 445 | 446 | #========= RoI Proposal ============ 447 | 448 | layer { 449 | name: "rpn_cls_prob" 450 | type: "Softmax" 451 | bottom: "rpn_cls_score_reshape" 452 | top: "rpn_cls_prob" 453 | } 454 | layer { 455 | name: 'rpn_cls_prob_reshape' 456 | type: 'Reshape' 457 | bottom: 'rpn_cls_prob' 458 | top: 'rpn_cls_prob_reshape' 459 | reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } 460 | } 461 | layer { 462 | name: 'proposal' 463 | type: 'Python' 464 | bottom: 'rpn_cls_prob_reshape' 465 | bottom: 'rpn_bbox_pred' 466 | bottom: 'im_info' 467 | top: 'rois' 468 | python_param { 469 | module: 'rpn.proposal_layer' 470 | layer: 'ProposalLayer' 471 | param_str: "'feat_stride': 16" 472 | } 473 | } 474 | 475 | #========= RCNN ============ 476 | 477 | layer { 478 | name: "roi_pool5" 479 | type: "ROIPooling" 480 | bottom: "conv5_3" 481 | bottom: "rois" 482 | top: "pool5" 483 | roi_pooling_param { 484 | pooled_w: 7 485 | pooled_h: 7 486 | spatial_scale: 0.0625 # 1/16 487 | } 488 | } 489 | layer { 490 | name: "fc6" 491 | type: "InnerProduct" 492 | bottom: "pool5" 493 | top: "fc6" 494 | param { 495 | lr_mult: 1 496 | decay_mult: 1 497 | } 498 | param { 499 | lr_mult: 2 500 | decay_mult: 0 501 | } 502 | inner_product_param { 503 | num_output: 4096 504 | } 505 | } 506 | layer { 507 | name: "relu6" 508 | type: "ReLU" 509 | bottom: "fc6" 510 | top: "fc6" 511 | } 512 | layer { 513 | name: "drop6" 514 | type: "Dropout" 515 | bottom: "fc6" 516 | top: "fc6" 517 | dropout_param { 518 | dropout_ratio: 0.5 519 | } 520 | } 521 | layer { 522 | name: "fc7" 523 | type: "InnerProduct" 524 | bottom: "fc6" 525 | top: "fc7" 526 | param { 527 | lr_mult: 1 528 | decay_mult: 1 529 | } 530 | param { 531 | lr_mult: 2 532 | decay_mult: 0 533 | } 534 | inner_product_param { 535 | num_output: 4096 536 | } 537 | } 538 | layer { 539 | name: "relu7" 540 | type: "ReLU" 541 | bottom: "fc7" 542 | top: "fc7" 543 | } 544 | layer { 545 | name: "drop7" 546 | type: "Dropout" 547 | bottom: "fc7" 548 | top: "fc7" 549 | dropout_param { 550 | dropout_ratio: 0.5 551 | } 552 | } 553 | layer { 554 | name: "cls_score" 555 | type: "InnerProduct" 556 | bottom: "fc7" 557 | top: "cls_score" 558 | param { 559 | lr_mult: 1 560 | decay_mult: 1 561 | } 562 | param { 563 | lr_mult: 2 564 | decay_mult: 0 565 | } 566 | inner_product_param { 567 | num_output: 2 568 | weight_filler { 569 | type: "gaussian" 570 | std: 0.01 571 | } 572 | bias_filler { 573 | type: "constant" 574 | value: 0 575 | } 576 | } 577 | } 578 | layer { 579 | name: "bbox_pred" 580 | type: "InnerProduct" 581 | bottom: "fc7" 582 | top: "bbox_pred" 583 | param { 584 | lr_mult: 1 585 | decay_mult: 1 586 | } 587 | param { 588 | lr_mult: 2 589 | decay_mult: 0 590 | } 591 | inner_product_param { 592 | num_output: 8 593 | weight_filler { 594 | type: "gaussian" 595 | std: 0.001 596 | } 597 | bias_filler { 598 | type: "constant" 599 | value: 0 600 | } 601 | } 602 | } 603 | layer { 604 | name: "cls_prob" 605 | type: "Softmax" 606 | bottom: "cls_score" 607 | top: "cls_prob" 608 | } 609 | -------------------------------------------------------------------------------- /script/text_detect_demo.sh: -------------------------------------------------------------------------------- 1 | ./py-faster-rcnn/tools/text_detect_demo.py \ 2 | --gpu 0 \ 3 | --net models/deploy.prototxt \ 4 | --model models/vgg16_faster_rcnn_fine_tune_on_coco.caffemodel \ 5 | --dataset datasets/test 6 | --------------------------------------------------------------------------------