├── .gitignore
├── .gitmodules
├── README.md
├── datasets
    ├── coco-text
    │   └── .gitignore
    ├── hust-tr400
    │   └── .gitignore
    ├── icdar-2013
    │   └── .gitignore
    ├── icdar-2015
    │   └── .gitignore
    ├── msra-td500
    │   └── .gitignore
    ├── script
    │   ├── ann2voc2007.m
    │   ├── ann2voc2007.sh
    │   ├── fetch_dataset.sh
    │   ├── format_annotation.py
    │   └── rm_headline.sh
    ├── svt
    │   └── .gitignore
    └── test
    │   ├── img_1.jpg
    │   ├── img_5.jpg
    │   └── img_6.jpg
├── models
    └── deploy.prototxt
└── script
    └── text_detect_demo.sh


/.gitignore:
--------------------------------------------------------------------------------
1 | **.o
2 | **.tar
3 | **.tar.gz
4 | **.zip
5 | **.jpg
6 | **.json
7 | *.pyc
8 | **.xml
9 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "py-faster-rcnn"]
 2 | 	path = py-faster-rcnn
 3 | 	url = https://github.com/jugg1024/py-faster-rcnn.git
 4 | [submodule "datasets/coco-text/coco-text"]
 5 | 	path = datasets/coco-text/coco-text
 6 | 	url = https://github.com/jugg1024/coco-text.git
 7 | [submodule "datasets/coco-text/coco-text-tool"]
 8 | 	path = datasets/coco-text/coco-text-tool
 9 | 	url = https://github.com/jugg1024/coco-text.git
10 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ### Text Detection Using [py-faster-rcnn](https://github.com/rbgirshick/py-faster-rcnn/blob/master/README.md).
  2 | 
  3 | # image #
  4 | 
  5 | ### Introduction
  6 | 
  7 | This repository aims to provide an example of training text-detection models using *faster-rcnn*.
  8 | 
  9 | ### Download repo 
 10 | 
 11 |   + Clone the repository
 12 |   
 13 |   ```Shell
 14 |   # Make sure to clone with --recursive
 15 |   git clone --recursive https://github.com/jugg1024/Text-Detection-with-FRCN.git
 16 |   ```
 17 | 
 18 | ### Compile
 19 | 
 20 |   + Compile py-faster-rcnn
 21 | 
 22 |   2.1 change the branch of py-faster-rcnn to text-detection-demo
 23 | ```Shell
 24 | 	cd $Text-Detection-with-FRCN/py-faster-rcnn
 25 |     	git checkout text-detection
 26 | ```
 27 | 
 28 |   2.2 Build Caffe and pycaffe.
 29 | 
 30 | ```Shell
 31 | # ensure your environment supports training with caffe
 32 | cd $Text-Detection-with-FRCN/py-faster-rcnn/caffe-fast-rcnn
 33 | cp Makefile.config.example Makefile.config
 34 | # adjust the Makefile.config
 35 | make -j16 && make pycaffe    # here only python api is used.
 36 | # test if caffe python api is ok.
 37 | cd python
 38 | python
 39 | >>> import caffe
 40 | >>> caffe.__version__
 41 | '1.0.0-rc3'
 42 | ```
 43 | 
 44 |   2.3 Build the Cython modules.
 45 | 
 46 | ```Shell
 47 | cd $Text-Detection-with-FRCN/py-faster-rcnn/lib
 48 | make
 49 | ```
 50 | 	
 51 | ### Run demo
 52 | 
 53 |   + Run text detection demo
 54 | 
 55 |   3.1 download pre-trained model
 56 | 
 57 | 	URL: http://pan.baidu.com/s/1dE2Ori5  Extract Code: phxk
 58 | 	
 59 | 	LINK FROM HUBIC: https://ovh.to/SivaG2
 60 | ```Shell
 61 | ln -s $DOWNLOAD_MODEL_PATH $Text-Detection-with-FRCN/models/vgg16_faster_rcnn_fine_tune_on_coco.caffemodel
 62 | ```
 63 |   3.2 run demo
 64 | 
 65 | ```Shell
 66 | cd $Text-Detection-with-FRCN/
 67 | ./script/text_detect_demo.sh
 68 | ```
 69 | 	Results are in output_img
 70 | 
 71 | 
 72 | ### Further
 73 | 
 74 |   If the model is not good enough, you can train with your own dataset; coco-text is taken as the example below.
 75 |   
 76 |   + training 
 77 | 
 78 |   4.1 download coco-text dataset
 79 | 
 80 | ```Shell
 81 | cd $Text-Detection-with-FRCN/datasets/script
 82 | ./fetch_dataset.sh coco-text
 83 | # the download takes a long time!
 84 | # ensure you have both data and label
 85 | # for coco-text, the label is in COCO_Text.json and the data is in train2014.zip
 86 | ```
 87 | 
 88 |   4.2 download pre-trained model
 89 | 
 90 | ```Shell
 91 | # fine-tune from this model; you can also use a model you trained before
 92 | cd $Text-Detection-with-FRCN/py-faster-rcnn
 93 | ./data/scripts/fetch_imagenet_models.sh
 94 | # the download takes a long time!
 95 | ```
 96 | 
 97 |   4.3 format the data (you should write your own code here)
 98 | 
 99 | ```Shell
100 | # format the raw image and label into the type of pascal_voc
101 | # follow the code in $Text-Detection-with-FRCN/datasets/script/format_annotation.py
102 | cd $Text-Detection-with-FRCN/datasets/script
103 | ./format_annotation.py --dataset coco-text
104 | ```
105 | 	
106 |   4.4 create a softlink from the formatted data to the working directory
107 |        
108 | ```Shell
109 | # link your data folder to train_data
110 | cd $Text-Detection-with-FRCN/datasets/
111 | ln -s coco-text train_data    # $YOUR_DATA
112 | ```       
113 |         
114 |   4.5 training
115 |       
116 | ```Shell
117 | cd $Text-Detection-with-FRCN/py-faster-rcnn/
118 | ./experiments/scripts/faster_rcnn_end2end.sh 0 VGG16 pascal_voc
119 | ```
120 | 


--------------------------------------------------------------------------------
/datasets/coco-text/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 | !coco-text-tool
6 | 


--------------------------------------------------------------------------------
/datasets/hust-tr400/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 | 


--------------------------------------------------------------------------------
/datasets/icdar-2013/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 | 


--------------------------------------------------------------------------------
/datasets/icdar-2015/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 | 


--------------------------------------------------------------------------------
/datasets/msra-td500/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 | 


--------------------------------------------------------------------------------
/datasets/script/ann2voc2007.m:
--------------------------------------------------------------------------------
  1 | function ann2voc2007(input_dir)
  2 |   curpath = mfilename('fullpath');
  3 |   [pathstr,~,~] = fileparts(curpath)
  4 |   if input_dir(end) == '/'
  5 |     input_dir = input_dir(1:end-1);
  6 |   end
  7 |   [~,input_dir,~] = fileparts(input_dir);
  8 |   input_dir = [pathstr '/../' input_dir '/formatted_dataset']
  9 |   imgpath = [input_dir '/JPEGImages/']
 10 |   txtpath = [input_dir '/images.annotations']
 11 |   xmlpath_new = [input_dir '/Annotations/'];
 12 |   foldername = 'VOC2007';
 13 |   coco = containers.Map();
 14 |   fidin = fopen(txtpath, 'r');
 15 |   cnt = 0;
 16 |   while ~feof(fidin)
 17 |     tline = fgetl(fidin);
 18 |     str = regexp(tline, ' ', 'split');
 19 |     xmlname = strrep(str{1},'.jpg','.xml');
 20 |     info = imfinfo([imgpath '/' str{1}]);
 21 |     str{3} = max(str2double(str{3}), 1);
 22 |     str{4} = max(str2double(str{4}), 1);
 23 |     str{5} = min(str2double(str{5}), info.Width);
 24 |     str{6} = min(str2double(str{6}), info.Height);
 25 |     if str{3} >= str{5} || str{4} >= str{6} || str{3} <= 0 || str{4} <= 0 || str{5} > info.Width...
 26 |       str{6} > info.Height
 27 |       continue;
 28 |     end
 29 |     cnt = cnt + 1
 30 |     if exist([imgpath '/' str{1}])
 31 |       if isKey(coco,xmlname)
 32 |         Createnode = coco(xmlname);
 33 |         object_node = Createnode.createElement('object');
 34 |         Root = Createnode.getDocumentElement;
 35 |         Root.appendChild(object_node);
 36 |         node=Createnode.createElement('name');
 37 |         node.appendChild(Createnode.createTextNode(str{2}));
 38 |         object_node.appendChild(node);
 39 |         node=Createnode.createElement('pose');
 40 |         node.appendChild(Createnode.createTextNode('Unspecified'));
 41 |         object_node.appendChild(node);
 42 |         node=Createnode.createElement('truncated');
 43 |         node.appendChild(Createnode.createTextNode('0'));
 44 |         object_node.appendChild(node);
 45 |         node=Createnode.createElement('difficult');
 46 |         node.appendChild(Createnode.createTextNode('0'));
 47 |         object_node.appendChild(node);
 48 |         bndbox_node=Createnode.createElement('bndbox');
 49 |         object_node.appendChild(bndbox_node);
 50 |         node=Createnode.createElement('xmin');
 51 |         node.appendChild(Createnode.createTextNode(num2str(str{3})));
 52 |         bndbox_node.appendChild(node);
 53 |         node=Createnode.createElement('ymin');
 54 |         node.appendChild(Createnode.createTextNode(num2str(str{4})));
 55 |         bndbox_node.appendChild(node);
 56 |         node=Createnode.createElement('xmax');
 57 |         node.appendChild(Createnode.createTextNode(num2str(str{5})));
 58 |         bndbox_node.appendChild(node);
 59 |         node=Createnode.createElement('ymax');
 60 |         node.appendChild(Createnode.createTextNode(num2str(str{6})));
 61 |         bndbox_node.appendChild(node);
 62 |       else
 63 |         Createnode = com.mathworks.xml.XMLUtils.createDocument('annotation');
 64 |         Root = Createnode.getDocumentElement;
 65 |         node = Createnode.createElement('folder');
 66 |         node.appendChild(Createnode.createTextNode(foldername));
 67 |         Root.appendChild(node);
 68 |         node = Createnode.createElement('filename');
 69 |         node.appendChild(Createnode.createTextNode(str{1}));
 70 |         Root.appendChild(node);
 71 |         source_node = Createnode.createElement('source');
 72 |         Root.appendChild(source_node);
 73 |         node = Createnode.createElement('database');
 74 |         node.appendChild(Createnode.createTextNode('MS COCO-Text'));
 75 |         source_node.appendChild(node);
 76 |         node = Createnode.createElement('annotation');
 77 |         node.appendChild(Createnode.createTextNode('MS COCO-Text 2014'));
 78 |         source_node.appendChild(node);
 79 |         node=Createnode.createElement('image');
 80 |         node.appendChild(Createnode.createTextNode('NULL'));
 81 |         source_node.appendChild(node);
 82 |         node=Createnode.createElement('flickrid');
 83 |         node.appendChild(Createnode.createTextNode('NULL'));
 84 |         source_node.appendChild(node);
 85 |         owner_node=Createnode.createElement('owner');
 86 |         Root.appendChild(owner_node);
 87 |         node=Createnode.createElement('flickrid');
 88 |         node.appendChild(Createnode.createTextNode('NULL'));
 89 |         owner_node.appendChild(node);
 90 |         node=Createnode.createElement('name');
 91 |         node.appendChild(Createnode.createTextNode('ligen'));
 92 |         owner_node.appendChild(node);
 93 |         size_node=Createnode.createElement('size');
 94 |         Root.appendChild(size_node);
 95 |         node=Createnode.createElement('width');
 96 |         node.appendChild(Createnode.createTextNode(num2str(info.Width)));
 97 |         size_node.appendChild(node);
 98 |         node=Createnode.createElement('height');
 99 |         node.appendChild(Createnode.createTextNode(num2str(info.Height)));
100 |         size_node.appendChild(node);
101 |         node=Createnode.createElement('depth');
102 |         node.appendChild(Createnode.createTextNode(num2str(info.BitDepth / 8)));
103 |         size_node.appendChild(node);
104 |         node=Createnode.createElement('segmented');
105 |         node.appendChild(Createnode.createTextNode('0'));
106 |         Root.appendChild(node);
107 |         object_node=Createnode.createElement('object');
108 |         Root.appendChild(object_node);
109 |         node=Createnode.createElement('name');
110 |         node.appendChild(Createnode.createTextNode(str{2}));
111 |         object_node.appendChild(node);
112 |         node=Createnode.createElement('pose');
113 |         node.appendChild(Createnode.createTextNode('Unspecified'));
114 |         object_node.appendChild(node);
115 |         node=Createnode.createElement('truncated');
116 |         node.appendChild(Createnode.createTextNode('0'));
117 |         object_node.appendChild(node);
118 |         node=Createnode.createElement('difficult');
119 |         node.appendChild(Createnode.createTextNode('0'));
120 |         object_node.appendChild(node);
121 |         bndbox_node=Createnode.createElement('bndbox');
122 |         object_node.appendChild(bndbox_node);
123 |         node=Createnode.createElement('xmin');
124 |         node.appendChild(Createnode.createTextNode(num2str(str{3})));
125 |         bndbox_node.appendChild(node);
126 |         node=Createnode.createElement('ymin');
127 |         node.appendChild(Createnode.createTextNode(num2str(str{4})));
128 |         bndbox_node.appendChild(node);
129 |         node=Createnode.createElement('xmax');
130 |         node.appendChild(Createnode.createTextNode(num2str(str{5})));
131 |         bndbox_node.appendChild(node);
132 |         node=Createnode.createElement('ymax');
133 |         node.appendChild(Createnode.createTextNode(num2str(str{6})));
134 |         bndbox_node.appendChild(node);
135 |         coco(xmlname) = Createnode;
136 |       end
137 |     end
138 |   end
139 |   fclose(fidin);
140 |   keyss = keys(coco);
141 |   for i = 1:length(keyss)
142 |     xmlwrite([xmlpath_new '/' keyss{i}], coco(keyss{i}));
143 |   end
144 | end
145 | 


--------------------------------------------------------------------------------
/datasets/script/ann2voc2007.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Run the MATLAB converter ann2voc2007 for one dataset.
# Usage: ./ann2voc2007.sh <dataset>      e.g. ./ann2voc2007.sh coco-text
#
# The call is wrapped in try/catch so that a failing conversion prints the
# error and makes MATLAB exit with status 1, instead of leaving it waiting at
# its prompt (the trailing quit is never reached after an error).
if [ -z "$1" ]; then
  echo "Usage: $0 <dataset>" >&2
  exit 1
fi
matlab -nodisplay -nodesktop -r "try, ann2voc2007('$1'); catch err, disp(getReport(err)); exit(1); end; exit(0);"
2 | 


--------------------------------------------------------------------------------
/datasets/script/fetch_dataset.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # example Usage: ./fetch_dataset.sh coco-text
 4 | 
 5 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
 6 | cd $DIR
 7 | 
 8 | #checking md5sum
 9 | checkmd5sum() {
10 |   os=`uname -s`
11 |   if [ "$os" = "Linux" ]; then
12 |     checksum=`md5sum $DDIR/$FILE | awk '{ print $1 }'`
13 |   elif [ "$os" = "Darwin" ]; then
14 |     checksum=`cat $DDIR/$FILE | md5`
15 |   fi
16 |   if [ "$checksum" = "$CHECKSUM" ]; then
17 |     echo "Checksum is correct. No need to download."
18 |     DOWNLOAD="no"
19 |   else
20 |     echo "Checksum is incorrect. Need to download again."
21 |     DOWNLOAD="yes"
22 |   fi
23 | }
24 | 
# Decide whether $DDIR/$FILE still has to be downloaded.
# Reads:  DDIR, FILE, CHECKSUM
# Sets:   DOWNLOAD="no" when the file exists and CHECKSUM is "nocheck";
#         otherwise defers to checkmd5sum. When the file does not exist,
#         DOWNLOAD is left as the caller set it.
checkfile() {
  if [ -f "$DDIR/$FILE" ]; then
    echo "File already exists."
    if [ "$CHECKSUM" = "nocheck" ]; then
      echo "File is too large. No check is applyed."
      DOWNLOAD="no"
    else
      echo "Checking md5..."
      checkmd5sum
    fi
  fi
}
38 | 
# Download one archive into a dataset directory and unpack it there.
# Arguments: <dir> <file> <url> <md5 | nocheck> <zip | tar>
# Sets the globals DDIR, FILE, URL, CHECKSUM, TYPE and DOWNLOAD (read by
# checkfile and checkmd5sum).
# Returns 1 when the download or the extraction fails, 0 otherwise.
download_file() {
  DDIR=$1
  FILE=$2
  URL=$3
  CHECKSUM=$4
  TYPE=$5
  DOWNLOAD="yes"
  checkfile
  if [ "$DOWNLOAD" = "yes" ]; then
    echo "Downloading $FILE..."
    if ! wget "$URL" -O "$DDIR/$FILE"; then
      echo "Download of $FILE failed." >&2
      # wget -O leaves a partial file behind; remove it so that a later run
      # with "nocheck" does not accept it as complete.
      rm -f "$DDIR/$FILE"
      return 1
    fi
    echo "Unzipping..."
    # Subshell: the caller's working directory is untouched whatever DDIR is.
    (
      cd "$DDIR" || exit 1
      if [ "$TYPE" = "zip" ]; then
        unzip "$FILE"
      elif [ "$TYPE" = "tar" ]; then
        tar zxvf "$FILE"
      fi
    ) || { echo "Extraction of $FILE failed." >&2; return 1; }
    echo "Done. Please run this command again to verify that checksum = $CHECKSUM."
  fi
}
62 | 
# Fetch the archives of the dataset named by the first argument.
# An unknown or missing name is reported instead of silently doing nothing.
case "$1" in
  coco-text)
    download_file "$1" COCO_Text.zip https://s3.amazonaws.com/cocotext/COCO_Text.zip 5cecfc1081b2ae7fdea75e6c9a9dec3b zip
    download_file "$1" train2014.zip http://msvocds.blob.core.windows.net/coco2014/train2014.zip nocheck zip
    ;;
  hust-tr400)
    download_file "$1" HUST-TR400.zip http://mc.eistar.net/UpLoadFiles/dataset/HUST-TR400.zip f11d974da7f39c7d09addb750baa4e1a zip
    ;;
  *)
    echo "Usage: $0 {coco-text|hust-tr400}" >&2
    exit 1
    ;;
esac


--------------------------------------------------------------------------------
/datasets/script/format_annotation.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import argparse
  3 | import os
  4 | import pprint
  5 | import sys
  6 | import time
  7 | from os.path import isfile, join
  8 | from os import listdir
  9 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../coco-text/coco-text-tool'))
 10 | import coco_text
 11 | 
 12 | def generate_xml_from_annotation():
 13 |   # to be implement
 14 |   print 'to be implement'
 15 | 
 16 | def format_coco_text():
 17 |   print 'format coco_text dataset: 80percent training, 10percent valing, 10percent testing'
 18 |   # read annotations
 19 |   # in : annotate_id imagename    bbox(xmin,ymin,w,h);
 20 |   # out: imgprefix label(text)    bbox1(xmin,ymin,xmax,ymax)
 21 |   #      imgprefix label(text)    bbox2
 22 |   #  import the annotations of coco-text
 23 |   if not os.path.exists('train2014'):
 24 |     print 'train2014/ not found, please unzipping'
 25 |     return -1;
 26 |   if not os.path.exists('COCO_Text.json'):
 27 |     print 'COCO_Text.json not found, please unzipping'
 28 |     return -1;
 29 |     
 30 |   train_file = open('formatted_dataset/ImageSets/Main/train.txt','w')
 31 |   trainval_file = open('formatted_dataset/ImageSets/Main/trainval.txt','w')
 32 |   test_file = open('formatted_dataset/ImageSets/Main/test.txt','w')
 33 |   val_file = open('formatted_dataset/ImageSets/Main/val.txt','w')
 34 | 
 35 |   annotation_in = coco_text.COCO_Text('COCO_Text.json')
 36 |   annotation_out = open('formatted_dataset/images.annotations', 'w')
 37 | 
 38 |   # select training image
 39 |   ann_ids = annotation_in.getAnnIds(imgIds=annotation_in.train, 
 40 |       catIds=[('legibility','legible'),('class','machine printed')])
 41 |   print 'train annotations:' + str(len(ann_ids))  
 42 |   anns = annotation_in.loadAnns(ann_ids)
 43 |   imgid_set = set()
 44 |   for ann in anns: 
 45 |     im_id_str = str(ann['image_id'])
 46 |     imgprefix = im_id_str
 47 |     for i in xrange(0, 12 - len(im_id_str)):
 48 |       imgprefix = '0' + imgprefix
 49 |     imgprefix = 'COCO_train2014_' + imgprefix
 50 |     img_name = imgprefix + '.jpg'
 51 |     # images.annotations
 52 |     bbox = ann['bbox']
 53 |     xmin = int(round(bbox[0]))
 54 |     ymin = int(round(bbox[1]))
 55 |     xmax = int(round(bbox[0] + bbox[2]))
 56 |     ymax = int(round(bbox[1] + bbox[3]))
 57 |     annotation_out.write(img_name + ' text ' + str(xmin) + ' ' + str(ymin) + ' ' + str(xmax) + ' ' + str(ymax) + '\n')
 58 |     if not ann['image_id'] in imgid_set:
 59 |       # ImageSets train
 60 |       train_file.write(imgprefix + '\n')
 61 |       trainval_file.write(imgprefix + '\n')
 62 |       # JPEGImages train
 63 |       if not os.path.isfile('formatted_dataset/JPEGImages/' + img_name): 
 64 |         os.system('mv train2014/' + img_name + ' formatted_dataset/JPEGImages')
 65 |     imgid_set.add(ann['image_id'])
 66 | 
 67 |   # select valing and testing image
 68 |   ann_ids = annotation_in.getAnnIds(imgIds=annotation_in.val, 
 69 |       catIds=[('legibility','legible'),('class','machine printed')])
 70 |   print 'val annotations:' + str(len(ann_ids))  
 71 |   anns = annotation_in.loadAnns(ann_ids)
 72 |   imgid_set = set()
 73 |   cnt = 0
 74 |   for ann in anns:
 75 |     cnt += 1
 76 |     im_id_str = str(ann['image_id'])
 77 |     imgprefix = im_id_str
 78 |     for i in xrange(0, 12 - len(im_id_str)):
 79 |       imgprefix = '0' + imgprefix
 80 |     imgprefix = 'COCO_train2014_' + imgprefix
 81 |     img_name = imgprefix + '.jpg'
 82 |     # images.annotations
 83 |     bbox = ann['bbox']
 84 |     xmin = int(round(bbox[0]))
 85 |     ymin = int(round(bbox[1]))
 86 |     xmax = int(round(bbox[0] + bbox[2]))
 87 |     ymax = int(round(bbox[1] + bbox[3]))
 88 |     annotation_out.write(img_name + ' text ' + str(xmin) + ' ' + str(ymin) + ' ' + str(xmax) + ' ' + str(ymax) + '\n')
 89 |     if not ann['image_id'] in imgid_set:
 90 |       # ImageSets train or test
 91 |       if cnt % 4 == 1:
 92 |         test_file.write(imgprefix + '\n')
 93 |       else:
 94 |         val_file.write(imgprefix + '\n')
 95 |         trainval_file.write(imgprefix + '\n')
 96 |       # JPEGImages val or test
 97 |       if not os.path.isfile('formatted_dataset/JPEGImages/' + img_name): 
 98 |         os.system('mv train2014/' + img_name + ' formatted_dataset/JPEGImages')
 99 |     imgid_set.add(ann['image_id'])
100 | 
101 | 
102 | 
103 | 
def format_byted_chi():
  """Convert the unpacked byted_chi data in the current directory.

  Reads chinese_text_detection/image_to_rois.txt, whose lines are
      <image path> <xmin,ymin,xmax,ymax>[;<xmin,ymin,xmax,ymax>...]
  Moves each image to formatted_dataset/JPEGImages/byted_chi_<n>.jpg, writes
  one "<image> text xmin ymin xmax ymax" line per box to
  formatted_dataset/images.annotations, and writes train/val/trainval/test
  lists for 5 folds under formatted_dataset/ImageSets/folder_num_<fold>.

  Returns -1 when chinese_text_detection/ is missing, None otherwise.
  Raises AssertionError on a line or box that does not have the shape above.
  """
  import shutil  # local import: the file header does not import shutil
  print('format byted_chi dataset: 80percent training, 10percent valing, 10percent testing')
  if not os.path.exists('chinese_text_detection'):
    print('chinese_text_detection/ not found, please unzipping')
    return -1

  image_total = 0
  with open('chinese_text_detection/image_to_rois.txt', 'r') as annotation_in, \
       open('formatted_dataset/images.annotations', 'w') as annotation_out:
    for line in annotation_in:
      strs = line.split()
      assert len(strs) == 2, 'Not regular byted_chi line'
      image_path = strs[0]
      new_img_name = 'byted_chi_' + str(image_total) + '.jpg'
      # JPEGImages: moved without a shell, so characters in image_path are
      # never interpreted as shell syntax.
      source = 'chinese_text_detection/' + image_path
      if os.path.isfile(source):
        shutil.move(source, 'formatted_dataset/JPEGImages/' + new_img_name)
      else:
        print('warning: image not found: ' + source)
      # images.annotations
      for bbox in strs[1].split(';'):
        box = bbox.split(',')
        assert len(box) == 4, 'Not regular byted_chi bbox'
        annotation_out.write(new_img_name + ' text ' +
          box[0] + ' ' + box[1] + ' ' + box[2] + ' ' + box[3] + '\n')
      image_total += 1

  folder_num = 5 # cross validation of folder_num folds
  for fold in range(folder_num):
    folder_dir = 'formatted_dataset/ImageSets/folder_num_' + str(fold)
    if not os.path.exists(folder_dir):
      os.makedirs(folder_dir)
    with open(folder_dir + '/train.txt', 'w') as train_file, \
         open(folder_dir + '/trainval.txt', 'w') as trainval_file, \
         open(folder_dir + '/test.txt', 'w') as test_file, \
         open(folder_dir + '/val.txt', 'w') as val_file:
      # Images are numbered in file order, so the count from the first pass
      # is enough; the list does not need to be read again per fold.
      for index in range(image_total):
        new_img_pre = 'byted_chi_' + str(index)
        bucket = index % (2 * folder_num)
        if bucket == fold:
          test_file.write(new_img_pre + '\n')
        elif bucket == fold + 1:
          val_file.write(new_img_pre + '\n')
          trainval_file.write(new_img_pre + '\n')
        else:
          train_file.write(new_img_pre + '\n')
          trainval_file.write(new_img_pre + '\n')
159 | 
def main(raw_args):
  """Format one dataset into the PASCAL VOC layout and generate its XML.

  raw_args -- command line arguments without the program name; --dataset is
              required and must be 'coco-text' or 'byted-chi'.

  Works inside <directory of this file>/../<dataset>: unpacks the raw
  archives, runs the matching format_* function, then runs ann2voc2007.sh and
  rm_headline.sh from the script directory.

  Returns -1 when formatting fails (the XML stage is then skipped), None on
  success. Raises SystemExit on invalid arguments and AssertionError when the
  dataset directory is missing.
  """
  parser = argparse.ArgumentParser(
  formatter_class = argparse.RawTextHelpFormatter,
  description =
  '''
  1. Format the dataset annotation into the form of pascal voc:
  ./format_annotation.py --dataset byted-chi
  ''')
  # Without required=True a missing --dataset reached '../' + None below.
  parser.add_argument('--dataset',
            required = True,
            choices = ['coco-text', 'byted-chi'],
            help = 'Which dataset to format')
  args = parser.parse_args(raw_args)
  # Absolute script directory: the way back from the dataset directory must
  # not depend on '..', which is wrong when the dataset directory is a symlink.
  script_dir = os.path.dirname(os.path.realpath(__file__))
  working_dir = os.path.join(script_dir, '..', args.dataset)
  assert os.path.exists(working_dir), 'Not exists: ' + working_dir
  assert os.path.isdir(working_dir), 'Not a dir: ' + working_dir
  os.chdir(working_dir)
  for sub_dir in ('formatted_dataset/Annotations',
                  'formatted_dataset/ImageSets/Main',
                  'formatted_dataset/JPEGImages'):
    if not os.path.exists(sub_dir):
      os.makedirs(sub_dir)
  print('formating ' + args.dataset)
  status = None
  if args.dataset == "byted-chi":
    print('remove chinese_text_detection')
    os.system('rm -rf chinese_text_detection')
    print('unzip chinese_text_detection')
    os.system('tar zxf chinese_text_detection.tar.gz')
    print('formating ...')
    status = format_byted_chi()
    if status != -1:
      # Expose fold 0 under the name the training code reads (Main).
      os.system('rm -rf formatted_dataset/ImageSets/Main/')
      os.system('ln -s folder_num_0/ formatted_dataset/ImageSets/Main')
  elif args.dataset == "coco-text":
    print('remove COCO_Text.json')
    os.system('rm COCO_Text.json')
    print('remove train2014')
    os.system('rm -rf train2014')
    print('unzip COCO_Text.zip')
    os.system('unzip COCO_Text.zip')
    print('unzip train2014.zip')
    os.system('unzip train2014.zip')
    print('formating ...')
    status = format_coco_text()
  else:
    print("not support dataset, to be implemented")
    status = -1
  if status == -1:
    # The formatters return -1 when their input is missing; generating XML
    # from a stale or empty annotation list would only hide that.
    print('formatting failed, xml generation skipped')
    return -1
  os.chdir(script_dir)
  os.system('./ann2voc2007.sh ' + args.dataset)
  os.system('./rm_headline.sh ../' + args.dataset)

if __name__ == "__main__":
  # Non-zero exit status when formatting failed.
  sys.exit(1 if main(sys.argv[1:]) == -1 else 0)
212 | 


--------------------------------------------------------------------------------
/datasets/script/rm_headline.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Remove the first line of every file in <dataset dir>/formatted_dataset/Annotations.
# Usage: ./rm_headline.sh <dataset dir>      e.g. ./rm_headline.sh ../coco-text
# NOTE(review): presumably the dropped line is the XML declaration written by
# MATLAB's xmlwrite - confirm. Running the script twice drops two lines.
if [ -z "$1" ]; then
  echo "Usage: $0 <dataset dir>" >&2
  exit 1
fi
for entry in "$1"/formatted_dataset/Annotations/*
do
   # With no match the glob stays a literal '*'; skip anything that is not a file.
   [ -f "$entry" ] || continue
   tail -n +2 "$entry" > "$entry.tmp" && mv "$entry.tmp" "$entry"
done
5 | 
6 | 


--------------------------------------------------------------------------------
/datasets/svt/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 | 


--------------------------------------------------------------------------------
/datasets/test/img_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jugg1024/Text-Detection-with-FRCN/e54749f111f26f712d0f4d55e99c675c3ad79312/datasets/test/img_1.jpg


--------------------------------------------------------------------------------
/datasets/test/img_5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jugg1024/Text-Detection-with-FRCN/e54749f111f26f712d0f4d55e99c675c3ad79312/datasets/test/img_5.jpg


--------------------------------------------------------------------------------
/datasets/test/img_6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jugg1024/Text-Detection-with-FRCN/e54749f111f26f712d0f4d55e99c675c3ad79312/datasets/test/img_6.jpg


--------------------------------------------------------------------------------
/models/deploy.prototxt:
--------------------------------------------------------------------------------
  1 | name: "VGG_ILSVRC_16_layers"
  2 | 
  3 | input: "data"
  4 | input_shape {
  5 |   dim: 1
  6 |   dim: 3
  7 |   dim: 224
  8 |   dim: 224
  9 | }
 10 | 
 11 | input: "im_info"
 12 | input_shape {
 13 |   dim: 1
 14 |   dim: 3
 15 | }
 16 | 
 17 | layer {
 18 |   name: "conv1_1"
 19 |   type: "Convolution"
 20 |   bottom: "data"
 21 |   top: "conv1_1"
 22 |   param {
 23 |     lr_mult: 0
 24 |     decay_mult: 0
 25 |   }
 26 |   param {
 27 |     lr_mult: 0
 28 |     decay_mult: 0
 29 |   }
 30 |   convolution_param {
 31 |     num_output: 64
 32 |     pad: 1
 33 |     kernel_size: 3
 34 |   }
 35 | }
 36 | layer {
 37 |   name: "relu1_1"
 38 |   type: "ReLU"
 39 |   bottom: "conv1_1"
 40 |   top: "conv1_1"
 41 | }
 42 | layer {
 43 |   name: "conv1_2"
 44 |   type: "Convolution"
 45 |   bottom: "conv1_1"
 46 |   top: "conv1_2"
 47 |   param {
 48 |     lr_mult: 0
 49 |     decay_mult: 0
 50 |   }
 51 |   param {
 52 |     lr_mult: 0
 53 |     decay_mult: 0
 54 |   }
 55 |   convolution_param {
 56 |     num_output: 64
 57 |     pad: 1
 58 |     kernel_size: 3
 59 |   }
 60 | }
 61 | layer {
 62 |   name: "relu1_2"
 63 |   type: "ReLU"
 64 |   bottom: "conv1_2"
 65 |   top: "conv1_2"
 66 | }
 67 | layer {
 68 |   name: "pool1"
 69 |   type: "Pooling"
 70 |   bottom: "conv1_2"
 71 |   top: "pool1"
 72 |   pooling_param {
 73 |     pool: MAX
 74 |     kernel_size: 2
 75 |     stride: 2
 76 |   }
 77 | }
 78 | layer {
 79 |   name: "conv2_1"
 80 |   type: "Convolution"
 81 |   bottom: "pool1"
 82 |   top: "conv2_1"
 83 |   param {
 84 |     lr_mult: 0
 85 |     decay_mult: 0
 86 |   }
 87 |   param {
 88 |     lr_mult: 0
 89 |     decay_mult: 0
 90 |   }
 91 |   convolution_param {
 92 |     num_output: 128
 93 |     pad: 1
 94 |     kernel_size: 3
 95 |   }
 96 | }
 97 | layer {
 98 |   name: "relu2_1"
 99 |   type: "ReLU"
100 |   bottom: "conv2_1"
101 |   top: "conv2_1"
102 | }
103 | layer {
104 |   name: "conv2_2"
105 |   type: "Convolution"
106 |   bottom: "conv2_1"
107 |   top: "conv2_2"
108 |   param {
109 |     lr_mult: 0
110 |     decay_mult: 0
111 |   }
112 |   param {
113 |     lr_mult: 0
114 |     decay_mult: 0
115 |   }
116 |   convolution_param {
117 |     num_output: 128
118 |     pad: 1
119 |     kernel_size: 3
120 |   }
121 | }
122 | layer {
123 |   name: "relu2_2"
124 |   type: "ReLU"
125 |   bottom: "conv2_2"
126 |   top: "conv2_2"
127 | }
128 | layer {
129 |   name: "pool2"
130 |   type: "Pooling"
131 |   bottom: "conv2_2"
132 |   top: "pool2"
133 |   pooling_param {
134 |     pool: MAX
135 |     kernel_size: 2
136 |     stride: 2
137 |   }
138 | }
139 | layer {
140 |   name: "conv3_1"
141 |   type: "Convolution"
142 |   bottom: "pool2"
143 |   top: "conv3_1"
144 |   param {  # 1st param spec (weights, per Caffe's Convolution blob order)
145 |     lr_mult: 1
146 |     decay_mult: 1
147 |   }
148 |   param {  # 2nd param spec (bias): 2x learning-rate multiplier, no weight decay
149 |     lr_mult: 2
150 |     decay_mult: 0
151 |   }
152 |   convolution_param {
153 |     num_output: 256  # 256 output channels for every conv3_x layer
154 |     pad: 1  # 3x3 kernel with 1-pixel padding
155 |     kernel_size: 3
156 |   }
157 | }
158 | layer {
159 |   name: "relu3_1"
160 |   type: "ReLU"  # in place: bottom and top are the same blob
161 |   bottom: "conv3_1"
162 |   top: "conv3_1"
163 | }
164 | layer {
165 |   name: "conv3_2"
166 |   type: "Convolution"
167 |   bottom: "conv3_1"
168 |   top: "conv3_2"
169 |   param {
170 |     lr_mult: 1
171 |     decay_mult: 1
172 |   }
173 |   param {
174 |     lr_mult: 2
175 |     decay_mult: 0
176 |   }
177 |   convolution_param {
178 |     num_output: 256
179 |     pad: 1
180 |     kernel_size: 3
181 |   }
182 | }
183 | layer {
184 |   name: "relu3_2"
185 |   type: "ReLU"  # in place on conv3_2
186 |   bottom: "conv3_2"
187 |   top: "conv3_2"
188 | }
189 | layer {
190 |   name: "conv3_3"
191 |   type: "Convolution"
192 |   bottom: "conv3_2"
193 |   top: "conv3_3"
194 |   param {
195 |     lr_mult: 1
196 |     decay_mult: 1
197 |   }
198 |   param {
199 |     lr_mult: 2
200 |     decay_mult: 0
201 |   }
202 |   convolution_param {
203 |     num_output: 256
204 |     pad: 1
205 |     kernel_size: 3
206 |   }
207 | }
208 | layer {
209 |   name: "relu3_3"
210 |   type: "ReLU"  # in place on conv3_3
211 |   bottom: "conv3_3"
212 |   top: "conv3_3"
213 | }
214 | layer {
215 |   name: "pool3"
216 |   type: "Pooling"
217 |   bottom: "conv3_3"
218 |   top: "pool3"
219 |   pooling_param {
220 |     pool: MAX  # 2x2 max pooling, stride 2
221 |     kernel_size: 2
222 |     stride: 2
223 |   }
224 | }
225 | layer {
226 |   name: "conv4_1"
227 |   type: "Convolution"
228 |   bottom: "pool3"
229 |   top: "conv4_1"
230 |   param {  # 1st param spec (weights, per Caffe's Convolution blob order)
231 |     lr_mult: 1
232 |     decay_mult: 1
233 |   }
234 |   param {  # 2nd param spec (bias): 2x learning-rate multiplier, no weight decay
235 |     lr_mult: 2
236 |     decay_mult: 0
237 |   }
238 |   convolution_param {
239 |     num_output: 512  # 512 output channels for every conv4_x layer
240 |     pad: 1  # 3x3 kernel with 1-pixel padding
241 |     kernel_size: 3
242 |   }
243 | }
244 | layer {
245 |   name: "relu4_1"
246 |   type: "ReLU"  # in place: bottom and top are the same blob
247 |   bottom: "conv4_1"
248 |   top: "conv4_1"
249 | }
250 | layer {
251 |   name: "conv4_2"
252 |   type: "Convolution"
253 |   bottom: "conv4_1"
254 |   top: "conv4_2"
255 |   param {
256 |     lr_mult: 1
257 |     decay_mult: 1
258 |   }
259 |   param {
260 |     lr_mult: 2
261 |     decay_mult: 0
262 |   }
263 |   convolution_param {
264 |     num_output: 512
265 |     pad: 1
266 |     kernel_size: 3
267 |   }
268 | }
269 | layer {
270 |   name: "relu4_2"
271 |   type: "ReLU"  # in place on conv4_2
272 |   bottom: "conv4_2"
273 |   top: "conv4_2"
274 | }
275 | layer {
276 |   name: "conv4_3"
277 |   type: "Convolution"
278 |   bottom: "conv4_2"
279 |   top: "conv4_3"
280 |   param {
281 |     lr_mult: 1
282 |     decay_mult: 1
283 |   }
284 |   param {
285 |     lr_mult: 2
286 |     decay_mult: 0
287 |   }
288 |   convolution_param {
289 |     num_output: 512
290 |     pad: 1
291 |     kernel_size: 3
292 |   }
293 | }
294 | layer {
295 |   name: "relu4_3"
296 |   type: "ReLU"  # in place on conv4_3
297 |   bottom: "conv4_3"
298 |   top: "conv4_3"
299 | }
300 | layer {
301 |   name: "pool4"
302 |   type: "Pooling"
303 |   bottom: "conv4_3"
304 |   top: "pool4"
305 |   pooling_param {
306 |     pool: MAX  # 2x2 max pooling, stride 2
307 |     kernel_size: 2
308 |     stride: 2
309 |   }
310 | }
311 | layer {
312 |   name: "conv5_1"
313 |   type: "Convolution"
314 |   bottom: "pool4"
315 |   top: "conv5_1"
316 |   param {  # 1st param spec (weights, per Caffe's Convolution blob order)
317 |     lr_mult: 1
318 |     decay_mult: 1
319 |   }
320 |   param {  # 2nd param spec (bias): 2x learning-rate multiplier, no weight decay
321 |     lr_mult: 2
322 |     decay_mult: 0
323 |   }
324 |   convolution_param {
325 |     num_output: 512  # 512 output channels for every conv5_x layer
326 |     pad: 1  # 3x3 kernel with 1-pixel padding
327 |     kernel_size: 3
328 |   }
329 | }
330 | layer {
331 |   name: "relu5_1"
332 |   type: "ReLU"  # in place: bottom and top are the same blob
333 |   bottom: "conv5_1"
334 |   top: "conv5_1"
335 | }
336 | layer {
337 |   name: "conv5_2"
338 |   type: "Convolution"
339 |   bottom: "conv5_1"
340 |   top: "conv5_2"
341 |   param {
342 |     lr_mult: 1
343 |     decay_mult: 1
344 |   }
345 |   param {
346 |     lr_mult: 2
347 |     decay_mult: 0
348 |   }
349 |   convolution_param {
350 |     num_output: 512
351 |     pad: 1
352 |     kernel_size: 3
353 |   }
354 | }
355 | layer {
356 |   name: "relu5_2"
357 |   type: "ReLU"  # in place on conv5_2
358 |   bottom: "conv5_2"
359 |   top: "conv5_2"
360 | }
361 | layer {
362 |   name: "conv5_3"
363 |   type: "Convolution"
364 |   bottom: "conv5_2"
365 |   top: "conv5_3"  # consumed by both rpn_conv/3x3 and roi_pool5 further down
366 |   param {
367 |     lr_mult: 1
368 |     decay_mult: 1
369 |   }
370 |   param {
371 |     lr_mult: 2
372 |     decay_mult: 0
373 |   }
374 |   convolution_param {
375 |     num_output: 512
376 |     pad: 1
377 |     kernel_size: 3
378 |   }
379 | }
380 | layer {
381 |   name: "relu5_3"
382 |   type: "ReLU"  # in place on conv5_3
383 |   bottom: "conv5_3"
384 |   top: "conv5_3"
385 | }
386 | 
387 | #========= RPN ============
388 | 
389 | layer {
390 |   name: "rpn_conv/3x3"
391 |   type: "Convolution"
392 |   bottom: "conv5_3"
393 |   top: "rpn/output"  # shared input of rpn_cls_score and rpn_bbox_pred
394 |   param { lr_mult: 1.0 decay_mult: 1.0 }
395 |   param { lr_mult: 2.0 decay_mult: 0 }
396 |   convolution_param {
397 |     num_output: 512
398 |     kernel_size: 3 pad: 1 stride: 1
399 |     weight_filler { type: "gaussian" std: 0.01 }  # fillers are initial values only
400 |     bias_filler { type: "constant" value: 0 }
401 |   }
402 | }
403 | layer {
404 |   name: "rpn_relu/3x3"
405 |   type: "ReLU"  # in place on rpn/output
406 |   bottom: "rpn/output"
407 |   top: "rpn/output"
408 | }
409 | 
410 | layer {
411 |   name: "rpn_cls_score"
412 |   type: "Convolution"
413 |   bottom: "rpn/output"
414 |   top: "rpn_cls_score"
415 |   param { lr_mult: 1.0 decay_mult: 1.0 }
416 |   param { lr_mult: 2.0 decay_mult: 0 }
417 |   convolution_param {
418 |     num_output: 18   # 2(bg/fg) * 9(anchors)
419 |     kernel_size: 1 pad: 0 stride: 1  # 1x1 conv: per-location scores
420 |     weight_filler { type: "gaussian" std: 0.01 }
421 |     bias_filler { type: "constant" value: 0 }
422 |   }
423 | }
424 | layer {
425 |   name: "rpn_bbox_pred"
426 |   type: "Convolution"
427 |   bottom: "rpn/output"
428 |   top: "rpn_bbox_pred"
429 |   param { lr_mult: 1.0 decay_mult: 1.0 }
430 |   param { lr_mult: 2.0 decay_mult: 0 }
431 |   convolution_param {
432 |     num_output: 36   # 4 * 9(anchors)
433 |     kernel_size: 1 pad: 0 stride: 1  # 1x1 conv: per-location box outputs
434 |     weight_filler { type: "gaussian" std: 0.01 }
435 |     bias_filler { type: "constant" value: 0 }
436 |   }
437 | }
438 | layer {
439 |    bottom: "rpn_cls_score"
440 |    top: "rpn_cls_score_reshape"
441 |    name: "rpn_cls_score_reshape"
442 |    type: "Reshape"
443 |    reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } }  # Caffe Reshape: 0 copies the bottom's dim, -1 is inferred; the 18 channels become 2 so rpn_cls_prob's Softmax sees bg/fg pairs
444 | }
445 | 
446 | #========= RoI Proposal ============
447 | 
448 | layer {
449 |   name: "rpn_cls_prob"
450 |   type: "Softmax"  # no softmax_param given, so Caffe's default axis applies
451 |   bottom: "rpn_cls_score_reshape"
452 |   top: "rpn_cls_prob"
453 | }
454 | layer {
455 |   name: 'rpn_cls_prob_reshape'
456 |   type: 'Reshape'
457 |   bottom: 'rpn_cls_prob'
458 |   top: 'rpn_cls_prob_reshape'
459 |   reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } }  # back to 18 channels, matching rpn_cls_score's num_output
460 | }
461 | layer {
462 |   name: 'proposal'
463 |   type: 'Python'  # implemented in Python, so Caffe must be built with Python layer support
464 |   bottom: 'rpn_cls_prob_reshape'
465 |   bottom: 'rpn_bbox_pred'
466 |   bottom: 'im_info'  # NOTE(review): not produced by any layer in this part of the file; presumably a net input declared earlier
467 |   top: 'rois'  # consumed by roi_pool5
468 |   python_param {
469 |     module: 'rpn.proposal_layer'  # must be importable on the Python path at load time
470 |     layer: 'ProposalLayer'
471 |     param_str: "'feat_stride': 16"  # handed to the layer as a string; 16 agrees with roi_pool5's spatial_scale of 1/16
472 |   }
473 | }
474 | 
475 | #========= RCNN ============
476 | 
477 | layer {
478 |   name: "roi_pool5"
479 |   type: "ROIPooling"
480 |   bottom: "conv5_3"
481 |   bottom: "rois"  # output of the 'proposal' layer
482 |   top: "pool5"
483 |   roi_pooling_param {
484 |     pooled_w: 7  # each RoI is pooled to a fixed 7x7 grid
485 |     pooled_h: 7
486 |     spatial_scale: 0.0625 # 1/16
487 |   }
488 | }
489 | layer {
490 |   name: "fc6"
491 |   type: "InnerProduct"
492 |   bottom: "pool5"
493 |   top: "fc6"
494 |   param {  # 1st param spec (weights, per Caffe's InnerProduct blob order)
495 |     lr_mult: 1
496 |     decay_mult: 1
497 |   }
498 |   param {  # 2nd param spec (bias): 2x learning-rate multiplier, no weight decay
499 |     lr_mult: 2
500 |     decay_mult: 0
501 |   }
502 |   inner_product_param {
503 |     num_output: 4096
504 |   }
505 | }
506 | layer {
507 |   name: "relu6"
508 |   type: "ReLU"  # in place on fc6
509 |   bottom: "fc6"
510 |   top: "fc6"
511 | }
512 | layer {
513 |   name: "drop6"
514 |   type: "Dropout"  # NOTE(review): expected to be a pass-through when the net runs in Caffe's TEST phase -- confirm the demo loads it that way
515 |   bottom: "fc6"
516 |   top: "fc6"
517 |   dropout_param {
518 |     dropout_ratio: 0.5
519 |   }
520 | }
521 | layer {
522 |   name: "fc7"
523 |   type: "InnerProduct"
524 |   bottom: "fc6"
525 |   top: "fc7"  # shared input of cls_score and bbox_pred
526 |   param {
527 |     lr_mult: 1
528 |     decay_mult: 1
529 |   }
530 |   param {
531 |     lr_mult: 2
532 |     decay_mult: 0
533 |   }
534 |   inner_product_param {
535 |     num_output: 4096
536 |   }
537 | }
538 | layer {
539 |   name: "relu7"
540 |   type: "ReLU"  # in place on fc7
541 |   bottom: "fc7"
542 |   top: "fc7"
543 | }
544 | layer {
545 |   name: "drop7"
546 |   type: "Dropout"  # same caveat as drop6
547 |   bottom: "fc7"
548 |   top: "fc7"
549 |   dropout_param {
550 |     dropout_ratio: 0.5
551 |   }
552 | }
553 | layer {
554 |   name: "cls_score"
555 |   type: "InnerProduct"
556 |   bottom: "fc7"
557 |   top: "cls_score"
558 |   param {
559 |     lr_mult: 1
560 |     decay_mult: 1
561 |   }
562 |   param {
563 |     lr_mult: 2
564 |     decay_mult: 0
565 |   }
566 |   inner_product_param {
567 |     num_output: 2  # two classes; presumably background and text -- confirm against the training setup
568 |     weight_filler {
569 |       type: "gaussian"
570 |       std: 0.01
571 |     }
572 |     bias_filler {
573 |       type: "constant"
574 |       value: 0
575 |     }
576 |   }
577 | }
578 | layer {
579 |   name: "bbox_pred"
580 |   type: "InnerProduct"
581 |   bottom: "fc7"
582 |   top: "bbox_pred"
583 |   param {
584 |     lr_mult: 1
585 |     decay_mult: 1
586 |   }
587 |   param {
588 |     lr_mult: 2
589 |     decay_mult: 0
590 |   }
591 |   inner_product_param {
592 |     num_output: 8  # presumably 4 box values for each of the 2 classes in cls_score; keep the two in sync
593 |     weight_filler {
594 |       type: "gaussian"
595 |       std: 0.001
596 |     }
597 |     bias_filler {
598 |       type: "constant"
599 |       value: 0
600 |     }
601 |   }
602 | }
603 | layer {
604 |   name: "cls_prob"
605 |   type: "Softmax"  # turns cls_score into per-class probabilities
606 |   bottom: "cls_score"
607 |   top: "cls_prob"
608 | }
609 | 


--------------------------------------------------------------------------------
/script/text_detect_demo.sh:
--------------------------------------------------------------------------------
1 | ./py-faster-rcnn/tools/text_detect_demo.py \
2 | 	--gpu 0 \
3 | 	--net models/deploy.prototxt \
4 | 	--model models/vgg16_faster_rcnn_fine_tune_on_coco.caffemodel \
5 | 	--dataset datasets/test  # all paths are relative, so run this from the repository root; the .py file is executed directly and must be executable
6 | 


--------------------------------------------------------------------------------