├── .gitignore
├── .gitmodules
├── .travis.yml
├── LICENSE
├── README.md
├── config_sample.py
├── find_bad.py
├── label_gen.py
├── label_image.py
├── paper.bib
├── paper.md
├── poster.pdf
├── predict.py
├── rate.py
├── render.py
├── requirements.txt
├── requirements_unfrozen.txt
├── screenshots
    ├── a_debug.png
    ├── a_label.png
    ├── debug.png
    ├── extracted.jpg
    ├── hep-th0401120-Figure-23-2x.png
    ├── hep-th0401120-Figure-23-label.png
    ├── hep-th0401120-Figure-23-prediction.png
    ├── hep-th0401120-Figure-23.png
    └── text-debug.png
├── testdata
    ├── figure.json
    ├── figure.png
    ├── figure2.json
    ├── figure2.png
    └── paper.pdf
└── testoutput
    ├── img
        ├── paper-Figure-0-2x.png
        ├── paper-Figure-0.png
        ├── paper-Figure-1-2x.png
        ├── paper-Figure-1.png
        ├── paper-Figure-2-2x.png
        ├── paper-Figure-2.png
        ├── paper-Figure-3-2x.png
        ├── paper-Figure-3.png
        ├── paper-Figure-4-2x.png
        ├── paper-Figure-4.png
        ├── paper-Figure-5-2x.png
        └── paper-Figure-5.png
    ├── json
        ├── paper-Figure-0.json
        ├── paper-Figure-1.json
        ├── paper-Figure-2.json
        ├── paper-Figure-3.json
        ├── paper-Figure-4.json
        └── paper-Figure-5.json
    └── text-masked
        ├── paper-Figure-0-dbg.png
        ├── paper-Figure-0-label.png
        ├── paper-Figure-1-dbg.png
        ├── paper-Figure-1-label.png
        ├── paper-Figure-2-dbg.png
        ├── paper-Figure-2-label.png
        ├── paper-Figure-3-dbg.png
        ├── paper-Figure-3-label.png
        ├── paper-Figure-4-dbg.png
        ├── paper-Figure-4-label.png
        ├── paper-Figure-5-dbg.png
        └── paper-Figure-5-label.png


/.gitignore:
--------------------------------------------------------------------------------
1 | config.py
2 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "pdffigures"]
2 | 	path = pdffigures
3 | 	url = https://github.com/allenai/pdffigures.git
4 | [submodule "darknet"]
5 | 	path = darknet
6 | 	url = https://github.com/domoritz/darknet.git
7 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | dist: trusty
 2 | sudo: required
 3 | language: python
 4 | python:
 5 |   - "2.7"
 6 | virtualenv:
 7 |   system_site_packages: true
 8 | install:
 9 |   - sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
10 |   - sudo apt update
11 |   - sudo apt install -y libpoppler-dev libleptonica-dev g++-4.9
12 | 
13 |   - make -C pdffigures DEBUG=0 CC='g++-4.9 -std=c++11'
14 | 
15 |   - sudo apt install -y python-opencv python-numpy python-scipy python-matplotlib python-dev ghostscript libmagickwand-dev libfreetype6
16 |   - pip install -r requirements.txt
17 | 
18 |   - cp config_sample.py config.py
19 | script:
20 |   - python label_gen.py read testdata/paper.pdf /tmp/test --dbg-image
21 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2017, Dominik Moritz
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Text detection in screen images with a Convolutional Neural Network [![theoj](http://joss.theoj.org/papers/d2821f933fc95337202393e84189f4d9/status.svg)](http://joss.theoj.org/papers/d2821f933fc95337202393e84189f4d9) [![Build Status](https://travis-ci.org/domoritz/label_generator.svg?branch=master)](https://travis-ci.org/domoritz/label_generator)
  2 | 
  3 | **Note: This was a class project where I wanted to learn about neural networks. If you want to do text detection in images, I suggest that you use something like [this approach](http://www.math.tau.ac.il/~turkel/imagepapers/text_detection.pdf).**
  4 | 
  5 | The repository contains a set of scripts to implement text detection from screen images. The idea is that we use a Convolutional Neural Network (CNN) to predict a heatmap of the probability of text in an image. But before we can predict anything, we need to train the network with a set of pairs of images and training labels. We obtain the training data by extracting figures with embedded text from research papers.
  6 | 
  7 | **This is a very involved process and you may want to use the labels that I already generated (you are welcome). We have around 500K good labels extracted from around 1M papers from arXiv and the ACL anthology.**
  8 | 
  9 | PDF files, extracted figures and labels are in an S3 bucket at `s3://escience.washington.edu.viziometrics`. The PDF files for arXiv (extracted from [arXiv bulk access](http://arxiv.org/help/bulk_data_s3)) are in a separate bucket at `s3://arxiv-tars-pdfs`. The buckets have [requester pays](https://docs.aws.amazon.com/en_us/console/s3/requesterpaysbucket) enabled.
 10 | 
 11 | Please cite [the paper for this repo](https://www.theoj.org/joss-papers/joss.00235/10.21105.joss.00235.pdf) as
 12 | 
 13 | ```bib
 14 | @article{Moritz2017,
 15 |   doi = {10.21105/joss.00235},
 16 |   url = {https://doi.org/10.21105/joss.00235},
 17 |   year = {2017},
 18 |   month = jul,
 19 |   publisher = {The Open Journal},
 20 |   volume = {2},
 21 |   number = {15},
 22 |   pages = {235},
 23 |   author = {Dominik Moritz},
 24 |   title = {Text detection in screen images with a Convolutional Neural Network},
 25 |   journal = {The Journal of Open Source Software}
 26 | }
 27 | ```
 28 | 
 29 | ## Requirements
 30 | 
 31 | Install OpenCV with python support. Also install scipy, matplotlib, and numpy for python (either through pip or apt). Also install freetype, ghostscript, ImageMagick, and tesseract. Please check the compatible versions of [pdffigures](https://github.com/allenai/pdffigures) with your OS.
 32 | 
 33 | ## Generate training data
 34 | 
 35 | You can run this locally or on a server. I tested every script locally on a mac without any problems. Below are instructions for Linux.
 36 | 
 37 | The scripts use [pdffigures](http://pdffigures.allenai.org/) to generate a JSON file that describes each figure in a paper.
 38 | 
 39 | ### AWS instructions
 40 | 
 41 | These are the steps I had to run to generate the training data on EC2 machines on AWS. The execution is embarrassingly parallel and thus runs reasonably fast (a few hours to a day or two for a million papers). At the time of writing, I ran this on Ubuntu 14.04, but later versions may work as well with some small modifications.
 42 | 
 43 | The commands below are what I used to extract the images and generate the labels. As described above, you don't need to rerun this unless you want to use different papers than the ones I already extracted figures from (see above). If you want to run the code, you need to change the output S3 bucket to a bucket that you have write access to.
 44 | 
 45 | ```sh
 46 | # use tmux (maybe with attach)
 47 | tmux
 48 | 
 49 | sudo apt-get update
 50 | sudo apt-get install git python-pip python-opencv python-numpy python-scipy python-matplotlib ghostscript libmagickwand-dev libfreetype6 parallel
 51 | 
 52 | git clone https://github.com/domoritz/label_generator.git
 53 | cd label_generator
 54 | sudo pip install -r requirements.txt
 55 | git submodule init
 56 | git submodule update
 57 | 
 58 | sudo apt-get install libpoppler-dev libleptonica-dev pkg-config
 59 | 
 60 | # we need gcc 4.9
 61 | sudo add-apt-repository ppa:ubuntu-toolchain-r/test
 62 | sudo apt-get update
 63 | sudo apt-get install g++-4.9
 64 | 
 65 | # compile pdffigures
 66 | make -C pdffigures DEBUG=0 CC='g++-4.9 -std=c++11'
 67 | 
 68 | # at this point, you probably need to make a copy of the config file and update it
 69 | cp config_sample.py config.py
 70 | vim config.py
 71 | 
 72 | # test with one file
 73 | python label_gen.py read-s3 escience.washington.edu.viziometrics acl_anthology/pdf/C08-1099.pdf escience.washington.edu.viziometrics acl_anthology
 74 | 
 75 | # get list of documents to process
 76 | aws s3 --region=us-west-2 ls s3://escience.washington.edu.viziometrics/acl_anthology/pdf/ | awk '{ print $4 }' > acl_papers.txt
 77 | 
 78 | # now run for real
 79 | parallel --resume -j +6 --no-run-if-empty --eta --joblog /tmp/par.log python label_gen.py read-s3 escience.washington.edu.viziometrics acl_anthology/pdf/{} escience.washington.edu.viziometrics acl_anthology --dbg-image :::: acl_papers.txt
 80 | 
 81 | # monitor progress
 82 | tail -f /tmp/par.log
 83 | 
 84 | # find bad labels
 85 | python find_bad.py read-s3 escience.washington.edu.viziometrics acl_anthology/json > anthology_bad.txt
 86 | # you probably want to use this file to delete bad labels before you use it to train the CNN
 87 | # Use: parallel rm -f data/{}-label.png :::: anthology_bad.txt
 88 | 
 89 | # run find bad in parallel
 90 | seq {0,19} | parallel -j 20 --eta python find_bad.py read-s3 escience.washington.edu.viziometrics arxiv/json --chunk={} --of=20 '>' arxiv_bad_{}.txt
 91 | cat arxiv_bad_*.txt > arxiv_bad.txt
 92 | 
 93 | # at this point you may want to upload the file with bad labels back to S3
 94 | ```
 95 | 
 96 | ### FAQ for common error messages
 97 | 
 98 | These are some common errors I have experienced.
 99 | 
100 | **I don't see my output** Try `--debug` and make sure that you have the correct folders set up if you use S3.
101 | 
102 | **Failed to initialize libdc1394** `sudo ln /dev/null /dev/raw1394` https://stackoverflow.com/questions/12689304/ctypes-error-libdc1394-error-failed-to-initialize-libdc1394
103 | 
104 | **ImportError: MagickWand shared library not found.** See https://github.com/dahlia/wand/issues/141
105 | 
106 | ### Try the figure extraction
107 | 
108 | #### Local
109 | 
110 | `python label_gen.py read testdata/paper.pdf /tmp/test --dbg-image --debug`
111 | 
112 | #### With data from S3
113 | 
114 | `python label_gen.py read-s3 escience.washington.edu.viziometrics test/pdf/C08-1092.pdf escience.washington.edu.viziometrics test/ --dbg-image --debug`
115 | 
116 | 
117 | ## Train the neural network
118 | 
119 | I used a different machine for training the network because AWS doesn't have good graphics cards.
120 | 
121 | You can use any CNN to get the prediction but I use [pjreddie/darknet](https://github.com/pjreddie/darknet). My fork is at [domoritz/darknet](https://github.com/domoritz/darknet) and a submodule of this repo.
122 | 
123 | To train the network, you need to put all figures and labels into one directory. Then generate a file called `train.list` in `/data`. You can generate this file with `ls . | grep -v -- "-label.png" | awk '{print "PATH_TO_FILES/"$1}' > ../all.list` in the directory with all the images. Then split the file into training and test data.
124 | 
125 | Then train the network with `./darknet writing train cfg/writing.cfg`. This will generate a weight file every now and then. If for some reason some files are missing labels, use a python script like this to filter out files that don't have labels.
126 | 
127 | ```python
128 | import sys
129 | import os.path
130 | 
131 | with open(sys.argv[1]) as f:
132 |         for fname in f:
133 |                 fname = fname.strip()
134 |                 if not os.path.isfile(fname):
135 |                         print fname
136 |                 lname = fname[:-4] + "-label.png"
137 |                 if not os.path.isfile(lname):
138 |                         print fname
139 | ```
140 | 
141 | ## Predict where text is and find text areas
142 | 
143 | You need a trained network. To test the network, run `echo "PATH_TO_FILES/FIGURE.png" | ./darknet writing test cfg/writing.cfg ../writing_backup/writing_ITER.weights`. If you append `out`, a prediction will be written to `out.png`.
144 | 
145 | A prediction looks like this
146 | 
147 | ![Red boxes around extracted text](https://raw.githubusercontent.com/domoritz/label_generator/master/screenshots/hep-th0401120-Figure-23-prediction.png)
148 | 
149 | If you want to test the network on all your test data, use a script like
150 | 
151 | ```bash
152 | for i in `cat $1` ; do
153 |     fname=`basename $i .png`
154 |     echo $i | ./darknet writing test cfg/writing.cfg ../writing_backup/writing_8500.weights PATH_FOR_PREDICTIONS/$fname-predicted
155 | done
156 | ```
157 | 
158 | and run it with your list of training data as the input. This will write all the predictions into a directory. If you feel like moving all your other files (the ground truth, images and such), use a command like `cat test.list | xargs cp -t PATH_FOR_PREDICTIONS`.
159 | 
160 | Cool, now we have a bunch of images in one directory. Let's find out what the precision and recall are. First, create a list of all the files in the directory with `ls | grep -- "-predicted.png" > _all.list`. Then just run `python rate.py ../predicted/predicted/_all.list`.
161 | 
162 | After all this work, we can finally generate a prediction, find contours, fit boxes around contours and find text with tesseract. To do so, run `python predict.py PREDICTION FIGURE_IMAGE --debug`. You may see something like
163 | 
164 | ![Red boxes around extracted text](https://raw.githubusercontent.com/domoritz/label_generator/master/screenshots/text-debug.png)
165 | 
166 | ## Support
167 | 
168 | Please ask questions and file issues [on GitHub](https://github.com/domoritz/label_generator/issues/new).
169 | 
170 | ## Contribute
171 | 
172 | Contributions are welcome. Development happens on GitHub at [domoritz/label_generator](https://github.com/domoritz/label_generator). When sending a pull request, please compare the output of `python label_gen.py read testdata/paper.pdf /tmp/test` with the images in [`testoutput`](https://github.com/domoritz/label_generator/tree/master/testoutput).
173 | 


--------------------------------------------------------------------------------
/config_sample.py:
--------------------------------------------------------------------------------
1 | # credentials
2 | 
3 | # s3 keys
4 | access_key = None
5 | secret_key = None
6 | 


--------------------------------------------------------------------------------
/find_bad.py:
--------------------------------------------------------------------------------
  1 | """Find bad figures
  2 | 
  3 | Reads all JSON files in an S3 bucket and generates a list of files that are
  4 | probably not good labels. A bad label is a label with very few or very many
  5 | text boxes. This could happen if an image is used as a figure with text that
  6 | is not embedded in the PDF. To avoid false negatives, we should exclude
  7 | those even if they have a little bit of text in  them from for example a
  8 | caption or some other text in the document.
  9 | 
 10 | Criteria for exclusion:
 11 |  - no or very few labels
 12 |  - only text close to the border
 13 |  - almost all of the picture is white
 14 | 
 15 | Usage:
 16 |   find_bad.py read-s3 S3-BUCKET S3-PATH [--chunk=CHUNK] [--of=OF] [--debug]
 17 |   find_bad.py read PATH [--debug]
 18 |   find_bad.py check FILE [--debug]
 19 |   find_bad.py (-h | --help)
 20 |   find_bad.py --version
 21 | 
 22 | Options:
 23 |   --chunk=CHUNK   Which part [default: 0]
 24 |   --of=OF         Of how many [default: 1]
 25 |   --debug         Write debug output.
 26 |   -h --help       Show this screen.
 27 |   --version       Show version.
 28 | """
 29 | 
 30 | import os
 31 | import re
 32 | import logging
 33 | import json
 34 | import sys
 35 | import time
 36 | 
 37 | from docopt import docopt
 38 | from boto.s3.connection import S3Connection
 39 | 
 40 | import config
 41 | 
 42 | PATTERN = re.compile('(.*-Figure-[0-9]+).*\.json')
 43 | 
 44 | 
 45 | # true if a is in b
 46 | def contains(a, b):
 47 |     return (a[0] > b[0] and a[2] < b[2] and
 48 |             a[1] > b[1] and a[3] < b[3])
 49 | 
 50 | 
 51 | def area(a):
 52 |     return (a[2] - a[0]) * (a[3] - a[1])
 53 | 
 54 | 
 55 | def all_in_border(bounds, texts):
 56 |     """ Returns true if all text is either in upper or lower border area. """
 57 |     x0, y0, x1, y1 = bounds
 58 |     h = y1 - y0
 59 | 
 60 |     not_top_border = [x0, y0 + h*0.05,
 61 |                       x1, y1]
 62 |     not_bottom_border = [x0, y0,
 63 |                          x1, y1 - h*0.05]
 64 | 
 65 |     top = False
 66 |     bottom = False
 67 | 
 68 |     for text in texts:
 69 |         if contains(text['TextBB'], not_top_border):
 70 |             top = True
 71 | 
 72 |         if contains(text['TextBB'], not_bottom_border):
 73 |             bottom = True
 74 | 
 75 |         if top and bottom:
 76 |             return False
 77 | 
 78 |     return True
 79 | 
 80 | 
 81 | def is_sum_larger(max_area, texts):
 82 |     sum_so_far = 0
 83 |     for text in texts:
 84 |         sum_so_far += area(text['TextBB'])
 85 |         if sum_so_far > max_area:
 86 |             return True
 87 |     return False
 88 | 
 89 | 
 90 | def check(json_data):
 91 |     data = json.loads(json_data)
 92 |     texts = data['ImageText']
 93 | 
 94 |     # no text at all
 95 |     if len(texts) == 0:
 96 |         logging.debug("No text")
 97 |         return True
 98 | 
 99 |     # very little text
100 |     if len(texts) == 1:
101 |         logging.debug("One text label")
102 |         return True
103 | 
104 |     # all the text is within the border area (probably an artifact)
105 |     if all_in_border(data['ImageBB'], texts):
106 |         logging.debug("All text is in upper or lower border")
107 |         return True
108 | 
109 |     # almost the whole image is text
110 |     # use crude implementation where we just sum up the text area
111 |     # and kick something out if >50% is text
112 |     a = area(data['ImageBB'])
113 |     if is_sum_larger(a*0.5, texts):
114 |         logging.debug("Almost everything is text")
115 |         return True
116 | 
117 |     return False
118 | 
119 | 
def run_s3(bucket_name, path, chunk, of):
    """Check the JSON files under path in an S3 bucket and print bad figures.

    Only keys whose position in the listing satisfies index % of == chunk are
    examined, so several processes can share one listing. For every JSON key
    that check() rejects, the "<paper>-Figure-<n>" part of its name is printed
    to stdout. Keys in this chunk that are not JSON are logged as errors.
    """
    # NOTE(review): is_secure=False is passed to boto here; presumably needed
    # because of the dotted bucket names -- confirm before changing.
    connection = S3Connection(config.access_key, config.secret_key, is_secure=False)
    bucket = connection.get_bucket(bucket_name)

    print >> sys.stderr, "Run {} of {}".format(chunk, of)

    started = time.time()

    for index, key in enumerate(bucket.list(path)):
        # progress message for every 1000th key, whichever chunk it is in
        if index % 1000 == 0:
            elapsed = time.time() - started
            logging.info("Processing number {} after {} seconds".format(index, elapsed))

        if index % of != chunk:
            continue

        # ignore the directory itself
        if key.name.strip('/') == path.strip('/'):
            continue

        if os.path.splitext(key.name)[1] != '.json':
            logging.error("Not a json file {}".format(key.name))
            continue

        if not check(key.get_contents_as_string()):
            continue

        match = PATTERN.search(os.path.basename(key.name))
        if match:
            print(match.group(1))
144 | 
145 | 
def run_local(path):
    """Check every *.json file directly inside path and print bad figures.

    For each regular file with a .json extension that check() rejects, the
    "<paper>-Figure-<n>" part of the file name is printed to stdout.
    Subdirectories and other files are skipped silently.
    """
    for name in os.listdir(path):
        candidate = os.path.join(path, name)

        if not os.path.isfile(candidate):
            continue
        if os.path.splitext(name)[1] != '.json':
            continue

        with open(candidate) as handle:
            if not check(handle.read()):
                continue
            match = PATTERN.search(name)
            if match:
                print(match.group(1))
157 | 
158 | 
if __name__ == '__main__':
    # Command line entry point; the module docstring is the docopt usage text.
    args = docopt(__doc__, version='Rater 1.0')

    if args['--debug']:
        logging.basicConfig(level=logging.DEBUG)

    if args['read-s3']:
        chunk_index = int(args['--chunk'])
        chunk_count = int(args['--of'])
        run_s3(args['S3-BUCKET'], args['S3-PATH'], chunk_index, chunk_count)
    elif args['read']:
        run_local(args['PATH'])
    elif args['check']:
        # single file mode: report the verdict in words
        with open(args['FILE']) as f:
            verdict = "Bad label" if check(f.read()) else "Good label"
            print(verdict)
173 | 


--------------------------------------------------------------------------------
/label_gen.py:
--------------------------------------------------------------------------------
  1 | """Figure extractor and label generator
  2 | 
  3 | Read a single PDF file and write the extracted data and labels
  4 | to a directory with the following structure:
  5 | 
  6 |  /json
  7 |    - filename_figno.json
  8 |  /img
  9 |    - filename_figno.png
 10 |    - filename_figno_2x.png (200 DPI)
 11 |    - filename_figno_3x.png (300 DPI)
 12 |    - filename_figno_4x.png (400 DPI)
 13 |  /text-masked
 14 |   - filename_figno_box.png
 15 |   - filename_figno_mask.png
 16 | 
 17 | Usage:
 18 |   label_gen.py read-s3 S3-IN-BUCKET S3-FILE S3-OUT-BUCKET S3-PATH [--use-ramdisk] [--debug] [--dbg-image]
 19 |   label_gen.py read FILE PATH [--debug] [--dbg-image]
 20 |   label_gen.py (-h | --help)
 21 |   label_gen.py --version
 22 | 
 23 | Options:
 24 |   --use-ramdisk   Store temporary files in /tmp/ram/.
 25 |   --debug         Write debug output.
 26 |   --dbg-image     Create a debug label.
 27 |   -h --help       Show this screen.
 28 |   --version       Show version.
 29 | """
 30 | 
 31 | import tempfile
 32 | import shutil
 33 | import subprocess
 34 | import os
 35 | import json
 36 | import logging
 37 | 
 38 | from docopt import docopt
 39 | from boto.s3.connection import S3Connection
 40 | from boto.s3.key import Key
 41 | 
 42 | import config
 43 | import render
 44 | import label_image
 45 | 
 46 | 
 47 | DEBUG = False
 48 | 
 49 | 
 50 | def create_dir(directory):
 51 |     if not os.path.exists(directory):
 52 |         os.makedirs(directory)
 53 | 
 54 | 
 55 | def run_local(pdf_file, path, debug_image, flat):
 56 |     filepath = os.path.abspath(pdf_file)
 57 |     outpath = os.path.abspath(path)
 58 | 
 59 |     ident = os.path.splitext(os.path.basename(pdf_file))[0]
 60 | 
 61 |     if flat:
 62 |         # cheaper because we don't need separate directories
 63 |         json_path = outpath
 64 |         img_path = outpath
 65 |         label_path = outpath
 66 |     else:
 67 |         json_path = os.path.join(outpath, 'json')
 68 |         img_path = os.path.join(outpath, 'img')
 69 |         label_path = os.path.join(outpath, 'text-masked')
 70 | 
 71 |         # create directories, if needed
 72 |         create_dir(json_path)
 73 |         create_dir(img_path)
 74 |         create_dir(label_path)
 75 | 
 76 |     outident_json = os.path.join(json_path, ident)
 77 | 
 78 |     # generate the json for figures
 79 |     logging.debug('Run pdffigures {}'.format(filepath))
 80 |     DEVNULL = open(os.devnull, 'w')
 81 |     subprocess.call(['pdffigures/pdffigures', '-j',
 82 |                     outident_json, filepath], stdout=DEVNULL, stderr=DEVNULL)
 83 | 
 84 |     json_files = []
 85 |     img_files = []
 86 |     label_files = []
 87 | 
 88 |     logging.debug("Finished. Now look for the JSON and generate labels.")
 89 | 
 90 |     # pdffigures now generates only a singe JSON file, we need one file per figure
 91 |     # https://github.com/allenai/pdffigures/commit/8ffcaceab3fdc97ec489c58e87191b7e12c0134a
 92 | 
 93 |     json_file = '{}.json'.format(outident_json)
 94 | 
 95 |     if os.path.isfile(json_file):
 96 |         with open(json_file) as fh:
 97 |             figures = json.load(fh)
 98 | 
 99 | 
100 |             logging.debug('Found {} figures'.format(len(figures)))
101 | 
102 |             for index, figure in enumerate(figures):
103 |                 chart_json = '{}-Figure-{}.json'.format(outident_json, index)
104 |                 json_files.append(chart_json)
105 | 
106 |                 with open(chart_json, 'w') as jfh:
107 |                     json.dump(figure, jfh)
108 | 
109 |                 def image_path(factor):
110 |                     ext = '' if factor == 1 else '-{}x'.format(factor)
111 |                     name = '{}-Figure-{}{}.png'.format(ident, index, ext)
112 |                     return os.path.join(img_path, name)
113 | 
114 |                 # render image with different resolutions
115 |                 for factor in [1, 2]:
116 |                     image_file = image_path(factor)
117 |                     logging.debug('Render image {} from {}'.format(
118 |                         image_file, filepath))
119 | 
120 |                     render.render_chart(filepath, figure['Page']-1,
121 |                                         figure['ImageBB'],
122 |                                         int(factor*100), image_file)
123 |                     img_files.append(image_file)
124 | 
125 |                 # labeled image
126 |                 output = os.path.join(
127 |                     label_path, '{}-Figure-{}-label.png'.format(
128 |                         ident, index, factor))
129 |                 dbg_output = None
130 |                 if debug_image:
131 |                     dbg_output = os.path.join(
132 |                         label_path, '{}-Figure-{}-dbg.png'.format(
133 |                             ident, index, factor))
134 | 
135 |                 logging.debug('generate label {}'.format(output))
136 |                 if label_image.gen_labeled_image(
137 |                         figure, image_path(1), output, dbg_output, DEBUG):
138 |                     # yes, a labeled file was generated
139 |                     label_files.append(output)
140 |                     if dbg_output:
141 |                         label_files.append(dbg_output)
142 | 
143 |         # remove the one json file with data for all figures
144 |         os.remove(json_file)
145 | 
146 |     return json_files, img_files, label_files
147 | 
148 | 
def run_s3(in_bucket_name, filename, out_bucket_name, path, ramtemp, debug_image):
    """Process one PDF stored in S3 and upload the results.

    The PDF is downloaded into a fresh temporary directory (below /tmp/ram/
    when ramtemp is true), run through run_local() in flat mode, and the
    produced files are uploaded to out_bucket_name under path/json, path/img
    and path/text-masked. The temporary directory is always removed.
    """
    connection = S3Connection(config.access_key, config.secret_key, is_secure=False)
    in_bucket = connection.get_bucket(in_bucket_name)
    out_bucket = connection.get_bucket(out_bucket_name)

    workdir = tempfile.mkdtemp(dir='/tmp/ram/' if ramtemp else None)
    logging.debug('Temp directory in {}'.format(workdir))

    try:
        # copy into temp
        source = Key(in_bucket, filename)
        local_pdf = os.path.join(workdir, os.path.basename(filename))
        source.get_contents_to_filename(local_pdf)

        # run algos
        json_files, img_files, label_files = run_local(
            local_pdf, workdir, debug_image, True)

        # write files back to s3, one remote folder per kind of output
        uploads = (('json', json_files),
                   ('img', img_files),
                   ('text-masked', label_files))
        for folder, produced in uploads:
            for local_file in produced:
                remote = Key(out_bucket, os.path.join(
                    path, folder, os.path.basename(local_file)))
                remote.set_contents_from_filename(local_file)
    finally:
        shutil.rmtree(workdir)
179 | 
180 | 
if __name__ == '__main__':
    # Command line entry point; the module docstring is the docopt usage text.
    args = docopt(__doc__, version='Extractor 1.0')

    if args['--debug']:
        DEBUG = True
        logging.basicConfig(level=logging.DEBUG)
        # keep boto quiet even in debug mode
        logging.getLogger("boto").setLevel(logging.WARNING)

    want_dbg_image = args['--dbg-image']

    if args['read-s3']:
        run_s3(args['S3-IN-BUCKET'], args['S3-FILE'],
               args['S3-OUT-BUCKET'], args['S3-PATH'],
               args['--use-ramdisk'], want_dbg_image)
    elif args['read']:
        run_local(args['FILE'], args['PATH'], want_dbg_image, False)
196 | 


--------------------------------------------------------------------------------
/label_image.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import math
 3 | import sys
 4 | import logging
 5 | 
 6 | import numpy as np
 7 | import cv2
 8 | 
 9 | WHITE = (255, 255, 255)
10 | RED = (0, 0, 255)
11 | factor = 1
12 | 
13 | 
def gen_labeled_image(description, image, target, dbg_output=None, debug=False):
    """Write a text mask for one figure and optionally a debug overlay.

    The mask is a single-channel image of the figure's size that is 255 inside
    every (slightly dilated) text box and 0 elsewhere.

    Args:
        description: figure dict with 'ImageBB' (the figure box) and
            'ImageText' (list of dicts that each carry a 'TextBB' box).
        image: path of the rendered figure; only read for the debug overlay.
        target: path the mask is written to.
        dbg_output: optional path for an overlay of the mask on the figure.
        debug: if true and dbg_output is set, show the overlay in a window
            and wait for a key press.

    Returns:
        False if the figure has no text boxes (nothing is written),
        True otherwise.

    Raises:
        IOError: if dbg_output is set but image cannot be read.
    """
    texts = description['ImageText']

    # checked first so that no other work is done for figures we skip
    if len(texts) == 0:
        logging.debug("""No text boxes in chart. Since this could mean that the image
            does not have embedded text, we are ignoring it.""")
        return False

    bounds = np.array(description['ImageBB'])

    bounds *= factor

    x0 = bounds[0]
    y0 = bounds[1]
    # np.zeros needs plain integer dimensions; no-op for integer boxes
    w = int(math.ceil(bounds[2] - x0))
    h = int(math.ceil(bounds[3] - y0))

    label = np.zeros((h, w), np.uint8)

    for text_box in texts:
        tb = np.array(text_box['TextBB'])
        tb *= factor

        # text box relative to the figure origin, rounded outwards
        tx0 = int(math.floor(tb[0] - x0))
        ty0 = int(math.floor(tb[1] - y0))
        tx1 = int(math.ceil(tb[2] - x0))
        ty1 = int(math.ceil(tb[3] - y0))

        # label a box around the text; a negative thickness fills the
        # rectangle and, unlike cv2.cv.CV_FILLED, works in every OpenCV version
        cv2.rectangle(label, (tx0, ty0), (tx1, ty1), WHITE, -1)

    # dilate the label slightly
    kernel = np.ones((2, 2), np.uint8)
    label = cv2.dilate(label, kernel, iterations=2)

    cv2.imwrite(target, label)

    if dbg_output:
        # flag 0 loads the image as grayscale (numeric value is stable across
        # OpenCV versions, unlike cv2.CV_LOAD_IMAGE_GRAYSCALE)
        chart = cv2.imread(image, 0)
        if chart is None:
            raise IOError('Could not read image {}'.format(image))

        # convert both to three channels
        label = cv2.cvtColor(label, cv2.COLOR_GRAY2RGB)
        chart = cv2.cvtColor(chart, cv2.COLOR_GRAY2RGB)

        # zero channel 2 of the mask so that subtracting the mask keeps only
        # that channel of the figure inside text boxes (imwrite/imshow treat
        # channel 2 as red, hence the red tint)
        label[:, :, 2] = 0

        cv2.subtract(chart, label, dst=label)
        chart = cv2.addWeighted(chart, 0.65, label, 0.35, 0)
        cv2.imwrite(dbg_output, chart)

        if debug:
            cv2.imshow('dbg_label', chart)
            cv2.waitKey(0)
            cv2.destroyAllWindows()

    return True
78 | 
79 | 
80 | if __name__ == '__main__':
81 |     with open(sys.argv[1] + '.json') as data_file:
82 |         data = json.load(data_file)
83 |         gen_labeled_image(data, sys.argv[1] + '.png', 'label.png', 'debug.png')
84 | 


--------------------------------------------------------------------------------
/paper.bib:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | @article{tange2011gnu,
  4 | 	Author = {Tange, Ole and others},
  5 | 	Date-Added = {2015-06-09 05:05:54 +0000},
  6 | 	Date-Modified = {2015-06-09 05:05:54 +0000},
  7 | 	Journal = {The USENIX Magazine},
  8 | 	Number = {1},
  9 | 	Pages = {42--47},
 10 | 	Title = {{GNU} Parallel: The Command-Line Power Tool},
 11 | 	Volume = {36},
 12 | 	Year = {2011}}
 13 | 
 14 | @article{dantas2007revisiting,
 15 | 	Author = {Dantas, WG and de Oliveira, MJ and Stilck, JF},
 16 | 	Date-Added = {2015-06-09 04:54:35 +0000},
 17 | 	Date-Modified = {2015-06-09 04:54:35 +0000},
 18 | 	Journal = {Journal of Statistical Mechanics: Theory and Experiment},
 19 | 	Number = {08},
 20 | 	Pages = {P08009},
 21 | 	Publisher = {IOP Publishing},
 22 | 	Title = {Revisiting the one-dimensional diffusive contact process},
 23 | 	Volume = {2007},
 24 | 	Year = {2007}}
 25 | 
 26 | @inproceedings{epshtein2010detecting,
 27 | 	Author = {Epshtein, Boris and Ofek, Eyal and Wexler, Yonatan},
 28 | 	Booktitle = {Computer Vision and Pattern Recognition (CVPR), 2010 IEEE Conference on},
 29 | 	Date-Added = {2015-06-09 04:24:18 +0000},
 30 | 	Date-Modified = {2015-06-09 04:24:18 +0000},
 31 | 	Organization = {IEEE},
 32 | 	Pages = {2963--2970},
 33 | 	Title = {Detecting text in natural scenes with stroke width transform},
 34 | 	Year = {2010}}
 35 | 
 36 | @inproceedings{chen2011robust,
 37 | 	Author = {Chen, Huizhong and Tsai, Sam S and Schroth, Georg and Chen, David M and Grzeszczuk, Radek and Girod, Bernd},
 38 | 	Booktitle = {Image Processing (ICIP), 2011 18th IEEE International Conference on},
 39 | 	Date-Added = {2015-06-09 04:18:38 +0000},
 40 | 	Date-Modified = {2015-06-09 04:18:38 +0000},
 41 | 	Organization = {IEEE},
 42 | 	Pages = {2609--2612},
 43 | 	Title = {Robust text detection in natural images with edge-enhanced maximally stable extremal regions},
 44 | 	Year = {2011}}
 45 | 
 46 | @article{azuma2004matrix,
 47 | 	Author = {Azuma, Takehiro},
 48 | 	Date-Added = {2015-06-09 03:47:46 +0000},
 49 | 	Date-Modified = {2015-06-09 03:47:46 +0000},
 50 | 	Journal = {arXiv preprint hep-th/0401120},
 51 | 	Title = {Matrix models and the gravitational interaction},
 52 | 	Year = {2004}}
 53 | 
 54 | @inproceedings{schulz2011object,
 55 | 	Author = {Schulz, Hannes and Behnke, Sven},
 56 | 	Booktitle = {Proceedings of the DAGM Workshop on New Challenges in Neural Computation},
 57 | 	Date-Added = {2015-06-08 18:02:05 +0000},
 58 | 	Date-Modified = {2015-06-08 18:02:05 +0000},
 59 | 	Pages = {58--61},
 60 | 	Title = {Object-class segmentation using deep convolutional neural networks},
 61 | 	Year = {2011}}
 62 | 
 63 | @inproceedings{szegedy2013deep,
 64 | 	Author = {Szegedy, Christian and Toshev, Alexander and Erhan, Dumitru},
 65 | 	Booktitle = {Advances in Neural Information Processing Systems},
 66 | 	Date-Added = {2015-06-08 17:56:26 +0000},
 67 | 	Date-Modified = {2015-06-08 17:56:26 +0000},
 68 | 	Pages = {2553--2561},
 69 | 	Title = {Deep neural networks for object detection},
 70 | 	Year = {2013}}
 71 | 
 72 | @article{goodfellow2013multi,
 73 | 	Author = {Goodfellow, Ian J and Bulatov, Yaroslav and Ibarz, Julian and Arnoud, Sacha and Shet, Vinay},
 74 | 	Date-Added = {2015-06-08 17:51:08 +0000},
 75 | 	Date-Modified = {2015-06-08 17:51:08 +0000},
 76 | 	Journal = {arXiv preprint arXiv:1312.6082},
 77 | 	Title = {Multi-digit number recognition from street view imagery using deep convolutional neural networks},
 78 | 	Year = {2013}}
 79 | 
 80 | @article{lindeberg2012scale,
 81 | 	Author = {Lindeberg, Tony},
 82 | 	Date-Added = {2015-06-07 21:09:10 +0000},
 83 | 	Date-Modified = {2015-06-07 21:09:10 +0000},
 84 | 	Journal = {Scholarpedia},
 85 | 	Number = {5},
 86 | 	Pages = {10491},
 87 | 	Title = {Scale invariant feature transform},
 88 | 	Volume = {7},
 89 | 	Year = {2012}}
 90 | 
 91 | @inproceedings{le1990handwritten,
 92 | 	Author = {LeCun, Yann and Boser, Bernhard and Denker, John S and Henderson, Donnie and Howard, Richard E and Hubbard, Wayne and Jackel, Lawrence D},
 93 | 	Booktitle = {Advances in neural information processing systems},
 94 | 	Date-Added = {2015-06-07 21:01:56 +0000},
 95 | 	Date-Modified = {2015-06-07 21:01:56 +0000},
 96 | 	Organization = {Citeseer},
 97 | 	Title = {Handwritten digit recognition with a back-propagation network},
 98 | 	Year = {1990}}
 99 | 
100 | @inproceedings{savva2011revision,
101 | 	Author = {Savva, Manolis and Kong, Nicholas and Chhajta, Arti and Fei-Fei, Li and Agrawala, Maneesh and Heer, Jeffrey},
102 | 	Booktitle = {Proceedings of the 24th annual ACM symposium on User interface software and technology},
103 | 	Date-Added = {2015-06-07 20:02:58 +0000},
104 | 	Date-Modified = {2015-06-07 20:02:58 +0000},
105 | 	Organization = {ACM},
106 | 	Pages = {393--402},
107 | 	Title = {Revision: Automated classification, analysis and redesign of chart images},
108 | 	Year = {2011}}
109 | 
110 | @article{clarklooking,
111 | 	Author = {Clark, Christopher and Divvala, Santosh},
112 | 	Date-Added = {2015-06-07 20:02:19 +0000},
113 | 	Date-Modified = {2015-06-07 20:02:19 +0000},
114 | 	Title = {Looking Beyond Text: Extracting Figures, Tables and Captions from Computer Science Papers}}
115 | 
116 | @inproceedings{smith2007overview,
117 | 	Author = {Smith, Ray},
118 | 	Booktitle = {ICDAR},
119 | 	Date-Added = {2015-06-07 20:01:04 +0000},
120 | 	Date-Modified = {2015-06-07 20:01:04 +0000},
121 | 	Number = {1},
122 | 	Pages = {629--633},
123 | 	Title = {An Overview of the Tesseract OCR Engine.},
124 | 	Volume = {7},
125 | 	Year = {2007}}
126 | 	
127 | @misc{darknet13,
128 |   author =   {Joseph Redmon},
129 |   title =    {Darknet: Open Source Neural Networks in C},
130 |   howpublished = {\url{http://pjreddie.com/darknet/}},
131 |   year = {2013--2016}
132 | }
133 | 


--------------------------------------------------------------------------------
/paper.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Text detection in screen images with a Convolutional Neural Network'
 3 | tags:
 4 |   - deep learning
 5 |   - visualization
 6 |   - text detection
 7 | authors:
 8 |  - name: Dominik Moritz
 9 |    orcid: 0000-0002-3110-1053
10 |    affiliation: 1
11 | affiliations:
12 |  - name: University of Washington
13 |    index: 1
14 | date: 9 March 2017
15 | bibliography: paper.bib
16 | ---
17 | 
18 | # Summary
19 | 
20 | The repository contains a set of scripts to implement text detection from screen images.
21 | The idea is that we use a Convolutional Neural Network (CNN) [@le1990handwritten] to predict a heatmap of the probability of text in an image.
22 | The network outputs a heatmap for text with 64 × 64 pixels and is implemented in Darknet [@darknet13].
23 | To train the network, we use a set of pairs of images and training labels.
24 | We obtain the training data by extracting figures with embedded text from research papers in PDF form and generating pixel masks from them.
25 | 
26 | With the code, we also provide a dataset of around 500K labeled images extracted from 1M papers from arXiv and the ACL anthology.
27 | 
28 | # References
29 | 


--------------------------------------------------------------------------------
/poster.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/poster.pdf


--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
  1 | """Finds and reads text in an image.
  2 | 
  3 | Usage:
  4 |   predict.py TEXT_MASK IMAGE [--thresh=THRESH] [--debug]
  5 |   predict.py (-h | --help)
  6 |   predict.py --version
  7 | 
  8 | Options:
  9 |   --thresh=THRESH   Threshold for mask image [default: 200].
 10 |   --debug           Write debug output.
 11 |   -h --help         Show this screen.
 12 |   --version         Show version.
 13 | """
 14 | 
 15 | import logging
 16 | import copy
 17 | 
 18 | import numpy as np
 19 | import cv2
 20 | from docopt import docopt
 21 | import pytesseract
 22 | from PIL import Image
 23 | # from skimage.restoration import denoise_tv_chambolle
 24 | 
 25 | 
 26 | RED = (0, 0, 255)
 27 | GREEN = (0, 255, 0)
 28 | BLUE = (255, 0, 0)
 29 | BLACK = (0, 0, 0)
 30 | 
 31 | DEBUG = False
 32 | 
 33 | 
 34 | def cvToPIL(image):
 35 |     cv2_im = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 36 |     return Image.fromarray(cv2_im)
 37 | 
 38 | 
 39 | def subimage(image, center, theta, width, height):
 40 |     theta *= np.pi / 180  # convert to rad
 41 | 
 42 |     v_x = (np.cos(theta), np.sin(theta))
 43 |     v_y = (-np.sin(theta), np.cos(theta))
 44 |     s_x = center[0] - v_x[0] * (width / 2) - v_y[0] * (height / 2)
 45 |     s_y = center[1] - v_x[1] * (width / 2) - v_y[1] * (height / 2)
 46 | 
 47 |     mapping = np.array([[v_x[0], v_y[0], s_x],
 48 |                         [v_x[1], v_y[1], s_y]])
 49 | 
 50 |     return cv2.warpAffine(image, mapping, (int(width), int(height)),
 51 |                           flags=cv2.WARP_INVERSE_MAP,
 52 |                           borderMode=cv2.BORDER_REPLICATE)
 53 | 
 54 | 
 55 | def predict_text(mask, image, thresh):
 56 |     mask = cv2.imread(mask, cv2.CV_LOAD_IMAGE_GRAYSCALE)
 57 |     image = cv2.imread(image)
 58 | 
 59 |     h, w, _ = image.shape
 60 |     mask = cv2.resize(mask, (w, h))
 61 | 
 62 |     # add borders
 63 |     b = 12
 64 |     image = cv2.copyMakeBorder(image, b, b, b, b, cv2.BORDER_REPLICATE)
 65 |     mask = cv2.copyMakeBorder(mask, b, b, b, b,
 66 |                               cv2.BORDER_CONSTANT, value=BLACK)
 67 | 
 68 |     # laplace = cv2.Laplacian(mask, cv2.CV_16S, ksize=3, scale=1, delta=0)
 69 |     # laplace = cv2.convertScaleAbs(laplace)
 70 |     # blur = cv2.GaussianBlur(mask, (5, 5), 0)
 71 |     # mask = cv2.addWeighted(mask, 1.5, blur, -0.5, 0)
 72 | 
 73 |     # idea is to increase separation but doesn't work
 74 |     # mask = denoise_tv_chambolle(mask, weight=10, n_iter_max=50)
 75 |     # mask = np.array(mask*255, np.uint8)
 76 | 
 77 |     if DEBUG:
 78 |         dbg_img = copy.copy(image)
 79 |         cv2.imshow('mask', mask)
 80 | 
 81 |     # print pytesseract.image_to_string(cvToPIL(image), config='-psm 3')
 82 | 
 83 |     # threshold the prediction
 84 |     _, thresh = cv2.threshold(mask, thresh, 255, cv2.THRESH_BINARY)
 85 | 
 86 |     # _, thresh = cv2.threshold(mask, 0, 255,
 87 |     #                         cv2.THRESH_BINARY+cv2.THRESH_OTSU)
 88 | 
 89 |     # dilate + erosion
 90 |     # size = 5
 91 |     # kernel = np.ones((size, size), np.uint8)
 92 |     # thresh = cv2.erode(thresh, kernel, iterations=1)
 93 |     # thresh = cv2.dilate(thresh, kernel, iterations=1)
 94 | 
 95 |     if DEBUG:
 96 |         cv2.imshow('thresh', thresh)
 97 |         # cv2.imwrite('thresholded.png', thresh)
 98 | 
 99 |     contours, hierarchy = cv2.findContours(thresh,
100 |                                            cv2.cv.CV_RETR_LIST,
101 |                                            cv2.cv.CV_CHAIN_APPROX_SIMPLE)
102 | 
103 |     for contour in contours:
104 |         # box = cv2.boundingRect(contour)
105 |         # x, y, w, h = scale*np.array(box)
106 |         # cv2.rectangle(image, (int(x), int(y)), (int(x+w), int(y+h)), RED, 3)
107 | 
108 |         rect = cv2.minAreaRect(contour)
109 | 
110 |         # increase size of rect
111 |         dims = rect[1]
112 |         dims = (dims[0]*1.1, dims[1]*1.1)
113 | 
114 |         # snap rotation
115 |         angles = [-360, -270, -180, -90, 0, 90, 180, 270, 360]
116 |         theta = rect[2]
117 |         epsylon = 5  # how large should the snap be
118 |         for a in angles:
119 |             if a-epsylon <= theta <= a+epsylon:
120 |                 theta = a
121 | 
122 |         rect = rect[0], dims, theta
123 | 
124 |         if DEBUG:
125 |             box = cv2.cv.BoxPoints(rect)
126 |             box_np = np.int0(box)
127 |             cv2.drawContours(dbg_img, [box_np], 0, (0, 0, 255), 2)
128 | 
129 |         # skip OCR
130 |         continue
131 | 
132 |         center, (w, h), theta = rect
133 |         patch = subimage(image, center, theta, w, h)
134 | 
135 |         for x in range(4):
136 |             text = pytesseract.image_to_string(cvToPIL(patch))
137 | 
138 |             cv2.putText(dbg_img, text, (int(center[0]), int(10+center[1] + 7*x)), cv2.FONT_HERSHEY_COMPLEX_SMALL, 0.6, BLUE)
139 |             print(text)
140 | 
141 |             # cv2.imshow('patch', patch)
142 |             # cv2.waitKey(0)
143 | 
144 |             patch = cv2.transpose(patch)
145 |             patch = cv2.flip(patch, 0)
146 | 
147 |         print("======")
148 | 
149 |     if DEBUG:
150 |         cv2.imshow('image', dbg_img)
151 |         cv2.imwrite('text-debug.png', dbg_img)
152 |         cv2.waitKey(0)
153 |         cv2.destroyAllWindows()
154 | 
155 | 
156 | if __name__ == '__main__':
157 |     arguments = docopt(__doc__, version='Predictor 1.0')
158 | 
159 |     if arguments['--debug']:
160 |         logging.basicConfig(level=logging.DEBUG)
161 |         DEBUG = True
162 | 
163 |     predict_text(arguments['TEXT_MASK'], arguments['IMAGE'],
164 |                  int(arguments['--thresh']))
165 | 


--------------------------------------------------------------------------------
/rate.py:
--------------------------------------------------------------------------------
  1 | """Calculate the difference between predictions and ground truth.
  2 | 
  3 | Provide a list of predicted files. In the same directory should
  4 | also be the label files. This script assumes correct filenames.
  5 | 
  6 | Usage:
  7 |   rate.py LIST [--thresh=THRESH] [--debug]
  8 |   rate.py (-h | --help)
  9 |   rate.py --version
 10 | 
 11 | Options:
 12 |   --thresh=THRESH   Threshold for predicted image [default: 200].
 13 |   --debug           Write debug output.
 14 |   -h --help         Show this screen.
 15 |   --version         Show version.
 16 | """
 17 | 
 18 | import logging
 19 | import os.path
 20 | 
 21 | import numpy as np
 22 | import cv2
 23 | from docopt import docopt
 24 | 
 25 | 
 26 | DEBUG = False
 27 | 
 28 | 
 29 | def calculate_diff(label_list, thresh):
 30 |     with open(label_list) as f:
 31 | 
 32 |         all_fp = 0.0
 33 |         all_fn = 0.0
 34 |         all_tp = 0.0
 35 | 
 36 |         where = os.path.dirname(label_list)
 37 |         for line in f:
 38 |             pred = os.path.join(where, line.strip())
 39 |             fname = os.path.basename(
 40 |                 pred)[:-14] + "-label.png"
 41 |             truth = os.path.join(where, fname)
 42 | 
 43 |             if not os.path.isfile(pred) or not os.path.isfile(truth):
 44 |                 print("Not found:", pred, truth)
 45 |                 continue
 46 | 
 47 |             truth = cv2.imread(truth, cv2.CV_LOAD_IMAGE_GRAYSCALE)
 48 |             pred = cv2.imread(pred, cv2.CV_LOAD_IMAGE_GRAYSCALE)
 49 | 
 50 |             # resize to predicted image size
 51 |             h, w = pred.shape
 52 |             truth = cv2.resize(truth, (w, h))
 53 | 
 54 |             # threshold to get bw image
 55 |             _, pred = cv2.threshold(pred, thresh, 255, cv2.THRESH_BINARY)
 56 | 
 57 |             # threshold because of scaling interpolation
 58 |             _, truth = cv2.threshold(truth, 127, 255, cv2.THRESH_BINARY)
 59 | 
 60 |             # dilate to account for almost right predictions (see alternative below)
 61 |             kernel = np.ones((3, 3), np.uint8)
 62 |             truth_dil = cv2.dilate(truth, kernel, iterations=3)
 63 |             pred_dil = cv2.dilate(pred, kernel, iterations=3)
 64 | 
 65 |             # no dilation
 66 |             # truth_dil = truth
 67 |             # pred_dil = pred
 68 | 
 69 |             fp = np.sum(pred - truth_dil)
 70 |             fn = np.sum(truth - pred_dil)
 71 |             tp = np.sum(cv2.bitwise_and(pred, truth))
 72 | 
 73 |             all_fp += fp
 74 |             all_fn += fn
 75 |             all_tp += tp
 76 | 
 77 |             if DEBUG:
 78 |                 print(fn, fp, tp)
 79 | 
 80 |                 cv2.imshow('truth', truth)
 81 |                 cv2.imshow('predicted', pred)
 82 | 
 83 |                 cv2.imshow('fp', pred - truth_dil)
 84 |                 cv2.imshow('fn', truth - pred_dil)
 85 |                 cv2.imshow('tp', cv2.bitwise_and(pred, truth))
 86 | 
 87 |                 cv2.moveWindow('truth', 100, 10)
 88 |                 cv2.moveWindow('predicted', 300, 10)
 89 |                 cv2.waitKey(0)
 90 |                 cv2.destroyAllWindows()
 91 | 
 92 |         print("Shape:", h, w)
 93 |         print(all_fp, all_fn, all_tp)
 94 | 
 95 |         precision = all_tp / (all_tp + all_fp)
 96 |         recall = all_tp / (all_tp + all_fn)
 97 | 
 98 |         print("Precision:", precision)
 99 |         print("Recall:", recall)
100 |         print("F1 score:", 2 * precision * recall / (precision + recall))
101 | 
102 | 
103 | if __name__ == '__main__':
104 |     arguments = docopt(__doc__, version='Tester 1.0')
105 | 
106 |     if arguments['--debug']:
107 |         logging.basicConfig(level=logging.DEBUG)
108 |         DEBUG = True
109 | 
110 |     calculate_diff(arguments['LIST'],
111 |                    int(arguments['--thresh']))
112 | 


--------------------------------------------------------------------------------
/render.py:
--------------------------------------------------------------------------------
 1 | from wand.image import Image
 2 | from wand.color import Color
 3 | 
 4 | 
 5 | def render_chart(pdf_file, page, bounds, dpi, target):
 6 |     """Renders part of a pdf file with imagemagick.
 7 |     Pass this function the bounds and resolution.
 8 |     """
 9 | 
10 |     pdf_page = pdf_file + '[{}]'.format(page)
11 |     with Image(filename=pdf_page, resolution=dpi) as img:
12 |         factor = 1.0*dpi/100
13 | 
14 |         x0 = bounds[0]
15 |         y0 = bounds[1]
16 |         w = bounds[2] - x0
17 |         h = bounds[3] - y0
18 | 
19 |         img.crop(left=int(x0*factor), top=int(y0*factor),
20 |                  width=int(w*factor), height=int(h*factor))
21 | 
22 |         # put transparent image on white background
23 |         with Image(width=img.width, height=img.height,
24 |                    background=Color("white")) as bg:
25 |             bg.composite(img, 0, 0)
26 |             bg.save(filename=target)
27 | 
28 | if __name__ == '__main__':
29 |     render_chart('testdata/paper.pdf', 1,
30 |                  [100, 200, 500, 500], 200, '/tmp/rendered_region_2x.png')
31 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | appdirs==1.4.3
 2 | astroid==1.4.9
 3 | awscli==1.11.89
 4 | backports.functools-lru-cache==1.3
 5 | boto==2.46.1
 6 | botocore==1.5.52
 7 | click==6.7
 8 | colorama==0.3.7
 9 | configparser==3.5.0
10 | cycler==0.10.0
11 | decorator==4.0.11
12 | docopt==0.6.2
13 | docutils==0.13.1
14 | flake8==2.5.4
15 | functools32==3.2.3.post2
16 | future==0.16.0
17 | futures==3.1.1
18 | isort==4.2.5
19 | jmespath==0.9.2
20 | lazy-object-proxy==1.2.2
21 | mccabe==0.4.0
22 | networkx==1.11
23 | nose==1.3.7
24 | olefile==0.44
25 | packaging==16.8
26 | pep8==1.7.0
27 | Pillow==4.1.1
28 | proselint==0.4.0
29 | protobuf==3.3.0
30 | pyasn1==0.2.3
31 | pyflakes==1.0.0
32 | Pygments==2.2.0
33 | pylint==1.6.4
34 | pyparsing==2.1.10
35 | pytesseract==0.1.6
36 | python-dateutil==2.6.0
37 | pytz==2016.10
38 | PyWavelets==0.5.2
39 | PyYAML==3.12
40 | rsa==3.4.2
41 | s3transfer==0.1.10
42 | scikit-image==0.13.0
43 | six==1.10.0
44 | subprocess32==3.2.7
45 | virtualenv==15.0.3
46 | virtualfish==1.0.1
47 | Wand==0.4.4
48 | wrapt==1.10.8
49 | 


--------------------------------------------------------------------------------
/requirements_unfrozen.txt:
--------------------------------------------------------------------------------
1 | docopt
2 | Wand
3 | boto
4 | awscli
5 | pytesseract
6 | pillow
7 | scikit-image
8 | 


--------------------------------------------------------------------------------
/screenshots/a_debug.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/screenshots/a_debug.png


--------------------------------------------------------------------------------
/screenshots/a_label.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/screenshots/a_label.png


--------------------------------------------------------------------------------
/screenshots/debug.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/screenshots/debug.png


--------------------------------------------------------------------------------
/screenshots/extracted.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/screenshots/extracted.jpg


--------------------------------------------------------------------------------
/screenshots/hep-th0401120-Figure-23-2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/screenshots/hep-th0401120-Figure-23-2x.png


--------------------------------------------------------------------------------
/screenshots/hep-th0401120-Figure-23-label.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/screenshots/hep-th0401120-Figure-23-label.png


--------------------------------------------------------------------------------
/screenshots/hep-th0401120-Figure-23-prediction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/screenshots/hep-th0401120-Figure-23-prediction.png


--------------------------------------------------------------------------------
/screenshots/hep-th0401120-Figure-23.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/screenshots/hep-th0401120-Figure-23.png


--------------------------------------------------------------------------------
/screenshots/text-debug.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/screenshots/text-debug.png


--------------------------------------------------------------------------------
/testdata/figure.json:
--------------------------------------------------------------------------------
 1 | {"Type":"Figure",
 2 | "Number": 5,
 3 | "Page": 8,
 4 | "DPI": 100,
 5 | "Width": 850,
 6 | "Height": 1100,
 7 | "CaptionBB": [184,453,663,508],
 8 | "Caption": "Figure 5:   Transverse-to-the-wind component of the force as a function of time, for various small angular speeds of the rotating cylinder with R = 1, 000: (a) ω = 0.1; (b) ω = 0.2; (c) ω = 0.3; (d) ω = 0.4; and (e) ω = 0.5. After the transient stage, the Magnus force oscillates around an average value. ",
 9 | "ImageBB": [249,188,580,438],
10 | "ImageText" : [
11 | {"Rotation": 3,"TextBB": [251.402,290.861,264.54,361.89], "Text": "Transverse"},
12 | 	{"Rotation": 3,"TextBB": [251.402,258.492,264.54,286.912], "Text": "drag"},
13 | 	{"Rotation": 3,"TextBB": [251.402,222.97,264.54,254.544], "Text": "force"},
14 | 	{"Rotation": 0,"TextBB": [286.833,196.356,306.575,209.494], "Text": "1.4"},
15 | 	{"Rotation": 0,"TextBB": [286.833,220.502,306.575,233.64], "Text": "1.2"},
16 | 	{"Rotation": 0,"TextBB": [298.678,244.647,306.575,257.784], "Text": "1"},
17 | 	{"Rotation": 0,"TextBB": [286.833,268.792,306.575,281.93], "Text": "0.8"},
18 | 	{"Rotation": 0,"TextBB": [286.833,293.009,306.575,306.147], "Text": "0.6"},
19 | 	{"Rotation": 0,"TextBB": [476.228,299.045,483.329,312.183], "Text": "c"},
20 | 	{"Rotation": 0,"TextBB": [476.228,311.117,484.125,324.255], "Text": "b"},
21 | 	{"Rotation": 0,"TextBB": [476.228,329.226,484.125,342.364], "Text": "d"},
22 | 	{"Rotation": 0,"TextBB": [474.452,337.677,482.349,350.815], "Text": "a"},
23 | 	{"Rotation": 0,"TextBB": [286.833,317.154,306.575,330.291], "Text": "0.4"},
24 | 	{"Rotation": 0,"TextBB": [286.833,341.298,306.575,354.436], "Text": "0.2"},
25 | 	{"Rotation": 0,"TextBB": [298.678,365.444,306.575,378.582], "Text": "0"},
26 | 	{"Rotation": 0,"TextBB": [563.789,376.025,571.686,389.163], "Text": "e"},
27 | 	{"Rotation": 0,"TextBB": [282.103,389.588,306.575,402.726], "Text": "-0.2"},
28 | 	{"Rotation": 0,"TextBB": [313.121,403.792,321.018,416.93], "Text": "0"},
29 | 	{"Rotation": 0,"TextBB": [340.235,403.792,363.925,416.93], "Text": "100"},
30 | 	{"Rotation": 0,"TextBB": [375.246,403.792,398.936,416.93], "Text": "200"},
31 | 	{"Rotation": 0,"TextBB": [410.328,403.792,434.018,416.93], "Text": "300"},
32 | 	{"Rotation": 0,"TextBB": [445.337,403.792,469.028,416.93], "Text": "400"},
33 | 	{"Rotation": 0,"TextBB": [480.348,403.792,504.039,416.93], "Text": "500"},
34 | 	{"Rotation": 0,"TextBB": [515.358,403.792,539.049,416.93], "Text": "600"},
35 | 	{"Rotation": 0,"TextBB": [550.369,403.792,574.06,416.93], "Text": "700"},
36 | 	{"Rotation": 0,"TextBB": [430.622,425.097,462.181,438.234], "Text": "Time"}
37 | ]}
38 | 


--------------------------------------------------------------------------------
/testdata/figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testdata/figure.png


--------------------------------------------------------------------------------
/testdata/figure2.json:
--------------------------------------------------------------------------------
 1 | {"Type":"Figure",
 2 | "Number": 3,
 3 | "Page": 10,
 4 | "DPI": 100,
 5 | "Width": 850,
 6 | "Height": 1100,
 7 | "CaptionBB": [489,115,730,128],
 8 | "Caption": "Figure 3: Total increase in optimization time",
 9 | "ImageBB": [447,127,774,372],
10 | "ImageText" : [
11 | {"Rotation": 0,"TextBB": [490.517,132.755,510.95,144.008], "Text": "and"},
12 | 	{"Rotation": 0,"TextBB": [514.353,132.755,537.12,144.008], "Text": "time"},
13 | 	{"Rotation": 0,"TextBB": [540.523,132.755,570.577,144.008], "Text": "spent"},
14 | 	{"Rotation": 0,"TextBB": [573.861,132.755,583.293,144.008], "Text": "in"},
15 | 	{"Rotation": 0,"TextBB": [586.696,132.755,664.631,144.008], "Text": "view-matching"},
16 | 	{"Rotation": 0,"TextBB": [668.034,132.755,688.302,144.008], "Text": "rule"},
17 | 	{"Rotation": 3,"TextBB": [464.835,250.416,475.163,295.643], "Text": "Increase"},
18 | 	{"Rotation": 3,"TextBB": [464.835,237.303,475.163,247.292], "Text": "in"},
19 | 	{"Rotation": 3,"TextBB": [464.835,168.174,475.163,234.127], "Text": "optimization"},
20 | 	{"Rotation": 3,"TextBB": [479.5,220.274,489.828,243.314], "Text": "time"},
21 | 	{"Rotation": 0,"TextBB": [496.183,188.803,508.557,199.131], "Text": "50"},
22 | 	{"Rotation": 0,"TextBB": [496.183,207.968,508.557,218.296], "Text": "40"},
23 | 	{"Rotation": 0,"TextBB": [496.183,227.299,508.557,237.627], "Text": "30"},
24 | 	{"Rotation": 0,"TextBB": [496.183,246.464,508.557,256.792], "Text": "20"},
25 | 	{"Rotation": 0,"TextBB": [496.183,265.795,508.557,276.123], "Text": "10"},
26 | 	{"Rotation": 0,"TextBB": [502.349,284.96,508.557,295.288], "Text": "0"},
27 | 	{"Rotation": 0,"TextBB": [513.849,301.126,520.057,311.455], "Text": "0"},
28 | 	{"Rotation": 0,"TextBB": [496.183,169.472,508.557,179.8], "Text": "60"},
29 | 	{"Rotation": 0,"TextBB": [553.511,301.126,572.051,311.455], "Text": "200"},
30 | 	{"Rotation": 0,"TextBB": [599.34,301.126,617.879,311.455], "Text": "400"},
31 | 	{"Rotation": 0,"TextBB": [645.168,301.126,663.708,311.455], "Text": "600"},
32 | 	{"Rotation": 0,"TextBB": [690.997,301.126,709.536,311.455], "Text": "800"},
33 | 	{"Rotation": 0,"TextBB": [733.659,301.126,758.364,311.455], "Text": "1000"},
34 | 	{"Rotation": 0,"TextBB": [600.341,320.791,615.191,331.12], "Text": "No"},
35 | 	{"Rotation": 0,"TextBB": [618.367,320.791,628.873,331.12], "Text": "of"},
36 | 	{"Rotation": 0,"TextBB": [631.987,320.791,662.542,331.12], "Text": "views"},
37 | 	{"Rotation": 0,"TextBB": [522.181,351.454,546.948,361.782], "Text": "Total"},
38 | 	{"Rotation": 0,"TextBB": [550.136,351.454,592.373,361.782], "Text": "increase"},
39 | 	{"Rotation": 0,"TextBB": [629.005,351.454,653.231,361.782], "Text": "View"},
40 | 	{"Rotation": 0,"TextBB": [656.169,351.454,701.706,361.782], "Text": "matching"},
41 | 	{"Rotation": 0,"TextBB": [704.83,351.454,726.205,361.782], "Text": "time"}
42 | ]}
43 | 


--------------------------------------------------------------------------------
/testdata/figure2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testdata/figure2.png


--------------------------------------------------------------------------------
/testdata/paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testdata/paper.pdf


--------------------------------------------------------------------------------
/testoutput/img/paper-Figure-0-2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/img/paper-Figure-0-2x.png


--------------------------------------------------------------------------------
/testoutput/img/paper-Figure-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/img/paper-Figure-0.png


--------------------------------------------------------------------------------
/testoutput/img/paper-Figure-1-2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/img/paper-Figure-1-2x.png


--------------------------------------------------------------------------------
/testoutput/img/paper-Figure-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/img/paper-Figure-1.png


--------------------------------------------------------------------------------
/testoutput/img/paper-Figure-2-2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/img/paper-Figure-2-2x.png


--------------------------------------------------------------------------------
/testoutput/img/paper-Figure-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/img/paper-Figure-2.png


--------------------------------------------------------------------------------
/testoutput/img/paper-Figure-3-2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/img/paper-Figure-3-2x.png


--------------------------------------------------------------------------------
/testoutput/img/paper-Figure-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/img/paper-Figure-3.png


--------------------------------------------------------------------------------
/testoutput/img/paper-Figure-4-2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/img/paper-Figure-4-2x.png


--------------------------------------------------------------------------------
/testoutput/img/paper-Figure-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/img/paper-Figure-4.png


--------------------------------------------------------------------------------
/testoutput/img/paper-Figure-5-2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/img/paper-Figure-5-2x.png


--------------------------------------------------------------------------------
/testoutput/img/paper-Figure-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/img/paper-Figure-5.png


--------------------------------------------------------------------------------
/testoutput/json/paper-Figure-0.json:
--------------------------------------------------------------------------------
1 | {"ImageText": [{"Text": "Drag", "Rotation": 3, "TextBB": [193.705, 468.003, 211.227, 509.053]}, {"Text": "Force", "Rotation": 3, "TextBB": [193.705, 414.319, 211.227, 462.737]}, {"Text": "1.2", "Rotation": 0, "TextBB": [240.958, 317.446, 267.288, 334.968]}, {"Text": "1.1", "Rotation": 0, "TextBB": [240.958, 344.818, 267.288, 362.341]}, {"Text": "1", "Rotation": 0, "TextBB": [256.756, 372.191, 267.289, 389.713]}, {"Text": "0.9", "Rotation": 0, "TextBB": [240.958, 399.564, 267.288, 417.086]}, {"Text": "0.8", "Rotation": 0, "TextBB": [240.958, 426.936, 267.288, 444.459]}, {"Text": "0.7", "Rotation": 0, "TextBB": [240.958, 454.403, 267.288, 471.925]}, {"Text": "0.6", "Rotation": 0, "TextBB": [240.958, 481.775, 267.288, 499.298]}, {"Text": "0.5", "Rotation": 0, "TextBB": [240.958, 509.149, 267.288, 526.671]}, {"Text": "0.4", "Rotation": 0, "TextBB": [240.958, 536.521, 267.288, 554.043]}, {"Text": "0.3", "Rotation": 0, "TextBB": [240.958, 563.893, 267.288, 581.416]}, {"Text": "0.2", "Rotation": 0, "TextBB": [240.958, 591.266, 267.288, 608.788]}, {"Text": "0", "Rotation": 0, "TextBB": [276.02, 610.209, 286.553, 627.731]}, {"Text": "10", "Rotation": 0, "TextBB": [305.799, 610.209, 326.864, 627.731]}, {"Text": "20", "Rotation": 0, "TextBB": [340.844, 610.209, 361.908, 627.731]}, {"Text": "30", "Rotation": 0, "TextBB": [375.888, 610.209, 396.953, 627.731]}, {"Text": "40", "Rotation": 0, "TextBB": [410.931, 610.209, 431.996, 627.731]}, {"Text": "50", "Rotation": 0, "TextBB": [445.976, 610.209, 467.04, 627.731]}, {"Text": "60", "Rotation": 0, "TextBB": [480.926, 610.209, 501.99, 627.731]}, {"Text": "70", "Rotation": 0, "TextBB": [515.97, 610.209, 537.035, 627.731]}, {"Text": "80", "Rotation": 0, "TextBB": [551.015, 610.209, 572.079, 627.731]}, {"Text": "90", "Rotation": 0, "TextBB": [586.059, 610.209, 607.124, 627.731]}, {"Text": "100", "Rotation": 0, "TextBB": [615.837, 610.209, 647.434, 627.731]}], "Caption": "Figure 1: Drag force on a static cylinder in the wind tunnel 
initially switched off. It is turned on at t = 0 with a constant Reynolds number R = 1, 000, which defines the speed of the wind. The force in the direction perpendicular to the wind flow is negligible. The dimensionless drag force is plotted in arbitrary units, ignoring factors as the fluid density, etc. During one time unit the wind travels one cylinder diameter. The force increases, reaches a maximum value, decreases and finally stabilizes, corresponding to a dynamic situation where successive vortices appear continuously, slowly going away, one running clockwise, the next running counterclockwise and so on. The long von Ka\u0301rma\u0301n street is then formed. ", "ImageBB": [191, 315, 649, 655], "CaptionBB": [184, 672, 663, 779], "Number": 1, "Height": 1100, "Width": 850, "Type": "Figure", "Page": 4, "DPI": 100}


--------------------------------------------------------------------------------
/testoutput/json/paper-Figure-1.json:
--------------------------------------------------------------------------------
1 | {"ImageText": [{"Text": "b", "Rotation": 0, "TextBB": [501.98, 557.757, 504.978, 562.171]}], "Caption": "Figure 2:   Streamlines for Stokes\u2019s configuration (R \u2192 0), when the cylinder rotates clockwise with (a) \u03c9 = 0.1 and (b) \u03c9 = 0.5. The rotation of the cylinder results in the breakdown of the axial symmetry. As the angular speed increases, the lines are deformed, as expected. The wind goes from left to right. ", "ImageBB": [243, 496, 606, 656], "CaptionBB": [184, 667, 663, 722], "Number": 2, "Height": 1100, "Width": 850, "Type": "Figure", "Page": 5, "DPI": 100}


--------------------------------------------------------------------------------
/testoutput/json/paper-Figure-2.json:
--------------------------------------------------------------------------------
1 | {"ImageText": [{"Text": "b", "Rotation": 0, "TextBB": [359.135, 315.814, 362.983, 321.148]}, {"Text": "a", "Rotation": 0, "TextBB": [196.753, 317.531, 200.17, 321.148]}, {"Text": "c", "Rotation": 0, "TextBB": [521.253, 317.531, 524.67, 321.148]}], "Caption": "Figure 3:    Starting with Stokes\u2019s laminar configuration for the clockwise rotating cylinder, the wind is switched on at t = 0, with a fixed Reynolds number R = 1,000 and \u03c9 = 0.5. After a transient time, the velocity field changes continually and periodically. Figure (a) shows a vortex that has just formed behind the cylinder, rotating in the counterclockwise direction. It then moves away to the right. Some time later, another vortex is formed behind the cylinder, rotating in the clockwise direction, figure (b). Later, figure (c), the velocity field returns to the configuration observed in figure (a). The whole process is periodically repeated. At this fine scale near the cylinder back side, the same behavior is observed in the case of a fixed cylinder. However, behind this region there is a long von Ka\u0301rma\u0301n street of vortices, now bended downwards in this slowly rotating case here considered. ", "ImageBB": [183, 173, 665, 466], "CaptionBB": [184, 477, 663, 610], "Number": 3, "Height": 1100, "Width": 850, "Type": "Figure", "Page": 6, "DPI": 100}


--------------------------------------------------------------------------------
/testoutput/json/paper-Figure-3.json:
--------------------------------------------------------------------------------
1 | {"ImageText": [{"Text": "1", "Rotation": 0, "TextBB": [296.344, 284.617, 304.241, 297.755]}, {"Text": "a", "Rotation": 0, "TextBB": [554.426, 321.545, 562.323, 334.683]}, {"Text": "Force", "Rotation": 3, "TextBB": [249.067, 374.615, 262.205, 410.918]}, {"Text": "0.8", "Rotation": 0, "TextBB": [284.498, 325.665, 304.241, 338.803]}, {"Text": "0.6", "Rotation": 0, "TextBB": [284.498, 366.711, 304.241, 379.848]}, {"Text": "0.4", "Rotation": 0, "TextBB": [284.498, 407.829, 304.241, 420.966]}, {"Text": "b", "Rotation": 0, "TextBB": [554.426, 438.578, 562.323, 451.716]}, {"Text": "0.2", "Rotation": 0, "TextBB": [284.498, 448.876, 304.241, 462.014]}, {"Text": "0", "Rotation": 0, "TextBB": [296.344, 489.923, 304.241, 503.061]}, {"Text": "0", "Rotation": 0, "TextBB": [310.787, 504.126, 318.684, 517.264]}, {"Text": "100", "Rotation": 0, "TextBB": [355.443, 504.126, 379.134, 517.264]}, {"Text": "200", "Rotation": 0, "TextBB": [407.994, 504.126, 431.685, 517.264]}, {"Text": "300", "Rotation": 0, "TextBB": [460.473, 504.126, 484.164, 517.264]}, {"Text": "Time", "Rotation": 0, "TextBB": [428.289, 525.43, 459.848, 538.568]}, {"Text": "400", "Rotation": 0, "TextBB": [513.025, 504.126, 536.716, 517.264]}, {"Text": "500", "Rotation": 0, "TextBB": [565.576, 504.126, 589.267, 517.264]}], "Caption": "Figure 4: The drag force on a rotating cylinder inside the wind tunnel with Reynolds number R = 1,000, and \u03c9 = 0.1. The parallel-to-the-wind component of the drag force (a) shows the same behavior as before, in the case of fixed cylinder. During the transient stage, the transverse-to-the-wind component of the force (b) also increases, reaches a maximum, and decreases. However, this component does not stabilize. Instead, it oscillates around a mean value. The oscillation is due to the alternate formation of counterclockwise and clockwise vortices, being a signature of the von Ka\u0301rma\u0301n street. 
For small values of R, up to a few dozens, both components of the drag force don\u2019t oscillate, a signature of the absence of successive vortices. ", "ImageBB": [247, 281, 591, 538], "CaptionBB": [184, 554, 663, 674], "Number": 4, "Height": 1100, "Width": 850, "Type": "Figure", "Page": 7, "DPI": 100}


--------------------------------------------------------------------------------
/testoutput/json/paper-Figure-4.json:
--------------------------------------------------------------------------------
1 | {"ImageText": [{"Text": "Transverse", "Rotation": 3, "TextBB": [251.402, 290.861, 264.54, 361.89]}, {"Text": "drag", "Rotation": 3, "TextBB": [251.402, 258.492, 264.54, 286.912]}, {"Text": "force", "Rotation": 3, "TextBB": [251.402, 222.97, 264.54, 254.544]}, {"Text": "1.4", "Rotation": 0, "TextBB": [286.833, 196.356, 306.575, 209.494]}, {"Text": "1.2", "Rotation": 0, "TextBB": [286.833, 220.502, 306.575, 233.64]}, {"Text": "1", "Rotation": 0, "TextBB": [298.678, 244.647, 306.575, 257.784]}, {"Text": "0.8", "Rotation": 0, "TextBB": [286.833, 268.792, 306.575, 281.93]}, {"Text": "0.6", "Rotation": 0, "TextBB": [286.833, 293.009, 306.575, 306.147]}, {"Text": "c", "Rotation": 0, "TextBB": [476.228, 299.045, 483.329, 312.183]}, {"Text": "b", "Rotation": 0, "TextBB": [476.228, 311.117, 484.125, 324.255]}, {"Text": "d", "Rotation": 0, "TextBB": [476.228, 329.226, 484.125, 342.364]}, {"Text": "a", "Rotation": 0, "TextBB": [474.452, 337.677, 482.349, 350.815]}, {"Text": "0.4", "Rotation": 0, "TextBB": [286.833, 317.154, 306.575, 330.291]}, {"Text": "0.2", "Rotation": 0, "TextBB": [286.833, 341.298, 306.575, 354.436]}, {"Text": "0", "Rotation": 0, "TextBB": [298.678, 365.444, 306.575, 378.582]}, {"Text": "e", "Rotation": 0, "TextBB": [563.789, 376.025, 571.686, 389.163]}, {"Text": "-0.2", "Rotation": 0, "TextBB": [282.103, 389.588, 306.575, 402.726]}, {"Text": "0", "Rotation": 0, "TextBB": [313.121, 403.792, 321.018, 416.93]}, {"Text": "100", "Rotation": 0, "TextBB": [340.235, 403.792, 363.925, 416.93]}, {"Text": "200", "Rotation": 0, "TextBB": [375.246, 403.792, 398.936, 416.93]}, {"Text": "300", "Rotation": 0, "TextBB": [410.328, 403.792, 434.018, 416.93]}, {"Text": "400", "Rotation": 0, "TextBB": [445.337, 403.792, 469.028, 416.93]}, {"Text": "500", "Rotation": 0, "TextBB": [480.348, 403.792, 504.039, 416.93]}, {"Text": "600", "Rotation": 0, "TextBB": [515.358, 403.792, 539.049, 416.93]}, {"Text": "700", "Rotation": 0, "TextBB": [550.369, 403.792, 574.06, 416.93]}, 
{"Text": "Time", "Rotation": 0, "TextBB": [430.622, 425.097, 462.181, 438.234]}], "Caption": "Figure 5:   Transverse-to-the-wind component of the force as a function of time, for various small angular speeds of the rotating cylinder with R = 1, 000: (a) \u03c9 = 0.1; (b) \u03c9 = 0.2; (c) \u03c9 = 0.3; (d) \u03c9 = 0.4; and (e) \u03c9 = 0.5. After the transient stage, the Magnus force oscillates around an average value. ", "ImageBB": [249, 188, 580, 438], "CaptionBB": [184, 453, 663, 508], "Number": 5, "Height": 1100, "Width": 850, "Type": "Figure", "Page": 8, "DPI": 100}


--------------------------------------------------------------------------------
/testoutput/json/paper-Figure-5.json:
--------------------------------------------------------------------------------
1 | {"ImageText": [{"Text": "Transversal", "Rotation": 3, "TextBB": [216.749, 305.183, 232.518, 394.219]}, {"Text": "Forces", "Rotation": 3, "TextBB": [216.749, 248.349, 232.518, 300.444]}, {"Text": "20", "Rotation": 0, "TextBB": [253.785, 191.481, 272.741, 207.25]}, {"Text": "18", "Rotation": 0, "TextBB": [253.785, 216.114, 272.741, 231.883]}, {"Text": "16", "Rotation": 0, "TextBB": [253.785, 240.746, 272.741, 256.515]}, {"Text": "14", "Rotation": 0, "TextBB": [253.785, 265.38, 272.741, 281.148]}, {"Text": "12", "Rotation": 0, "TextBB": [253.785, 290.013, 272.741, 305.781]}, {"Text": "10", "Rotation": 0, "TextBB": [253.785, 314.645, 272.741, 330.413]}, {"Text": "8", "Rotation": 0, "TextBB": [263.263, 339.364, 272.741, 355.133]}, {"Text": "6", "Rotation": 0, "TextBB": [263.263, 363.996, 272.741, 379.765]}, {"Text": "4", "Rotation": 0, "TextBB": [263.263, 388.63, 272.741, 404.398]}, {"Text": "2", "Rotation": 0, "TextBB": [263.263, 413.262, 272.741, 429.03]}, {"Text": "0", "Rotation": 0, "TextBB": [263.263, 437.895, 272.741, 453.663]}, {"Text": "b", "Rotation": 0, "TextBB": [593.224, 245.35, 602.702, 261.119]}, {"Text": "a", "Rotation": 0, "TextBB": [593.224, 350.871, 602.702, 366.64]}, {"Text": "0", "Rotation": 0, "TextBB": [280.599, 454.942, 290.077, 470.711]}, {"Text": "100", "Rotation": 0, "TextBB": [322.006, 454.942, 350.44, 470.711]}, {"Text": "200", "Rotation": 0, "TextBB": [372.806, 454.942, 401.24, 470.711]}, {"Text": "300", "Rotation": 0, "TextBB": [423.692, 454.942, 452.126, 470.711]}, {"Text": "400", "Rotation": 0, "TextBB": [474.577, 454.942, 503.011, 470.711]}, {"Text": "500", "Rotation": 0, "TextBB": [525.461, 454.942, 553.896, 470.711]}, {"Text": "600", "Rotation": 0, "TextBB": [576.261, 454.942, 604.696, 470.711]}, {"Text": "Time", "Rotation": 0, "TextBB": [426.743, 480.513, 464.621, 496.281]}, {"Text": "12", "Rotation": 0, "TextBB": [253.785, 513.481, 272.741, 529.25]}, {"Text": "Longitudinal", "Rotation": 3, "TextBB": [216.749, 625.744, 232.518, 
717.661]}, {"Text": "Forces", "Rotation": 3, "TextBB": [216.749, 568.909, 232.518, 621.005]}, {"Text": "10", "Rotation": 0, "TextBB": [253.785, 548.683, 272.741, 564.451]}, {"Text": "8", "Rotation": 0, "TextBB": [263.263, 583.885, 272.741, 599.654]}, {"Text": "6", "Rotation": 0, "TextBB": [263.263, 619.087, 272.741, 634.855]}, {"Text": "d", "Rotation": 0, "TextBB": [593.224, 646.618, 602.702, 662.386]}, {"Text": "4", "Rotation": 0, "TextBB": [263.263, 654.289, 272.741, 670.058]}, {"Text": "2", "Rotation": 0, "TextBB": [263.263, 689.491, 272.741, 705.259]}, {"Text": "c", "Rotation": 0, "TextBB": [593.224, 706.026, 601.747, 721.795]}, {"Text": "0", "Rotation": 0, "TextBB": [263.263, 724.694, 272.741, 740.462]}, {"Text": "-2", "Rotation": 0, "TextBB": [257.586, 759.895, 272.741, 775.663]}, {"Text": "0", "Rotation": 0, "TextBB": [280.599, 776.942, 290.077, 792.711]}, {"Text": "100", "Rotation": 0, "TextBB": [322.006, 776.942, 350.44, 792.711]}, {"Text": "200", "Rotation": 0, "TextBB": [372.806, 776.942, 401.24, 792.711]}, {"Text": "300", "Rotation": 0, "TextBB": [423.692, 776.942, 452.126, 792.711]}, {"Text": "400", "Rotation": 0, "TextBB": [474.577, 776.942, 503.011, 792.711]}, {"Text": "500", "Rotation": 0, "TextBB": [525.461, 776.942, 553.896, 792.711]}, {"Text": "600", "Rotation": 0, "TextBB": [576.261, 776.942, 604.696, 792.711]}], "Caption": "Figure 6:    On the top, we show the transverse-to-the-wind component of the drag force for \u03c9 = 2.5 (a) and \u03c9 = 4.5 (b). When \u03c9 \u2248 2.5, the oscillations are attenuated over time and tend to disappear, due to the absence of the von Ka\u0301rma\u0301n street. However, in (b), when \u03c9 \u2248 4.5, the oscillations (and the street) are present again. On the bottom, we show the parallel-to-the-wind component of the drag force for \u03c9 = 2.5 (c) and \u03c9 = 4.5 (d). Note that this component is very small for \u03c9 \u2248 2.5. 
But, in (d), when \u03c9 \u2248 4.5, as before, the flow is unstable again and the oscillations reappear around a larger average value. ", "ImageBB": [214, 188, 611, 817], "CaptionBB": [184, 834, 663, 928], "Number": 6, "Height": 1100, "Width": 850, "Type": "Figure", "Page": 10, "DPI": 100}


--------------------------------------------------------------------------------
/testoutput/text-masked/paper-Figure-0-dbg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/text-masked/paper-Figure-0-dbg.png


--------------------------------------------------------------------------------
/testoutput/text-masked/paper-Figure-0-label.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/text-masked/paper-Figure-0-label.png


--------------------------------------------------------------------------------
/testoutput/text-masked/paper-Figure-1-dbg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/text-masked/paper-Figure-1-dbg.png


--------------------------------------------------------------------------------
/testoutput/text-masked/paper-Figure-1-label.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/text-masked/paper-Figure-1-label.png


--------------------------------------------------------------------------------
/testoutput/text-masked/paper-Figure-2-dbg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/text-masked/paper-Figure-2-dbg.png


--------------------------------------------------------------------------------
/testoutput/text-masked/paper-Figure-2-label.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/text-masked/paper-Figure-2-label.png


--------------------------------------------------------------------------------
/testoutput/text-masked/paper-Figure-3-dbg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/text-masked/paper-Figure-3-dbg.png


--------------------------------------------------------------------------------
/testoutput/text-masked/paper-Figure-3-label.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/text-masked/paper-Figure-3-label.png


--------------------------------------------------------------------------------
/testoutput/text-masked/paper-Figure-4-dbg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/text-masked/paper-Figure-4-dbg.png


--------------------------------------------------------------------------------
/testoutput/text-masked/paper-Figure-4-label.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/text-masked/paper-Figure-4-label.png


--------------------------------------------------------------------------------
/testoutput/text-masked/paper-Figure-5-dbg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/text-masked/paper-Figure-5-dbg.png


--------------------------------------------------------------------------------
/testoutput/text-masked/paper-Figure-5-label.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/domoritz/label_generator/a684a4ab4ed46c977d6dd43f021e691ca043c34b/testoutput/text-masked/paper-Figure-5-label.png


--------------------------------------------------------------------------------