├── test
    ├── samples
    │   ├── 029.pdf
    │   ├── 001.pdf
    │   ├── 002.pdf
    │   ├── 003.pdf
    │   ├── 004.pdf
    │   ├── 005.pdf
    │   ├── 006.pdf
    │   ├── 007.pdf
    │   ├── 008.pdf
    │   ├── 009.pdf
    │   ├── 010.pdf
    │   ├── 011.pdf
    │   ├── 012.pdf
    │   ├── 013.pdf
    │   ├── 014.pdf
    │   ├── 016.pdf
    │   ├── 017.pdf
    │   ├── 018.pdf
    │   ├── 019.pdf
    │   ├── 020.pdf
    │   ├── 021.pdf
    │   ├── 022.pdf
    │   ├── 023.pdf
    │   ├── 024.pdf
    │   ├── 025.pdf
    │   ├── 026.pdf
    │   ├── 027.pdf
    │   ├── 028.pdf
    │   ├── 030.pdf
    │   ├── 031.pdf
    │   ├── 032.pdf
    │   └── 033.pdf
    ├── bashunit.bash
    └── run.sh
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
└── pdftitle


/test/samples/029.pdf:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | /_
3 | 


--------------------------------------------------------------------------------
/test/samples/001.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/001.pdf


--------------------------------------------------------------------------------
/test/samples/002.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/002.pdf


--------------------------------------------------------------------------------
/test/samples/003.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/003.pdf


--------------------------------------------------------------------------------
/test/samples/004.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/004.pdf


--------------------------------------------------------------------------------
/test/samples/005.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/005.pdf


--------------------------------------------------------------------------------
/test/samples/006.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/006.pdf


--------------------------------------------------------------------------------
/test/samples/007.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/007.pdf


--------------------------------------------------------------------------------
/test/samples/008.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/008.pdf


--------------------------------------------------------------------------------
/test/samples/009.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/009.pdf


--------------------------------------------------------------------------------
/test/samples/010.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/010.pdf


--------------------------------------------------------------------------------
/test/samples/011.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/011.pdf


--------------------------------------------------------------------------------
/test/samples/012.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/012.pdf


--------------------------------------------------------------------------------
/test/samples/013.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/013.pdf


--------------------------------------------------------------------------------
/test/samples/014.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/014.pdf


--------------------------------------------------------------------------------
/test/samples/016.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/016.pdf


--------------------------------------------------------------------------------
/test/samples/017.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/017.pdf


--------------------------------------------------------------------------------
/test/samples/018.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/018.pdf


--------------------------------------------------------------------------------
/test/samples/019.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/019.pdf


--------------------------------------------------------------------------------
/test/samples/020.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/020.pdf


--------------------------------------------------------------------------------
/test/samples/021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/021.pdf


--------------------------------------------------------------------------------
/test/samples/022.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/022.pdf


--------------------------------------------------------------------------------
/test/samples/023.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/023.pdf


--------------------------------------------------------------------------------
/test/samples/024.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/024.pdf


--------------------------------------------------------------------------------
/test/samples/025.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/025.pdf


--------------------------------------------------------------------------------
/test/samples/026.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/026.pdf


--------------------------------------------------------------------------------
/test/samples/027.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/027.pdf


--------------------------------------------------------------------------------
/test/samples/028.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/028.pdf


--------------------------------------------------------------------------------
/test/samples/030.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/030.pdf


--------------------------------------------------------------------------------
/test/samples/031.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/031.pdf


--------------------------------------------------------------------------------
/test/samples/032.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/032.pdf


--------------------------------------------------------------------------------
/test/samples/033.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/033.pdf


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: bash
 2 | 
 3 | sudo: false
 4 | 
 5 | addons:
 6 |   apt:
 7 |     packages:
 8 |       - poppler-utils
 9 | 
10 | script: test/run.sh -v
11 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2013, Uwe Dauernheim <uwe@dauernheim.net>
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met: 
 6 | 
 7 | 1. Redistributions of source code must retain the above copyright notice, this
 8 |    list of conditions and the following disclaimer. 
 9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 |    this list of conditions and the following disclaimer in the documentation
11 |    and/or other materials provided with the distribution. 
12 | 
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | 
24 | The views and conclusions contained in the software and documentation are those
25 | of the authors and should not be interpreted as representing official policies, 
26 | either expressed or implied, of the FreeBSD Project.
27 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # pdftitle
  2 | 
  3 | The commandline tool `pdftitle` is a Python implementation of the
  4 | *SciPlore Xtract*[1] paper, using mostly a structural layout analysis.
  5 | 
  6 | By now, Docear has published the open-source tool
  7 | [PDF Inspector](https://github.com/Docear/PDF-Inspector), which does roughly the
  8 | same as this script. Compared to `pdftitle`, PDF Inspector:
  9 | 
 10 | - Is written in Java
 11 | - Uses ~~PDFBox~~ *jPod* instead of *pdftohtml*
 12 | - Uses simpler heuristics
 13 | 
 14 | > [1] *Joeran Beel, Bela Gipp, Ammar Shaker, and Nick Friedrich*.
 15 | > [SciPlore Xtract: Extracting Titles from Scientific PDF documents by Analyzing
 16 | > Style Information (Font Size)](http://docear.org/papers/SciPlore%20Xtract%20--%20Extracting%20Titles%20from%20Scientific%20PDF%20Documents%20by%20Analyzing%20Style%20Information%20%28Font%20Size%29-preprint.pdf).
 17 | > In M. Lalmas, J. Jose, A. Rauber, F. Sebastiani, and I. Frommholz, editors,
 18 | > Research and Advanced Technology for Digital Libraries, Proceedings of the
 19 | > 14th European Conference on Digital Libraries (ECDL-10), volume 6273 of
 20 | > Lecture Notes of Computer Science (LNCS), pages 413-416, Glasgow (UK),
 21 | > September 2010. Springer.
 22 | 
 23 | ![Travis CI Status](https://travis-ci.org/djui/pdftitle.svg)
 24 | 
 25 | ## Background
 26 | 
 27 | The title of a PDF article is sometimes in the filename, but often is not. Next,
 28 | one could check the title in the PDF metadata (e.g. using `pdfinfo`), but this
 29 | is also often unset or set incorrectly. Converting the PDF to text and picking
 30 | the first line often gives false positives or incomplete titles.
 31 | 
 32 | ## Usage
 33 | 
 34 |     $ pdftitle --help
 35 |     usage: pdftitle [-h] [-r] [-m] [-s] [-t TOP_MARGIN] [-n MIN_LENGTH] [-x MAX_LENGTH] [-d] [-v] FILE
 36 | 
 37 |     Tries to identify the title of PDF format paper.
 38 | 
 39 |     positional arguments:
 40 |       FILE                  Path to PDF file
 41 | 
 42 |     optional arguments:
 43 |       -h, --help            show this help message and exit
 44 |       -r, --rename          Rename file with found title
 45 |       -m, --multiline       Concatenate multiple title lines considered (default)
 46 |       -s, --singleline      Only use first title line considered
 47 |       -t TOP_MARGIN, --top-margin TOP_MARGIN
 48 |                             Top margin start to search for title (default: 70)
 49 |       -n MIN_LENGTH, --min-length MIN_LENGTH
 50 |                             Min. considerable title length (default: 15)
 51 |       -x MAX_LENGTH, --max-length MAX_LENGTH
 52 |                             Max. considerable title length (default: 250)
 53 |       -d, --debug           Print error stacktrace for unknown errors
 54 |       -v, --version         show program's version number and exit
 55 | 
 56 | 
 57 | ## Dependencies
 58 | 
 59 |   * Python >=2.5
 60 |   * [Poppler](http://poppler.freedesktop.org/) >=0.20.5 (contains `pdftohtml`)
 61 | 
 62 |       `$ brew install poppler`
 63 | 
 64 |   * [lxml](http://lxml.de/) (optional, for higher accuracy)
 65 | 
 66 |       `$ pip install lxml`
 67 | 
 68 | 
 69 | ## Accuracy
 70 | 
 71 | Version 1.0: A sample set of 261 PDFs in Biology science (which has many
 72 | scanned PDFs) results in 60.08% success rate.
 73 | 
 74 | Version 1.1: A sample set of 261 PDFs in Biology science (which has many
 75 | scanned PDFs) results in 76.25% success rate.
 76 | 
 77 | Version 1.2: No comparison available. (I lost the original sample set)
 78 | 
 79 | Version 1.3: No comparison available. (I lost the original sample set)
 80 | 
 81 | 
 82 | ## Contributing
 83 | 
 84 | ### Testing
 85 | 
 86 |     $ ./test/run.sh -v
 87 | 
 88 | 
 89 | ## Todos
 90 | 
 91 | **Version 2.0**: I will likely switch from Poppler/pdftohtml to PDFBox (or jPod)
 92 | to have no external dependencies. This will likely convert the script into a
 93 | Java CLI application. I was also tinkering with a Go/Rust version (as bindings
 94 | to Poppler, similar to [Go-Poppler](https://github.com/cheggaaa/go-poppler)).
 95 | Let's see.
 96 | 
 97 | 
 98 | ## License
 99 | 
100 | `pdftitle` is licensed under a
101 | [BSD License](https://github.com/djui/pdftitle/blob/master/LICENSE).
102 | 


--------------------------------------------------------------------------------
/test/bashunit.bash:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | ########################################################################
  4 | # USAGE
  5 | ########################################################################
  6 | 
  7 | # Functions starting with 'test' will be automatically evaluated.
  8 | #
  9 | # 1. Write test cases
 10 | #
 11 | #     :
 12 | #     testEcho() {
 13 | #         assertEqual "$(echo foo)" "foo"
 14 | #         assertReturn "$(echo foo)" 0
 15 | #     }
 16 | #     :
 17 | #
 18 | # 2. Include this script at the end of your test script
 19 | #
 20 | #    :
 21 | #    source $(dirname $0)/bashunit.bash
 22 | #    # eof
 23 | #
 24 | # 3. Run test suite
 25 | #
 26 | #    $ ./test_example
 27 | #    testEcho:4:Passed
 28 | #    testEcho:5:Passed
 29 | #    Done. 2 passed. 0 failed. 0 skipped.
 30 | #
 31 | # The return code is equal to the number of failed test cases.
 32 | #
 33 | # Options can be given to the test script:
 34 | #
 35 | # $ bash ./bashunit.bash
 36 | # Usage: <testscript> [options...]
 37 | #
 38 | # Options:
 39 | #   -v, --verbose  Print exptected and provided values
 40 | #   -s, --summary  Only print summary omitting individual test results
 41 | #   -q, --quiet    Do not print anything to standard output
 42 | #   -h, --help     Show usage screen
 43 | 
 44 | ########################################################################
 45 | # DEPENDENCIES
 46 | ########################################################################
 47 | 
 48 | # * Bash (BASH_LINENO)
 49 | # * Shell colours
 50 | 
 51 | ########################################################################
 52 | # API
 53 | ########################################################################
 54 | 
 55 | # * assertEqual($1, $2)
 56 | #     $1: Output
 57 | #     $2: Expected
 58 | #
 59 | #     Assert that the output matches the expected string (an anchored ERE).
 60 | 
 61 | # * assertNotEqual($1, $2)
 62 | #     $1: Output
 63 | #     $2: Expected
 64 | #
 65 | #     Assert that a given output string is not equal to an expected
 66 | #     string.
 67 | 
 68 | # * assertStartsWith($1, $2)
 69 | #     $1: Output
 70 | #     $2: Expected
 71 | #
 72 | #     Assert that a given output string starts with an expected string.
 73 | 
 74 | # * assertReturn($1, $2)
 75 | #     $1: Output
 76 | #     $2: Expected
 77 | #     $?: Provided
 78 | #
 79 | #     Assert that the last command's return code is equal to an expected
 80 | #     integer.
 81 | 
 82 | # * assertNotReturn($1, $2)
 83 | #     $1: Output
 84 | #     $2: Expected
 85 | #     $?: Provided
 86 | #
 87 | #     Assert that the last command's return code is not equal to an
 88 | #     expected integer.
 89 | 
 90 | # * skip()
 91 | #
 92 | #     Skip the current test case.
 93 | 
 94 | ########################################################################
 95 | # GLOBALS
 96 | ########################################################################
 97 | 
 98 | verbose=2
 99 | 
100 | bashunit_passed=0
101 | bashunit_failed=0
102 | bashunit_skipped=0
103 | 
104 | ########################################################################
105 | # ASSERT FUNCTIONS
106 | ########################################################################
107 | 
# Assert that an output string matches an expected string.
#
# $1: Output   - text produced by the command under test
# $2: Expected - extended regular expression, anchored at both ends; regex
#                metacharacters in the expected text must be escaped
#
# The output is split on whitespace and re-joined with single spaces before
# matching, so line breaks and runs of blanks are not significant.
assertEqual() {
    local -a words
    # Split with `read` instead of an unquoted expansion: the word list is
    # the same, but a `*` or `?` in the output is no longer expanded against
    # the files in the working directory.
    read -r -d '' -a words <<< "$1"
    # printf rather than echo, so an output such as "-n" is compared as text
    # instead of being consumed as an echo option.
    if printf '%s\n' "${words[*]}" | grep -qE "^$2$" ; then
        _passed
    else
        _failed "$1" "$2"
    fi
}
114 | 
# Assert that an output string does NOT match an expected string.
#
# $1: Output   - text produced by the command under test
# $2: Expected - extended regular expression, anchored at both ends; regex
#                metacharacters in the expected text must be escaped
#
# The output is split on whitespace and re-joined with single spaces before
# matching, so line breaks and runs of blanks are not significant.
assertNotEqual() {
    local -a words
    # Split with `read` instead of an unquoted expansion so that `*` or `?`
    # in the output is not expanded against the working directory.
    read -r -d '' -a words <<< "$1"
    # printf rather than echo, so an output such as "-n" is compared as text.
    if ! printf '%s\n' "${words[*]}" | grep -qE "^$2$" ; then
        _passed
    else
        _failed "$1" "$2"
    fi
}
121 | 
# Assert that an output string starts with an expected string.
#
# $1: Output   - text produced by the command under test
# $2: Expected - extended regular expression, anchored at the start only;
#                regex metacharacters in the expected text must be escaped
#
# The output is split on whitespace and re-joined with single spaces before
# matching, so line breaks and runs of blanks are not significant.
assertStartsWith() {
    local -a words
    # Split with `read` instead of an unquoted expansion so that `*` or `?`
    # in the output is not expanded against the working directory.
    read -r -d '' -a words <<< "$1"
    # printf rather than echo, so an output such as "-n" is compared as text.
    if printf '%s\n' "${words[*]}" | grep -qE "^$2" ; then
        _passed
    else
        _failed "$1" "$2"
    fi
}
128 | 
# Assert that the exit status left behind by the previous command equals an
# expected integer.
#
# $1: Output   - accepted but not inspected
# $2: Expected - expected exit status
# $?: Provided - exit status in effect when the function is entered
assertReturn() {
    # Has to be the very first statement: any earlier command would replace $?.
    local status=$?
    if ! [ $status -eq $2 ] ; then
        _failed "$status" "$2"
        return
    fi
    _passed
}
136 | 
# Assert that the exit status left behind by the previous command differs
# from an expected integer.
#
# $1: Output   - accepted but not inspected
# $2: Expected - exit status that must NOT have been returned
# $?: Provided - exit status in effect when the function is entered
assertNotReturn() {
    # Has to be the very first statement: any earlier command would replace $?.
    local status=$?
    if ! [ $status -ne $2 ] ; then
        _failed "$status" "$2"
        return
    fi
    _passed
}
144 | 
# Count the current test case as skipped.
#
# Any arguments are ignored, which allows prefixing an assertion with `skip`.
# The shell still expands those arguments first, so command substitutions
# inside them are executed.
skip() { _skipped ; }
148 | 
# Record a failed assertion and report it according to the verbosity.
#
# $1: Provided value
# $2: Expected value
#
# Must be called directly from an assert function that was itself called
# directly from the test case: the reported name is FUNCNAME[2] and the
# reported line is BASH_LINENO[1].
_failed() {
    bashunit_failed=$((bashunit_failed+1))

    local tc=${FUNCNAME[2]}
    local line=${BASH_LINENO[1]}
    if [ $verbose -ge 2 ] ; then
        echo -e "\033[37;1m$tc\033[0m:$line:\033[31mFailed\033[0m"
    fi
    if [ $verbose -eq 3 ] ; then
        # The values are passed as printf arguments so backslash sequences
        # inside them (e.g. the "\(" of an escaped regex, or "\n") are shown
        # literally instead of being interpreted as `echo -e` would.
        printf '\033[31mExpected\033[0m: %s\n' "$2"
        printf '\033[31mProvided\033[0m: %s\n' "$1"
    fi
}
162 | 
# Record a passed assertion and, at verbosity 2 or higher, print a result
# line naming the test case (FUNCNAME[2]) and the line (BASH_LINENO[1]).
_passed() {
    local caller=${FUNCNAME[2]} lineno=${BASH_LINENO[1]}

    bashunit_passed=$((bashunit_passed+1))

    [ $verbose -ge 2 ] || return 0
    echo -e "\033[37;1m$caller\033[0m:$lineno:\033[32mPassed\033[0m"
}
172 | 
# Record a skipped assertion and, at verbosity 2 or higher, print a result
# line naming the test case (FUNCNAME[2]) and the line (BASH_LINENO[1]).
_skipped() {
    local caller=${FUNCNAME[2]} lineno=${BASH_LINENO[1]}

    bashunit_skipped=$((bashunit_skipped+1))

    [ $verbose -ge 2 ] || return 0
    echo -e "\033[37;1m$caller\033[0m:$lineno:\033[33mSkipped\033[0m"
}
182 | 
183 | ########################################################################
184 | # RUN
185 | ########################################################################
186 | 
# Print the usage screen to standard output.
usage() {
    printf '%s\n' \
        "Usage: <testscript> [options...]" \
        "" \
        "Options:" \
        "  -v, --verbose  Print exptected and provided values" \
        "  -s, --summary  Only print summary omitting individual test results" \
        "  -q, --quiet    Do not print anything to standard output" \
        "  -h, --help     Show usage screen"
}
196 | 
# Discover the test cases defined in the running script ($0), run them in the
# order they appear in the file, print the summary and exit.
#
# Exit status: the number of failed assertions, capped at 255. If no test
# case is found, the usage screen is printed and the exit status is 0.
#
# NOTE(review): "$0" is resolved against the current directory at this
# point; a test script that changes directory before sourcing this file is
# only found when "$0" still resolves from the new directory.
runTests() {
    local test_pattern="test[a-zA-Z0-9_]\+"
    # Declared separately so `local` does not mask the pipeline's status.
    local testcases
    # NOTE(review): after shell unescaping the trailing "\\(\\)" reaches grep
    # as the empty BRE group \(\), so a literal "()" is not actually required.
    testcases=$(grep "^ *\(function \)*$test_pattern *\\(\\)" "$0" | \
        grep -o "$test_pattern")

    if [ -z "$testcases" ] ; then
        usage
        exit 0
    fi

    # Word splitting is intended here: one function name per word.
    local tc
    for tc in $testcases ; do $tc ; done

    if [ $verbose -ge 1 ] ; then
        echo "Done. $bashunit_passed passed." \
             "$bashunit_failed failed." \
             "$bashunit_skipped skipped."
    fi

    # Exit statuses are truncated to 8 bits, so e.g. 256 failures would be
    # reported as success; cap the value instead.
    if [ $bashunit_failed -gt 255 ] ; then
        exit 255
    fi
    exit $bashunit_failed
}
216 | 
# Arguments
#
# Set the verbosity from the options given to the test script. Unknown
# arguments are ignored; -h/--help prints the usage screen and exits with 0.
# Each argument is looked at exactly once, so an unknown argument no longer
# causes the argument after it to be dropped.
_parseArgs() {
    local arg
    for arg in "$@" ; do
        case $arg in
            "-v"|"--verbose") verbose=3;;
            "-s"|"--summary") verbose=1;;
            "-q"|"--quiet")   verbose=0;;
            "-h"|"--help")    usage; exit 0;;
            *) ;;
        esac
    done
}

_parseArgs "$@"
228 | 
229 | runTests
230 | 


--------------------------------------------------------------------------------
/test/run.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | BASEDIR="$(dirname $0)/.."
  4 | 
  5 | cd "$BASEDIR"
  6 | 
  7 | ########################################################################
  8 | # Argument tests
  9 | ########################################################################
 10 | 
 11 | testPositiveArgumentsHelp() {
 12 |     assertReturn "$(./pdftitle -h)" 0
 13 |     assertReturn "$(./pdftitle --help)" 0
 14 | }
 15 | 
 16 | testPositiveArgumentsVersion() {
 17 |     assertReturn "$(./pdftitle -v 2> /dev/null)" 0
 18 |     assertReturn "$(./pdftitle --version 2> /dev/null)" 0
 19 |     assertEqual "$(./pdftitle -v 2>&1)" "pdftitle 1.3"
 20 |     assertEqual "$(./pdftitle --version 2>&1)" "pdftitle 1.3"
 21 | }
 22 | 
 23 | ########################################################################
 24 | # Return value tests
 25 | ########################################################################
 26 | 
 27 | testNegativeNotFound() {
 28 |     assertEqual "$(./pdftitle ./test/samples/026.pdf 2>&1)" \
 29 |         "Error: No title found"
 30 |     assertReturn "$(./pdftitle ./test/samples/026.pdf 2> /dev/null)" 1
 31 | }
 32 | 
 33 | testNegativeArgumentsNoFile() {
 34 |     assertReturn "$(./pdftitle 2> /dev/null)" 2
 35 | }
 36 | 
 37 | testNegativeArgumentsUnknown() {
 38 |     assertReturn "$(./pdftitle --foo foobar 2> /dev/null)" 2
 39 | }
 40 | 
 41 | testNegativeArgumentsInvalid() {
 42 |     assertReturn "$(./pdftitle -t -1 foobar 2> /dev/null)" 2
 43 |     assertReturn "$(./pdftitle --top -1 foobar 2> /dev/null)" 2
 44 | }
 45 | 
 46 | testNegativeFileNotFound() {
 47 |     assertReturn "$(./pdftitle foobar 2> /dev/null)" 2
 48 |     # Skip reason: Too verbose
 49 |     skip assertEqual "$(./pdftitle foobar 2>&1)" "pdftitle: error: argument FILE: invalid filepath value: 'foobar'"
 50 | }
 51 | 
 52 | testNegativePDFTOHTMLNotFound() {
 53 |     # Skip reason: Hard to simulate
 54 |     skip assertReturn "$(./pdftitle foobar 2> /dev/null)" 4
 55 | }
 56 | 
 57 | testNegativeCouldNotConvertPDFToXML() {
 58 |     assertReturn "$(./pdftitle ./test/samples/029.pdf 2> /dev/null)" 5
 59 |     assertEqual "$(./pdftitle ./test/samples/029.pdf 2>&1)" \
 60 |         "Error: Could not convert PDF to XML"
 61 | }
 62 | 
 63 | testNegativePasswordProtected() {
 64 |     assertReturn "$(./pdftitle ./test/samples/027.pdf 2> /dev/null)" 5
 65 |     assertEqual "$(./pdftitle ./test/samples/027.pdf 2>&1)" \
 66 |         "Error: Could not convert PDF to XML"
 67 | }
 68 | 
 69 | testNegativeCouldNotParseXML() {
 70 |     # Skip reason: Does not fail under lxml
 71 |     skip assertReturn "$(./pdftitle ./test/samples/025.pdf 2> /dev/null)" 6
 72 |     skip assertEqual "$(./pdftitle ./test/samples/025.pdf 2>&1)" \
 73 |         "Error: Could not parse XML"
 74 | }
 75 | 
 76 | testNegativeUnknownError() {
 77 |     # Skip reason: Hard to simulate
 78 |     skip assertReturn "$(./pdftitle foobar 2> /dev/null)" 8
 79 | }
 80 | 
 81 | ########################################################################
 82 | # Misc tests
 83 | ########################################################################
 84 | 
 85 | testSimpleCorrectUndistracted() {
 86 |     assertEqual "$(./pdftitle ./test/samples/001.pdf)" \
 87 |         "The Ecology of West Nile Virus in South Africa and the Occurrence of Outbreaks in Humans"
 88 | }
 89 | 
 90 | testSimpleMultiline() {
 91 |     assertEqual "$(./pdftitle ./test/samples/009.pdf)" \
 92 |         "The Ecology of West Nile Virus in South Africa and the Occurrence of Outbreaks in Humans"
 93 | }
 94 | 
 95 | testTwoColumns() {
 96 |     assertEqual "$(./pdftitle ./test/samples/002.pdf)" \
 97 |         "Outbreak of West Nile Virus Infection in Greece, 2010"
 98 | }
 99 | 
# Sample 011: expected title below; currently skipped.
testNotHighestFont() {
    local sample="./test/samples/011.pdf"
    local title="Genetic Differences Between Culex pipiens f\. molestus and Culex pipiens pipiens \(Diptera: Culicidae\) in New York"
    # Skip reason: Skips lines
    skip assertEqual "$(./pdftitle "$sample")" "$title"
}
105 | 
# Sample 011: expected title below; currently skipped.
testMixedFormatting() {
    local sample="./test/samples/011.pdf"
    local title="Genetic Differences Between Culex pipiens f\. molestus and Culex pipiens pipiens \(Diptera: Culicidae\) in New York"
    # Skip reason: Skips lines
    skip assertEqual "$(./pdftitle "$sample")" "$title"
}
111 | 
# Sample 019: expected title below; currently skipped.
testSlightDifferentSizeForSameFont() {
    local sample="./test/samples/019.pdf"
    local title="HpaII endonuclease distinguishes between two species in the Anopheles funestus group"
    # Skip reason: Skips lines
    skip assertEqual "$(./pdftitle "$sample")" "$title"
}
117 | 
# Sample 023: expected title below; currently skipped.
testHiddenText() {
    local sample="./test/samples/023.pdf"
    local title="Phylogeny of fourteen Culex mosquito species, including the Culex pipiens complex, inferred from the internal transcribed spacers of ribosomal DNA"
    # Skip reason: Skips lines
    skip assertEqual "$(./pdftitle "$sample")" "$title"
}
123 | 
# Sample 024: the output (stderr discarded) must match the expected title.
testInvalidXML() {
    local sample="./test/samples/024.pdf"
    local title="Spread of the West Nile virus vector Culex modestus and the potential malaria vector Anopheles hyrcanus in central Europe"
    assertEqual "$(./pdftitle "$sample" 2> /dev/null)" "$title"
}
128 | 
# Sample 028: the output must match the expected title.
testCopyProtected() {
    local sample="./test/samples/028.pdf"
    local title="Pogosta Disease: Clinical Observations During An Outbreak In The Province Of North Karelia, Finland"
    assertEqual "$(./pdftitle "$sample")" "$title"
}
133 | 
134 | ########################################################################
135 | # Filter tests
136 | ########################################################################
137 | 
# Sample 003: expected title below; currently skipped.
testFilterImageAbove() {
    local sample="./test/samples/003.pdf"
    local title="A Comparison of Gravid and Under-House CO2-Baited CDC Light Traps for Mosquito Species of Public Health Importance in Houston, Texas"
    skip assertEqual "$(./pdftitle "$sample")" "$title"
}
142 | 
# Sample 005: the output must match the expected title.
testFilterNotLargestFound() {
    local sample="./test/samples/005.pdf"
    local title="Asymmetric introgression between sympatric molestus and pipiens forms of Culex pipiens \(Diptera: Culicidae\) in the Comporta region, Portugal"
    assertEqual "$(./pdftitle "$sample")" "$title"
}
147 | 
# Sample 006: the output must match the expected title.
testFilterTooShort() {
    local sample="./test/samples/006.pdf"
    local title="West Nile Fever in Czechland"
    assertEqual "$(./pdftitle "$sample")" "$title"
}
152 | 
# Sample 020: the output must match the expected title.
testFilterTooLong() {
    local sample="./test/samples/020.pdf"
    local title="A study of mosquito fauna \(Diptera: Culicidae\) and the phenology of the species recorded in Wilanów \(Warsaw, Poland\)"
    assertEqual "$(./pdftitle "$sample")" "$title"
}
157 | 
# Sample 031: the output must match the expected title.
testFilterVertical() {
    local sample="./test/samples/031.pdf"
    local title="Stratified B-trees and versioning dictionaries"
    assertEqual "$(./pdftitle "$sample")" "$title"
}
162 | 
# Sample 016: the output must match the expected title.
testFilterTooFarDistanceWithEqualFont() {
    local sample="./test/samples/016.pdf"
    local title="Mosquito species distribution in mainland Portugal 2005-2008"
    assertEqual "$(./pdftitle "$sample")" "$title"
}
167 | 
# Sample 021: the output must match the expected title.
testFilterOnLowerHalfOfFirstPage() {
    local sample="./test/samples/021.pdf"
    local title="Scandinavian Journal of Rheumatology"
    assertEqual "$(./pdftitle "$sample")" "$title"
}
172 | 
# Sample 022: the output must match the expected title.
testFilterInitialCapital() {
    local sample="./test/samples/022.pdf"
    local title="Syndromic Surveillance in The Netherlands for the Early Detection of West Nile Virus Epidemics"
    assertEqual "$(./pdftitle "$sample")" "$title"
}
177 | 
178 | ########################################################################
179 | # Formatter tests
180 | ########################################################################
181 | 
# Sample 004: the output must match the expected title.
testFormatUpperCase() {
    local sample="./test/samples/004.pdf"
    local title="Influence Of Landscape Structure On Mosquitoes \(Diptera: Culicidae\) And Dytiscids \(Coleoptera: Dytiscidae\) At Five Spatial Scales In Swedish Wetlands"
    assertEqual "$(./pdftitle "$sample")" "$title"
}
186 | 
# Sample 030: the output must match the expected title.
testFormatWeirdCase() {
    local sample="./test/samples/030.pdf"
    local title="Atomic Broadcast: A Fault-Tolerant Token Based Algorithm And Performance Evaluations"
    assertEqual "$(./pdftitle "$sample")" "$title"
}
191 | 
# Sample 033: expected title below; currently skipped.
testFormatSpaceCase() {
    local sample="./test/samples/033.pdf"
    local title="A High-Level Framework for Distributed Processing of Large-Scale Graphs"
    # Skip reason: Word boundaries swallowed
    skip assertEqual "$(./pdftitle "$sample")" "$title"
}
197 | 
# Sample 008: the output must match the expected title.
testFormatUpperCaseMixedFormatting() {
    local sample="./test/samples/008.pdf"
    local title="Evaluation Of Six Mosquito Traps For Collection Of Aedes Albopictus And Associated Mosquito Species In A Suburban Setting In North Central Florida"
    assertEqual "$(./pdftitle "$sample")" "$title"
}
202 | 
# Sample 010: expected title below; currently skipped.
testFormatSubscript() {
    local sample="./test/samples/010.pdf"
    local title="Validation Of CO2 Trap Data In Three European Regions"
    # Skip reason: Chemical expression incorrectly handled
    skip assertEqual "$(./pdftitle "$sample")" "$title"
}
208 | 
# Sample 012: the output must match the expected title.
testFormatLinebreakDash() {
    local sample="./test/samples/012.pdf"
    local title="The prevalence of antibodies against Sindbis-related \(Pogosta\) virus in different parts of Finland"
    assertEqual "$(./pdftitle "$sample")" "$title"
}
213 | 
testFormatPeriod() {
    # Compares pdftitle's output for sample 031 with the expected title
    # (which carries no trailing period).
    local sample=./test/samples/031.pdf
    local expected="Stratified B-trees and versioning dictionaries"

    assertEqual "$(./pdftitle "$sample")" "$expected"
}
218 | 
testFormatAsterik() {
    # Compares pdftitle's output for sample 032 with the expected title
    # (which carries no trailing asterisk).
    local sample=./test/samples/032.pdf
    local expected="Ecophysiological and morphological variations in mosquitoes of the Culex pipiens complex \(Diptera: Culicidae\)"

    assertEqual "$(./pdftitle "$sample")" "$expected"
}
223 | 
testFormatLigatures() {
    # Skip reason: pdftohtml produces gibberish
    local sample=./test/samples/017.pdf
    local expected="Mitochondrial DNA cytochrome oxidase I gene: potential for distinction between immature stages of some forensically important fly species \(Diptera\) in western Australia"

    skip assertEqual "$(./pdftitle "$sample")" "$expected"
}
229 | 
testFormatQuotes() {
    # Skip reason: Quotes are not correctly detected
    local sample=./test/samples/013.pdf
    local expected="“Coi”-like Sequences Are Becoming Problematic In Molecular Systematic And Dna Barcoding Studies"

    skip assertEqual "$(./pdftitle "$sample")" "$expected"
}
235 | 
testFormatMultipleSpaces() {
    # Skip reason: No good pdf found to test the feature
    # 0XX.pdf is a placeholder path; stderr of pdftitle is discarded.
    local sample=./test/samples/0XX.pdf
    local expected=""

    skip assertEqual "$(./pdftitle "$sample" 2> /dev/null)" "$expected"
}
240 | 
testFormatFormattingSwitchOnNewLine() {
    # Skip reason: Skips a word
    local sample=./test/samples/014.pdf
    local expected="Rapid Assay To Identify The Two Genetic Forms Of Culex \(Culex\) Pipiens L\. \(Diptera: Culicidae\) And Hybrid Populations"

    skip assertEqual "$(./pdftitle "$sample")" "$expected"
}
246 | 
testFormatTwoCharacterEncoding() {
    # Skipped; the original gives no skip reason for this case.
    local sample=./test/samples/018.pdf
    local expected="Serological Examination of Songbirds \(Passeriformes\) for Mosquito-Borne Viruses Sindbis, Ťahynǎ, and Batai in a South Moravian Wetland \(Czech Republic\)"

    skip assertEqual "$(./pdftitle "$sample")" "$expected"
}
251 | 
252 | ########################################################################
253 | 
254 | source "./test/bashunit.bash"
255 | 


--------------------------------------------------------------------------------
/pdftitle:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | from __future__ import print_function
  4 | 
  5 | import argparse
  6 | import copy
  7 | import os
  8 | import re
  9 | import subprocess
 10 | import sys
 11 | import traceback
 12 | from cStringIO import StringIO
 13 | try:
 14 |   from lxml import etree
 15 |   parse_xml = lambda s: etree.parse(s, etree.XMLParser(recover=True))
 16 | except ImportError:
 17 |   import xml.etree.ElementTree as etree
 18 |   parse_xml = lambda s: etree.parse(s)
 19 | 
 20 | 
 21 | VERSION = '1.3'
 22 | 
 23 | 
 24 | def convert_pdf_to_xml(path):
 25 |   """Return XML string of converted PDF file."""
 26 |   cmd = ['pdftohtml', '-xml', '-f', '1', '-l', '1', '-i', '-q', '-nodrm', '-hidden', '-stdout', path]
 27 |   xml_string = subprocess.check_output(cmd, stderr=open(os.devnull, 'w'))
 28 |   return parse_xml(StringIO(remove_control_chars(xml_string)))
 29 | 
 30 | 
 31 | def remove_control_chars(string):
 32 |   """Filter ASCII control characters as etree treats them as invalid."""
 33 |   return ''.join([i for i in string if ord(i) in [9, 10, 13] or ord(i) >= 32])
 34 | 
 35 | 
 36 | def font_specs(xml_data):
 37 |   """Return all font specifications in XML."""
 38 |   xml_font_specs = xml_data.findall('page[@number="1"]/fontspec[@id][@size]')
 39 |   return [fs.attrib for fs in xml_font_specs]
 40 | 
 41 | 
 42 | def sorted_font_ids(font_specs):
 43 |   """Return sorted font specifications by size decending."""
 44 |   font_specs = sorted(font_specs, key=lambda x: int(x['size']), reverse=True)
 45 |   return [fs['id'] for fs in font_specs]
 46 | 
 47 | 
 48 | def textblocks_by_id(xml_data, font_id):
 49 |   """Return text blocks given font id."""
 50 |   text_elements = xml_data.findall('page[@number="1"]/text[@font="%s"]' % font_id)
 51 |   first_page_top = int(xml_data.findall('page[@number="1"]')[0].get('top'))
 52 |   first_page_height = int(xml_data.findall('page[@number="1"]')[0].get('height'))
 53 |   return top_and_texts(text_elements, first_page_top, first_page_height)
 54 | 
 55 | 
 56 | def top_and_texts(text_elements, page_top, page_height):
 57 |   """Return top position of first non-empty text line and all
 58 |   unformatted non-empty text lines, and some extra (page) metadata.
 59 |   Example: {
 60 |     'pageTop': 0,
 61 |     'pageHeight': 1263,
 62 |     'blockTop': 16,
 63 |     'blockText': [
 64 |       {'top': 16, 'height': 24, 'text': 'foo'},
 65 |       {'top': 30, 'height': 24, 'text': 'bar'},
 66 |       {'top': 44, 'height': 24, 'text': 'baz'}
 67 |     ]
 68 |   }"""
 69 |   text_lines = []
 70 |   top = page_top
 71 | 
 72 |   for text_element in text_elements:
 73 |     text_line = unformat_and_strip(text_element)
 74 |     if not text_line:
 75 |       continue
 76 |     t = int(text_element.get('top'))
 77 |     h = int(text_element.get('height'))
 78 |     w = int(text_element.get('width'))
 79 |     # TODO: Maybe allow a light error here
 80 |     # if T < Top - Error:
 81 |     # TODO: This is actually a filter
 82 |     if t < top:
 83 |       # Ignore text lines positioned upwards. Only look downwards.
 84 |       continue
 85 |     top = t
 86 |     text_lines.append({
 87 |       'top': t,
 88 |       'height': h,
 89 |       'width': w,
 90 |       'text': text_line
 91 |     })
 92 | 
 93 |   if text_lines and top > page_top:
 94 |     return {
 95 |       'pageTop': page_top,
 96 |       'pageHeight': page_height,
 97 |       'blockTop': min(text_lines, key=lambda x: x['top'])['top'],
 98 |       'blockText': text_lines
 99 |     }
100 |   else:
101 |     return {}
102 | 
103 | 
def filter_empties(text_blocks, _config):
  """Drop empty text blocks and blocks without any text lines."""
  kept = []
  for block in text_blocks:
    if block and block['blockText']:
      kept.append(block)
  return kept
107 | 
108 | 
def unformat_and_strip(text_element):
  """Return the element's text without markup and surrounding whitespace.

  The text of the element and of all nested elements is concatenated in
  document order.
  """
  fragments = list(text_element.itertext())
  plain_text = ''.join(fragments)
  return plain_text.strip()
112 | 
113 | 
def filter_bottom_half(text_blocks, _config):
  """Drop text blocks that start in the lower half of the page.

  A block is kept when the distance between its top and the page top is
  smaller than half of the page height.
  """
  kept = []
  for block in text_blocks:
    offset_from_page_top = block['blockTop'] - block['pageTop']
    if offset_from_page_top < block['pageHeight'] / 2:
      kept.append(block)
  return kept
118 | 
119 | 
def filter_margin(text_blocks, config):
  """Drop text blocks that do not start below `config.top_margin`."""
  kept = []
  for block in text_blocks:
    if block['blockTop'] > config.top_margin:
      kept.append(block)
  return kept
123 | 
124 | 
def filter_vertical(text_blocks, _config):
  """Drop vertical text, i.e. text lines whose width is not positive.

  Returns shallow copies of the blocks that still contain at least one line;
  the input blocks are left untouched.
  """
  result = []
  for block in text_blocks:
    horizontal_lines = [line for line in block['blockText']
                        if line['width'] > 0]
    if horizontal_lines:
      trimmed = copy.copy(block)
      trimmed['blockText'] = horizontal_lines
      result.append(trimmed)
  return result
137 | 
138 | 
def filter_shorts(text_blocks, config):
  """Drop text blocks too short to be a likely title.

  A block's length is that of its lines joined by single spaces; blocks
  shorter than `config.min_length` are removed.
  """
  kept = []
  for block in text_blocks:
    joined = ' '.join(line['text'] for line in block['blockText'])
    if len(joined) >= config.min_length:
      kept.append(block)
  return kept
143 | 
144 | 
def filter_longs(text_blocks, config):
  """Drop text blocks too long to be a likely title.

  A block's length is that of its lines joined by single spaces; blocks
  longer than `config.max_length` are removed.
  """
  kept = []
  for block in text_blocks:
    joined = ' '.join(line['text'] for line in block['blockText'])
    if len(joined) <= config.max_length:
      kept.append(block)
  return kept
149 | 
150 | 
def filter_unrelated_lines(text_blocks, _config):
  """Drop text lines that are too far below the previously accepted line.

  Starting at the block's top, a line is accepted when its top lies less than
  half its own height below the bottom of the last accepted line. Rejected
  lines do not end the scan; later lines are still compared against the last
  accepted one. Blocks are shallow-copied, and blocks left without lines are
  dropped.
  """
  result = []
  for block in text_blocks:
    expected_top = block['blockTop']
    related_lines = []
    for line in block['blockText']:
      if line['top'] >= expected_top + line['height'] / 2:
        continue
      related_lines.append(line)
      expected_top = line['top'] + line['height']
    if related_lines:
      trimmed = copy.copy(block)
      trimmed['blockText'] = related_lines
      result.append(trimmed)
  return result
166 | 
167 | 
def choose_title(text_blocks, config):
  """Return the title taken from the first text block, encoded as UTF-8.

  With `config.multiline` all lines of that block are joined by single
  spaces; otherwise only its first line is used. Returns None when there is
  no text block.
  """
  ## Have to encode output when piping script. See: http://goo.gl/h0ql0
  first_block = next(iter(text_blocks), None)
  if first_block is None:
    return None

  lines = first_block['blockText']
  if config.multiline:
    title = ' '.join([line['text'] for line in lines])
  else:
    title = lines[0]['text']
  return title.encode('utf-8')
178 | 
179 | 
def format_upper_case(title, _config):
  """Return the title in titlecase if it is mostly uppercase, else as is."""
  if is_mostly_upper_case(title):
    return title.title()
  return title
183 | 
184 | 
def is_mostly_upper_case(string, threshold=0.67):
  """Return True if at least `threshold` of the characters count as uppercase.

  Whitespace characters are counted together with uppercase letters. The
  result is always a bool; an empty string yields False instead of raising
  ZeroDivisionError.
  """
  if not string:
    return False
  n = sum(1 for c in string if c.isupper() or c.isspace())
  return float(n) / len(string) >= threshold
195 | 
196 | 
def format_weird_case(title, _config):
  """Return the title in titlecase if its letter casing is "weird" (see
  is_weird_case), else as is."""
  if is_weird_case(title):
    return title.title()
  return title
200 | 
201 | 
def is_weird_case(string):
  """Return True if the string switches case in the middle of a word.

  That is the case when a letter is directly followed by an uppercase and
  then a lowercase character, or by a lowercase and then an uppercase one.
  Example: is_weird_case('A FAult-tolerAnt token BAsed Algorithm') == True"""
  for first, second, third in zip(string, string[1:], string[2:]):
    if not first.isalpha():
      continue
    if second.isupper() and third.islower():
      return True
    if second.islower() and third.isupper():
      return True
  return False
211 | 
212 | 
def format_space_case(title, _config):
  """Return the title with the gaps between letters removed, if it has many
  such gaps; otherwise return it unchanged."""
  return unspace(title) if is_space_case(title) else title
219 | 
220 | 
def is_space_case(string, threshold=0.2):
  """Return True if at least `threshold` of the characters are whitespace.

  The result is always a bool; an empty string yields False instead of
  raising ZeroDivisionError.
  Example: is_space_case('A H i gh - L e ve l F r am e w or k f or') == True"""
  if not string:
    return False
  n = sum(1 for c in string if c.isspace())
  return float(n) / len(string) >= threshold
232 | 
233 | 
def unspace(string):
  """Return the string with all whitespace removed and a single space
  re-inserted before each capital letter that does not follow a dash.
  Example: unspace('A H i gh - L e ve l F r am e') == 'A High-Level Frame'"""
  compact = ''.join(string.split())
  capital_after_non_dash = re.compile(r'([^-])([A-Z])')
  return capital_after_non_dash.sub(r'\1 \2', compact)
239 | 
240 | 
def format_multi_spaces(title, _config):
  """Return the title with whitespace runs collapsed to single spaces and
  without a space in front of colons."""
  # TODO: These are actually two formatters in one
  words = title.split()
  single_spaced = ' '.join(words)
  return single_spaced.replace(' :', ':')
245 | 
246 | 
def format_linebreak_dash(title, _config):
  """Return the title with the first "dash followed by a space" after a
  non-space character joined back into a plain dash."""
  dash_then_space = re.compile(r'(\S)- (.+)')
  return dash_then_space.sub(r'\1-\2', title)
250 | 
251 | 
def format_trailing_period(title, _config):
  """Return the title without its final period, if it ends in one."""
  ends_with_period = re.compile(r'^(.*)\.$')
  return ends_with_period.sub(r'\1', title)
255 | 
256 | 
def format_trailing_asterik(title, _config):
  """Return the title without its final asterisk, if it ends in one."""
  ends_with_asterisk = re.compile(r'^(.*)\*$')
  return ends_with_asterisk.sub(r'\1', title)
260 | 
261 | 
def format_quotes(title, _config):
  """Return the title with doubled quote characters replaced by the
  corresponding single double-quote character."""
  # Applied in this order, one pair after the other.
  replacements = (
    ('‘‘', '“'),
    ('’’', '”'),
    ('``', '‟'),
    (',,', '„'),
  )
  for doubled, single in replacements:
    title = title.replace(doubled, single)
  return title
268 | 
269 | 
270 | # TODO: Generalize functionality to convert Unicode NFD->NFC.
def format_ligatures(title, _config):
  """Return the title with the "fi" and "fl" ligature characters spelled out
  as two plain letters each."""
  # For a reference of the list see: http://typophile.com/files/PMEJLigR_6061.GIF
  # and https://github.com/Docear/PDF-Inspector/blob/master/src/org/docear/pdf/util/ReplaceLigaturesFilter.java
  expansions = (
    ('ﬁ', 'fi'),
    ('ﬂ', 'fl'),
  )
  for ligature, letters in expansions:
    title = title.replace(ligature, letters)
  return title
277 | 
278 | 
def transduce(funs, value, config):
  """Feed `value` through `funs` in order and return the final result.

  Each function is called as `fun(value, config)` and its result becomes the
  next value. Processing stops early as soon as the value is empty (falsy),
  and that value is returned as is.
  """
  for fun in funs:
    if not value:
      break
    value = fun(value, config)
  return value
285 | 
286 | 
def extract_title(path, config):
  """Return the title of the PDF at `path` after applying filters and
  formatters.

  The first page is converted to XML, one candidate text block is built per
  font (largest font size first), and the candidates are run through the
  filter stage (which ends by choosing a title) and then the formatter stage.
  """
  # No groupers are defined at present.
  groupers = []
  filter_stage = [
    filter_empties,
    filter_bottom_half,
    filter_margin,
    filter_vertical,
    filter_shorts,
    filter_longs,
    filter_unrelated_lines,
    choose_title,
  ]
  format_stage = [
    format_ligatures,
    format_upper_case,
    format_weird_case,
    format_space_case,
    format_multi_spaces,
    format_linebreak_dash,
    format_trailing_period,
    format_trailing_asterik,
    format_quotes,
  ]
  pipeline = groupers + filter_stage + format_stage

  xml_data = convert_pdf_to_xml(path)
  candidates = []
  for font_id in sorted_font_ids(font_specs(xml_data)):
    candidates.append(textblocks_by_id(xml_data, font_id))
  return transduce(pipeline, candidates, config)
316 | 
317 | 
def sanitize_filename(filename):
  """Return `filename` with ':' replaced by ' -' and '/' replaced by '-'."""
  without_colons = filename.replace(':', ' -')
  return without_colons.replace('/', '-')
320 | 
321 | 
def main(args):
  """Extract the title of `args.file`, print it and optionally rename the
  file after it.

  Returns a tuple meant for `parser.exit(*result)`: `(0,)` on success,
  otherwise `(status, message)` with status
    1 - no title found
    4 - pdftohtml could not be started
    5 - pdftohtml exited with an error
    6 - the generated XML could not be parsed
    7 - the file could not be renamed (only with `args.rename`)
    8 - any other error (traceback is printed when `args.debug` is set)
  """
  try:
    title = extract_title(args.file, args)

    if not title:
      return 1, 'Error: No title found'

    print(title)

    if args.rename:
      filename = '{}.pdf'.format(sanitize_filename(title))
      new_path = os.path.join(os.path.dirname(args.file), filename)
      try:
        os.rename(args.file, new_path)
      except OSError as e:
        # A failed rename raises OSError as well; report it as such instead
        # of letting it reach the "pdftohtml not found" handler below.
        return 7, 'Error: Could not rename file: ' + str(e)

    return 0,
  except OSError:
    return 4, 'Error: pdftohtml not found'
  except subprocess.CalledProcessError:
    return 5, 'Error: Could not convert PDF to XML'
  except etree.ParseError:
    return 6, 'Error: Could not parse XML'
  except Exception as e:
    if args.debug:
      traceback.print_exc()
    return 8, 'Error: Unknown error: ' + type(e).__name__
349 | 
350 | 
def pos_int(v):
  """argparse type callable: return `v` as an integer greater than zero.

  Raises `ValueError` (from `int`) when `v` is not an integer literal and
  `argparse.ArgumentTypeError` when the integer is zero or negative.
  """
  i = int(v)
  if i > 0:
    return i
  # The format string needs a conversion specifier; without one the `%`
  # operator itself fails with TypeError and the intended error is never built.
  raise argparse.ArgumentTypeError("invalid pos_int value: %r" % (v,))
356 | 
357 | 
def filepath(v):
  """argparse type callable: return `v` as a path to an existing file.

  Surrounding whitespace is stripped and a leading `~` is expanded. The path
  is accepted when it is a regular file or a symbolic link; otherwise
  `argparse.ArgumentTypeError` is raised.
  """
  f = os.path.expanduser(v.strip())
  if not os.path.isfile(f) and not os.path.islink(f):
    # The format string needs a conversion specifier; without one the `%`
    # operator itself fails with TypeError and the intended error is never
    # built.
    raise argparse.ArgumentTypeError("file not found: %s" % (v,))
  return f
363 | 
364 | 
if __name__ == '__main__':
  # Command line entry point: parse the options, run main() and exit with the
  # status (and optional message) it returns. Ctrl-C exits with status 1.
  try:
    parser = argparse.ArgumentParser(
      description='Tries to identify the title of PDF format paper.')
    add_option = parser.add_argument
    add_option('file', metavar='FILE', type=filepath,
               help='Path to PDF file')
    add_option('-r', '--rename', action='store_true',
               help='Rename file with found title')
    add_option('-m', '--multiline', action='store_true', default=True,
               help='Concatenate multiple title lines considered (default)')
    add_option('-s', '--singleline', action='store_false', dest='multiline',
               help='Only use first title line considered')
    add_option('-t', '--top-margin', type=pos_int, default=70,
               help='Top margin start to search for title (default: 70)')
    add_option('-n', '--min-length', type=pos_int, default=15,
               help='Min. considerable title length (default: 15)')
    add_option('-x', '--max-length', type=pos_int, default=250,
               help='Max. considerable title length (default: 250)')
    add_option('-d', '--debug', action='store_true', default=False,
               help='Print error stacktrace for unknown errors')
    add_option('-v', '--version', action='version',
               version='%(prog)s ' + VERSION)
    args = parser.parse_args()
    exit_arguments = main(args)
    parser.exit(*exit_arguments)
  except KeyboardInterrupt:
    sys.exit(1)
381 | 


--------------------------------------------------------------------------------