├── test ├── samples │ ├── 029.pdf │ ├── 001.pdf │ ├── 002.pdf │ ├── 003.pdf │ ├── 004.pdf │ ├── 005.pdf │ ├── 006.pdf │ ├── 007.pdf │ ├── 008.pdf │ ├── 009.pdf │ ├── 010.pdf │ ├── 011.pdf │ ├── 012.pdf │ ├── 013.pdf │ ├── 014.pdf │ ├── 016.pdf │ ├── 017.pdf │ ├── 018.pdf │ ├── 019.pdf │ ├── 020.pdf │ ├── 021.pdf │ ├── 022.pdf │ ├── 023.pdf │ ├── 024.pdf │ ├── 025.pdf │ ├── 026.pdf │ ├── 027.pdf │ ├── 028.pdf │ ├── 030.pdf │ ├── 031.pdf │ ├── 032.pdf │ └── 033.pdf ├── bashunit.bash └── run.sh ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md └── pdftitle /test/samples/029.pdf: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | /_ 3 | -------------------------------------------------------------------------------- /test/samples/001.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/001.pdf -------------------------------------------------------------------------------- /test/samples/002.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/002.pdf -------------------------------------------------------------------------------- /test/samples/003.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/003.pdf -------------------------------------------------------------------------------- /test/samples/004.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/004.pdf -------------------------------------------------------------------------------- 
/test/samples/005.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/005.pdf -------------------------------------------------------------------------------- /test/samples/006.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/006.pdf -------------------------------------------------------------------------------- /test/samples/007.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/007.pdf -------------------------------------------------------------------------------- /test/samples/008.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/008.pdf -------------------------------------------------------------------------------- /test/samples/009.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/009.pdf -------------------------------------------------------------------------------- /test/samples/010.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/010.pdf -------------------------------------------------------------------------------- /test/samples/011.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/011.pdf -------------------------------------------------------------------------------- /test/samples/012.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/012.pdf 
-------------------------------------------------------------------------------- /test/samples/013.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/013.pdf -------------------------------------------------------------------------------- /test/samples/014.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/014.pdf -------------------------------------------------------------------------------- /test/samples/016.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/016.pdf -------------------------------------------------------------------------------- /test/samples/017.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/017.pdf -------------------------------------------------------------------------------- /test/samples/018.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/018.pdf -------------------------------------------------------------------------------- /test/samples/019.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/019.pdf -------------------------------------------------------------------------------- /test/samples/020.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/020.pdf -------------------------------------------------------------------------------- /test/samples/021.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/021.pdf -------------------------------------------------------------------------------- /test/samples/022.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/022.pdf -------------------------------------------------------------------------------- /test/samples/023.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/023.pdf -------------------------------------------------------------------------------- /test/samples/024.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/024.pdf -------------------------------------------------------------------------------- /test/samples/025.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/025.pdf -------------------------------------------------------------------------------- /test/samples/026.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/026.pdf -------------------------------------------------------------------------------- /test/samples/027.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/027.pdf -------------------------------------------------------------------------------- /test/samples/028.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/028.pdf -------------------------------------------------------------------------------- /test/samples/030.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/030.pdf -------------------------------------------------------------------------------- /test/samples/031.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/031.pdf -------------------------------------------------------------------------------- /test/samples/032.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/032.pdf -------------------------------------------------------------------------------- /test/samples/033.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/djui/pdftitle/HEAD/test/samples/033.pdf -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: bash 2 | 3 | sudo: false 4 | 5 | addons: 6 | apt: 7 | packages: 8 | - poppler-utils 9 | 10 | script: test/run.sh -v 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Uwe Dauernheim 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 
12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pdftitle 2 | 3 | The commandline tool `pdftitle` is a Python implementation of the 4 | *SciPlore Xtract*[1] paper, using mostly a structural layout analysis. 5 | 6 | By now, Docear has published the open-source tool 7 | [PDF Inspector](https://github.com/Docear/PDF-Inspector) which does roughly the 8 | same as this script. The differences are: 9 | 10 | - Written in Java 11 | - Uses ~~PDFBox~~ *jPod* instead of *pdftohtml* 12 | - Simpler heuristics 13 | 14 | > [1] *Joeran Beel, Bela Gipp, Ammar Shaker, and Nick Friedrich*. 
15 | > [SciPlore Xtract: Extracting Titles from Scientific PDF documents by Analyzing 16 | > Style Information (Font Size)](http://docear.org/papers/SciPlore%20Xtract%20--%20Extracting%20Titles%20from%20Scientific%20PDF%20Documents%20by%20Analyzing%20Style%20Information%20%28Font%20Size%29-preprint.pdf). 17 | > In M. Lalmas, J. Jose, A. Rauber, F. Sebastiani, and I. Frommholz, editors, 18 | > Research and Advanced Technology for Digital Libraries, Proceedings of the 19 | > 14th European Conference on Digital Libraries (ECDL-10), volume 6273 of 20 | > Lecture Notes in Computer Science (LNCS), pages 413-416, Glasgow (UK), 21 | > September 2010. Springer. 22 | 23 | ![Travis CI Status](https://travis-ci.org/djui/pdftitle.svg) 24 | 25 | ## Background 26 | 27 | The title of a PDF article usually is in the filename but often is not. Next up 28 | would be to check the title of the PDF metadata (using e.g. `pdfinfo`) but this 29 | is also often not set or set incorrectly. Converting the PDF to text and picking 30 | the first line often gives false positives or incomplete titles. 31 | 32 | ## Usage 33 | 34 | $ pdftitle --help 35 | usage: pdftitle [-h] [-r] [-m] [-s] [-t TOP_MARGIN] [-n MIN_LENGTH] [-x MAX_LENGTH] [-d] [-v] FILE 36 | 37 | Tries to identify the title of PDF format paper. 38 | 39 | positional arguments: 40 | FILE Path to PDF file 41 | 42 | optional arguments: 43 | -h, --help show this help message and exit 44 | -r, --rename Rename file with found title 45 | -m, --multiline Concatenate multiple title lines considered (default) 46 | -s, --singleline Only use first title line considered 47 | -t TOP_MARGIN, --top-margin TOP_MARGIN 48 | Top margin start to search for title (default: 70) 49 | -n MIN_LENGTH, --min-length MIN_LENGTH 50 | Min. considerable title length (default: 15) 51 | -x MAX_LENGTH, --max-length MAX_LENGTH 52 | Max. 
considerable title length (default: 250) 53 | -d, --debug Print error stacktrace for unknown errors 54 | -v, --version show program's version number and exit 55 | 56 | 57 | ## Dependencies 58 | 59 | * Python >=2.5 60 | * [Poppler](http://poppler.freedesktop.org/) >=0.20.5 (contains `pdftohtml`) 61 | 62 | `$ brew install poppler` 63 | 64 | * [lxml](http://lxml.de/) (optional, for higher accuracy) 65 | 66 | `$ pip install lxml` 67 | 68 | 69 | ## Accuracy 70 | 71 | Version 1.0: A sample set of 261 PDFs in Biology science (which has many 72 | scanned PDFs) results in 60.08% success rate. 73 | 74 | Version 1.1: A sample set of 261 PDFs in Biology science (which has many 75 | scanned PDFs) results in 76.25% success rate. 76 | 77 | Version 1.2: No comparison available. (I lost the original sample set) 78 | 79 | Version 1.3: No comparison available. (I lost the original sample set) 80 | 81 | 82 | ## Contributing 83 | 84 | ### Testing 85 | 86 | $ ./test/run.sh -v 87 | 88 | 89 | ## Todos 90 | 91 | **Version 2.0**: I will likely switch from Poppler/pdftohtml to PDFBox (or JPod) 92 | to have no external dependencies. This will likely convert the script into a 93 | Java CLI application. I was tinkering with a Go/Rust version (as bindings to 94 | Poppler similar to [Go-Poppler](https://github.com/cheggaaa/go-poppler)) Let's 95 | see. 96 | 97 | 98 | ## License 99 | 100 | `pdftitle` is licenced under a 101 | [BSD License](https://github.com/djui/pdftitle/blob/master/LICENSE). 102 | -------------------------------------------------------------------------------- /test/bashunit.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################################################################## 4 | # USAGE 5 | ######################################################################## 6 | 7 | # Functions starting with 'test' will be automatically evaluated. 8 | # 9 | # 1. 
Write test cases 10 | # 11 | # : 12 | # testEcho() { 13 | # assertEqual "$(echo foo)" "foo" 14 | # assertReturn "$(echo foo)" 0 15 | # } 16 | # : 17 | # 18 | # 2. Include this script at the end of your test script 19 | # 20 | # : 21 | # source $(dirname $0)/bashunit.bash 22 | # # eof 23 | # 24 | # 3. Run test suite 25 | # 26 | # $ ./test_example 27 | # testEcho:4:Passed 28 | # testEcho:5:Passed 29 | # Done. 2 passed. 0 failed. 0 skipped. 30 | # 31 | # The return code is equal to the amount of failed testcases. 32 | # 33 | # Options can be given to the test script: 34 | # 35 | # $ bash ./bashunit.bash 36 | # Usage: [options...] 37 | # 38 | # Options: 39 | # -v, --verbose Print exptected and provided values 40 | # -s, --summary Only print summary omitting individual test results 41 | # -q, --quiet Do not print anything to standard output 42 | # -h, --help Show usage screen 43 | 44 | ######################################################################## 45 | # DEPENDENCIES 46 | ######################################################################## 47 | 48 | # * Bash (BASH_LINENO) 49 | # * Shell colours 50 | 51 | ######################################################################## 52 | # API 53 | ######################################################################## 54 | 55 | # * assertEqual($1, $2) 56 | # $1: Output 57 | # $2: Expected 58 | # 59 | # Assert that a given output string is equal to an expected string. 60 | 61 | # * assertNotEqual($1, $2) 62 | # $1: Output 63 | # $2: Expected 64 | # 65 | # Assert that a given output string is not equal to an expected 66 | # string. 67 | 68 | # * assertStartsWith($1, $2) 69 | # $1: Output 70 | # $2: Expected 71 | # 72 | # Assert that a given output string starts with an expected string. 73 | 74 | # * assertReturn($1, $2) 75 | # $1: Output 76 | # $2: Expected 77 | # $?: Provided 78 | # 79 | # Assert that the last command's return code is equal to an expected 80 | # integer. 
81 | 82 | # * assertNotReturn($1, $2) 83 | # $1: Output 84 | # $2: Expected 85 | # $?: Provided 86 | # 87 | # Assert that the last command's return code is not equal to an 88 | # expected integer. 89 | 90 | # * skip() 91 | # 92 | # Skip the current test case. 93 | 94 | ######################################################################## 95 | # GLOBALS 96 | ######################################################################## 97 | 98 | verbose=2 99 | 100 | bashunit_passed=0 101 | bashunit_failed=0 102 | bashunit_skipped=0 103 | 104 | ######################################################################## 105 | # ASSERT FUNCTIONS 106 | ######################################################################## 107 | 108 | # $1: Output 109 | # $2: Expected 110 | assertEqual() { 111 | echo $1 | grep -E "^$2$" > /dev/null 112 | if [ $? -eq 0 ] ; then _passed ; else _failed "$1" "$2" ; fi 113 | } 114 | 115 | # $1: Output 116 | # $2: Expected 117 | assertNotEqual() { 118 | echo $1 | grep -E "^$2$" > /dev/null 119 | if [ $? -ne 0 ] ; then _passed ; else _failed "$1" "$2" ; fi 120 | } 121 | 122 | # $1: Output 123 | # $2: Expected 124 | assertStartsWith() { 125 | echo $1 | grep -E "^$2" > /dev/null 126 | if [ $? -eq 0 ] ; then _passed ; else _failed "$1" "$2" ; fi 127 | } 128 | 129 | # $1: Output 130 | # $2: Expected 131 | # $?: Provided 132 | assertReturn() { 133 | local code=$? 134 | if [ $code -eq $2 ] ; then _passed ; else _failed "$code" "$2" ; fi 135 | } 136 | 137 | # $1: Output 138 | # $2: Expected 139 | # $?: Provided 140 | assertNotReturn() { 141 | local code=$? 
142 | if [ $code -ne $2 ] ; then _passed ; else _failed "$code" "$2" ; fi 143 | } 144 | 145 | skip() { 146 | _skipped 147 | } 148 | 149 | _failed() { 150 | bashunit_failed=$((bashunit_failed+1)) 151 | 152 | local tc=${FUNCNAME[2]} 153 | local line=${BASH_LINENO[1]} 154 | if [ $verbose -ge 2 ] ; then 155 | echo -e "\033[37;1m$tc\033[0m:$line:\033[31mFailed\033[0m" 156 | fi 157 | if [ $verbose -eq 3 ] ; then 158 | echo -e "\033[31mExpected\033[0m: $2" 159 | echo -e "\033[31mProvided\033[0m: $1" 160 | fi 161 | } 162 | 163 | _passed() { 164 | bashunit_passed=$((bashunit_passed+1)) 165 | 166 | local tc=${FUNCNAME[2]} 167 | local line=${BASH_LINENO[1]} 168 | if [ $verbose -ge 2 ] ; then 169 | echo -e "\033[37;1m$tc\033[0m:$line:\033[32mPassed\033[0m" 170 | fi 171 | } 172 | 173 | _skipped() { 174 | bashunit_skipped=$((bashunit_skipped+1)) 175 | 176 | local tc=${FUNCNAME[2]} 177 | local line=${BASH_LINENO[1]} 178 | if [ $verbose -ge 2 ] ; then 179 | echo -e "\033[37;1m$tc\033[0m:$line:\033[33mSkipped\033[0m" 180 | fi 181 | } 182 | 183 | ######################################################################## 184 | # RUN 185 | ######################################################################## 186 | 187 | usage() { 188 | echo "Usage: [options...]" 189 | echo 190 | echo "Options:" 191 | echo " -v, --verbose Print exptected and provided values" 192 | echo " -s, --summary Only print summary omitting individual test results" 193 | echo " -q, --quiet Do not print anything to standard output" 194 | echo " -h, --help Show usage screen" 195 | } 196 | 197 | runTests() { 198 | local test_pattern="test[a-zA-Z0-9_]\+" 199 | local testcases=$(grep "^ *\(function \)*$test_pattern *\\(\\)" $0 | \ 200 | grep -o $test_pattern) 201 | 202 | if [ ! "${testcases[*]}" ] ; then 203 | usage 204 | exit 0 205 | fi 206 | 207 | for tc in $testcases ; do $tc ; done 208 | 209 | if [ $verbose -ge 1 ] ; then 210 | echo "Done. $bashunit_passed passed." \ 211 | "$bashunit_failed failed." 
\ 212 | "$bashunit_skipped skipped." 213 | fi 214 | exit $bashunit_failed 215 | } 216 | 217 | # Arguments 218 | while [ $# -gt 0 ]; do 219 | arg=$1; shift 220 | case $arg in 221 | "-v"|"--verbose") verbose=3;; 222 | "-s"|"--summary") verbose=1;; 223 | "-q"|"--quiet") verbose=0;; 224 | "-h"|"--help") usage; exit 0;; 225 | *) shift;; 226 | esac 227 | done 228 | 229 | runTests 230 | -------------------------------------------------------------------------------- /test/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASEDIR="$(dirname $0)/.." 4 | 5 | cd "$BASEDIR" 6 | 7 | ######################################################################## 8 | # Argument tests 9 | ######################################################################## 10 | 11 | testPositiveArgumentsHelp() { 12 | assertReturn "$(./pdftitle -h)" 0 13 | assertReturn "$(./pdftitle --help)" 0 14 | } 15 | 16 | testPositiveArgumentsVersion() { 17 | assertReturn "$(./pdftitle -v 2> /dev/null)" 0 18 | assertReturn "$(./pdftitle --version 2> /dev/null)" 0 19 | assertEqual "$(./pdftitle -v 2>&1)" "pdftitle 1.3" 20 | assertEqual "$(./pdftitle --version 2>&1)" "pdftitle 1.3" 21 | } 22 | 23 | ######################################################################## 24 | # Return value tests 25 | ######################################################################## 26 | 27 | testNegativeNotFound() { 28 | assertEqual "$(./pdftitle ./test/samples/026.pdf 2>&1)" \ 29 | "Error: No title found" 30 | assertReturn "$(./pdftitle ./test/samples/026.pdf 2> /dev/null)" 1 31 | } 32 | 33 | testNegativeArgumentsNoFile() { 34 | assertReturn "$(./pdftitle 2> /dev/null)" 2 35 | } 36 | 37 | testNegativeArgumentsUnknown() { 38 | assertReturn "$(./pdftitle --foo foobar 2> /dev/null)" 2 39 | } 40 | 41 | testNegativeArgumentsInvalid() { 42 | assertReturn "$(./pdftitle -t -1 foobar 2> /dev/null)" 2 43 | assertReturn "$(./pdftitle --top -1 foobar 2> /dev/null)" 2 44 | } 
45 | 46 | testNegativeFileNotFound() { 47 | assertReturn "$(./pdftitle foobar 2> /dev/null)" 2 48 | # Skip reason: Too verbose 49 | skip assertEqual "$(./pdftitle foobar 2>&1)" "pdftitle: error: argument FILE: invalid filepath value: 'foobar'" 50 | } 51 | 52 | testNegativePDFTOHTMLNotFound() { 53 | # Skip reason: Hard to simulate 54 | skip assertReturn "$(./pdftitle foobar 2> /dev/null)" 4 55 | } 56 | 57 | testNegativeCouldNotConvertPDFToXML() { 58 | assertReturn "$(./pdftitle ./test/samples/029.pdf 2> /dev/null)" 5 59 | assertEqual "$(./pdftitle ./test/samples/029.pdf 2>&1)" \ 60 | "Error: Could not convert PDF to XML" 61 | } 62 | 63 | testNegativePasswordProtected() { 64 | assertReturn "$(./pdftitle ./test/samples/027.pdf 2> /dev/null)" 5 65 | assertEqual "$(./pdftitle ./test/samples/027.pdf 2>&1)" \ 66 | "Error: Could not convert PDF to XML" 67 | } 68 | 69 | testNegativeCouldNotParseXML() { 70 | # Skip reason: Does not fail under lxml 71 | skip assertReturn "$(./pdftitle ./test/samples/025.pdf 2> /dev/null)" 6 72 | skip assertEqual "$(./pdftitle ./test/samples/025.pdf 2>&1)" \ 73 | "Error: Could not parse XML" 74 | } 75 | 76 | testNegativeUnknownError() { 77 | # Skip reason: Hard to simulate 78 | skip assertReturn "$(./pdftitle foobar 2> /dev/null)" 8 79 | } 80 | 81 | ######################################################################## 82 | # Misc tests 83 | ######################################################################## 84 | 85 | testSimpleCorrectUndistracted() { 86 | assertEqual "$(./pdftitle ./test/samples/001.pdf)" \ 87 | "The Ecology of West Nile Virus in South Africa and the Occurrence of Outbreaks in Humans" 88 | } 89 | 90 | testSimpleMultiline() { 91 | assertEqual "$(./pdftitle ./test/samples/009.pdf)" \ 92 | "The Ecology of West Nile Virus in South Africa and the Occurrence of Outbreaks in Humans" 93 | } 94 | 95 | testTwoColumns() { 96 | assertEqual "$(./pdftitle ./test/samples/002.pdf)" \ 97 | "Outbreak of West Nile Virus Infection in 
Greece, 2010" 98 | } 99 | 100 | testNotHighestFont() { 101 | # Skip reason: Skips lines 102 | skip assertEqual "$(./pdftitle ./test/samples/011.pdf)" \ 103 | "Genetic Differences Between Culex pipiens f\. molestus and Culex pipiens pipiens \(Diptera: Culicidae\) in New York" 104 | } 105 | 106 | testMixedFormatting() { 107 | # Skip reason: Skips lines 108 | skip assertEqual "$(./pdftitle ./test/samples/011.pdf)" \ 109 | "Genetic Differences Between Culex pipiens f\. molestus and Culex pipiens pipiens \(Diptera: Culicidae\) in New York" 110 | } 111 | 112 | testSlightDifferentSizeForSameFont() { 113 | # Skip reason: Skips lines 114 | skip assertEqual "$(./pdftitle ./test/samples/019.pdf)" \ 115 | "HpaII endonuclease distinguishes between two species in the Anopheles funestus group" 116 | } 117 | 118 | testHiddenText() { 119 | # Skip reason: Skips lines 120 | skip assertEqual "$(./pdftitle ./test/samples/023.pdf)" \ 121 | "Phylogeny of fourteen Culex mosquito species, including the Culex pipiens complex, inferred from the internal transcribed spacers of ribosomal DNA" 122 | } 123 | 124 | testInvalidXML() { 125 | assertEqual "$(./pdftitle ./test/samples/024.pdf 2> /dev/null)" \ 126 | "Spread of the West Nile virus vector Culex modestus and the potential malaria vector Anopheles hyrcanus in central Europe" 127 | } 128 | 129 | testCopyProtected() { 130 | assertEqual "$(./pdftitle ./test/samples/028.pdf)" \ 131 | "Pogosta Disease: Clinical Observations During An Outbreak In The Province Of North Karelia, Finland" 132 | } 133 | 134 | ######################################################################## 135 | # Filter tests 136 | ######################################################################## 137 | 138 | testFilterImageAbove() { 139 | skip assertEqual "$(./pdftitle ./test/samples/003.pdf)" \ 140 | "A Comparison of Gravid and Under-House CO2-Baited CDC Light Traps for Mosquito Species of Public Health Importance in Houston, Texas" 141 | } 142 | 143 | 
testFilterNotLargestFound() { 144 | assertEqual "$(./pdftitle ./test/samples/005.pdf)" \ 145 | "Asymmetric introgression between sympatric molestus and pipiens forms of Culex pipiens \(Diptera: Culicidae\) in the Comporta region, Portugal" 146 | } 147 | 148 | testFilterTooShort() { 149 | assertEqual "$(./pdftitle ./test/samples/006.pdf)" \ 150 | "West Nile Fever in Czechland" 151 | } 152 | 153 | testFilterTooLong() { 154 | assertEqual "$(./pdftitle ./test/samples/020.pdf)" \ 155 | "A study of mosquito fauna \(Diptera: Culicidae\) and the phenology of the species recorded in Wilanów \(Warsaw, Poland\)" 156 | } 157 | 158 | testFilterVertical() { 159 | assertEqual "$(./pdftitle ./test/samples/031.pdf)" \ 160 | "Stratified B-trees and versioning dictionaries" 161 | } 162 | 163 | testFilterTooFarDistanceWithEqualFont() { 164 | assertEqual "$(./pdftitle ./test/samples/016.pdf)" \ 165 | "Mosquito species distribution in mainland Portugal 2005-2008" 166 | } 167 | 168 | testFilterOnLowerHalfOfFirstPage() { 169 | assertEqual "$(./pdftitle ./test/samples/021.pdf)" \ 170 | "Scandinavian Journal of Rheumatology" 171 | } 172 | 173 | testFilterInitialCapital() { 174 | assertEqual "$(./pdftitle ./test/samples/022.pdf)" \ 175 | "Syndromic Surveillance in The Netherlands for the Early Detection of West Nile Virus Epidemics" 176 | } 177 | 178 | ######################################################################## 179 | # Formatter tests 180 | ######################################################################## 181 | 182 | testFormatUpperCase() { 183 | assertEqual "$(./pdftitle ./test/samples/004.pdf)" \ 184 | "Influence Of Landscape Structure On Mosquitoes \(Diptera: Culicidae\) And Dytiscids \(Coleoptera: Dytiscidae\) At Five Spatial Scales In Swedish Wetlands" 185 | } 186 | 187 | testFormatWeirdCase() { 188 | assertEqual "$(./pdftitle ./test/samples/030.pdf)" \ 189 | "Atomic Broadcast: A Fault-Tolerant Token Based Algorithm And Performance Evaluations" 190 | } 191 | 192 | 
testFormatSpaceCase() {
  # Skip reason: Word boundaries swallowed
  skip assertEqual "$(./pdftitle ./test/samples/033.pdf)" \
    "A High-Level Framework for Distributed Processing of Large-Scale Graphs"
}

testFormatUpperCaseMixedFormatting() {
  assertEqual "$(./pdftitle ./test/samples/008.pdf)" \
    "Evaluation Of Six Mosquito Traps For Collection Of Aedes Albopictus And Associated Mosquito Species In A Suburban Setting In North Central Florida"
}

testFormatSubscript() {
  # Skip reason: Chemical expression incorrectly handled
  skip assertEqual "$(./pdftitle ./test/samples/010.pdf)" \
    "Validation Of CO2 Trap Data In Three European Regions"
}

testFormatLinebreakDash() {
  assertEqual "$(./pdftitle ./test/samples/012.pdf)" \
    "The prevalence of antibodies against Sindbis-related \(Pogosta\) virus in different parts of Finland"
}

testFormatPeriod() {
  assertEqual "$(./pdftitle ./test/samples/031.pdf)" \
    "Stratified B-trees and versioning dictionaries"
}

testFormatAsterik() {
  assertEqual "$(./pdftitle ./test/samples/032.pdf)" \
    "Ecophysiological and morphological variations in mosquitoes of the Culex pipiens complex \(Diptera: Culicidae\)"
}

testFormatLigatures() {
  # Skip reason: pdftohtml produces gibberish
  skip assertEqual "$(./pdftitle ./test/samples/017.pdf)" \
    "Mitochondrial DNA cytochrome oxidase I gene: potential for distinction between immature stages of some forensically important fly species \(Diptera\) in western Australia"
}

testFormatQuotes() {
  # Skip reason: Quotes are not correctly detected
  skip assertEqual "$(./pdftitle ./test/samples/013.pdf)" \
    "“Coi”-like Sequences Are Becoming Problematic In Molecular Systematic And Dna Barcoding Studies"
}

testFormatMultipleSpaces() {
  # Skip reason: No good pdf found to test the feature
  skip assertEqual "$(./pdftitle ./test/samples/0XX.pdf 2> /dev/null)" ""
}

testFormatFormattingSwitchOnNewLine() {
  # Skip reason: Skips a word
  skip assertEqual "$(./pdftitle ./test/samples/014.pdf)" \
    "Rapid Assay To Identify The Two Genetic Forms Of Culex \(Culex\) Pipiens L\. \(Diptera: Culicidae\) And Hybrid Populations"
}

testFormatTwoCharacterEncoding() {
  # NOTE(review): skipped without a recorded reason -- TODO document why.
  skip assertEqual "$(./pdftitle ./test/samples/018.pdf)" \
    "Serological Examination of Songbirds \(Passeriformes\) for Mosquito-Borne Viruses Sindbis, Ťahynǎ, and Batai in a South Moravian Wetland \(Czech Republic\)"
}

########################################################################

source "./test/bashunit.bash"
--------------------------------------------------------------------------------
/pdftitle:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function

import argparse
import copy
import os
import re
import subprocess
import sys
import traceback
from cStringIO import StringIO
# Prefer lxml (parsing with recover=True); fall back to the standard-library
# ElementTree when lxml is not installed.
try:
    from lxml import etree
    parse_xml = lambda s: etree.parse(s, etree.XMLParser(recover=True))
except ImportError:
    import xml.etree.ElementTree as etree
    parse_xml = lambda s: etree.parse(s)


VERSION = '1.3'


def convert_pdf_to_xml(path):
    """Return the parsed XML tree of the PDF's first page.

    Runs the external `pdftohtml` tool restricted to page 1 (`-f 1 -l 1`),
    strips control characters from its output and parses the result with
    `parse_xml`.
    """
    cmd = ['pdftohtml', '-xml', '-f', '1', '-l', '1', '-i', '-q', '-nodrm', '-hidden', '-stdout', path]
    # NOTE(review): the os.devnull handle opened here is never closed.
    xml_string = subprocess.check_output(cmd, stderr=open(os.devnull, 'w'))
    return parse_xml(StringIO(remove_control_chars(xml_string)))


def remove_control_chars(string):
    """Filter ASCII control characters as etree treats them as invalid.

    Keeps tab (9), line feed (10), carriage return (13) and every character
    whose ordinal is 32 or above.
    """
    return ''.join([i for i in string if ord(i) in [9, 10, 13] or ord(i) >= 32])


def font_specs(xml_data):
    """Return the attribute mappings of all first-page `fontspec` elements
    that carry both an `id` and a `size`."""
    xml_font_specs = xml_data.findall('page[@number="1"]/fontspec[@id][@size]')
    return [fs.attrib for fs in xml_font_specs]


def sorted_font_ids(font_specs):
    """Return the font ids ordered by integer font size, largest first."""
    font_specs = sorted(font_specs, key=lambda x: int(x['size']), reverse=True)
    return [fs['id'] for fs in font_specs]


def textblocks_by_id(xml_data, font_id):
    """Return the text block (see `top_and_texts`) built from all first-page
    `text` elements that use the given font id."""
    text_elements = xml_data.findall('page[@number="1"]/text[@font="%s"]' % font_id)
    first_page_top = int(xml_data.findall('page[@number="1"]')[0].get('top'))
    first_page_height = int(xml_data.findall('page[@number="1"]')[0].get('height'))
    return top_and_texts(text_elements, first_page_top, first_page_height)


def top_and_texts(text_elements, page_top, page_height):
    """Return top position of first non-empty text line and all
    unformatted non-empty text lines, and some extra (page) metadata.

    Lines positioned above the previously accepted line are skipped. An
    empty dict is returned when no line was kept or when no kept line lies
    below `page_top`.

    Example: {
        'pageTop': 0,
        'pageHeight': 1263,
        'blockTop': 16,
        'blockText': [
            {'top': 16, 'height': 24, 'width': 80, 'text': 'foo'},
            {'top': 30, 'height': 24, 'width': 80, 'text': 'bar'},
            {'top': 44, 'height': 24, 'width': 80, 'text': 'baz'}
        ]
    }"""
    text_lines = []
    # Running "lowest accepted position so far"; starts at the page top.
    top = page_top

    for text_element in text_elements:
        text_line = unformat_and_strip(text_element)
        if not text_line:
            continue
        t = int(text_element.get('top'))
        h = int(text_element.get('height'))
        w = int(text_element.get('width'))
        # TODO: Maybe allow a light error here
        # if T < Top - Error:
        # TODO: This is actually a filter
        if t < top:
            # Ignore text lines positioned upwards. Only look downwards.
            continue
        top = t
        text_lines.append({
            'top': t,
            'height': h,
            'width': w,
            'text': text_line
        })

    if text_lines and top > page_top:
        return {
            'pageTop': page_top,
            'pageHeight': page_height,
            'blockTop': min(text_lines, key=lambda x: x['top'])['top'],
            'blockText': text_lines
        }
    else:
        return {}


def filter_empties(text_blocks, _config):
    """Drop text blocks that are empty dicts or have no text lines."""
    return [tb for tb in text_blocks if tb and tb['blockText']]


def unformat_and_strip(text_element):
    """Return the element's concatenated text content (markup of nested
    elements removed), stripped of surrounding whitespace."""
    return ''.join(text_element.itertext()).strip()


def filter_bottom_half(text_blocks, _config):
    """Keep only text blocks that start in the upper half of the page."""
    return [tb for tb in text_blocks if
            tb['blockTop'] - tb['pageTop'] < tb['pageHeight'] / 2]


def filter_margin(text_blocks, config):
    """Keep only text blocks that start below `config.top_margin`."""
    return [tb for tb in text_blocks if tb['blockTop'] > config.top_margin]


def filter_vertical(text_blocks, _config):
    """Remove text lines whose width is not positive (treated as vertical
    text) and drop blocks left without any line."""
    new_text_blocks = []
    for tb in text_blocks:
        # Shallow copy so the caller's block keeps its original line list.
        new_tb = copy.copy(tb)
        new_tb['blockText'] = []
        for t in tb['blockText']:
            if t['width'] > 0:
                new_tb['blockText'].append(t)
        if new_tb['blockText']:
            new_text_blocks.append(new_tb)
    return new_text_blocks


def filter_shorts(text_blocks, config):
    """Drop text blocks whose space-joined text is shorter than
    `config.min_length`, as they are unlikely titles."""
    return [tb for tb in text_blocks if
            len(' '.join([t['text'] for t in tb['blockText']])) >= config.min_length]


def filter_longs(text_blocks, config):
    """Drop text blocks whose space-joined text is longer than
    `config.max_length`, as they are unlikely titles."""
    return [tb for tb in text_blocks if
            len(' '.join([t['text'] for t in tb['blockText']])) <= config.max_length]


def filter_unrelated_lines(text_blocks, _config):
    """Filter text lines in text blocks that are too far away from previous
    lines.

    A line is kept when its top lies less than half its own height below
    the bottom of the previously kept line (initially the block top).
    """
    new_text_blocks = []
    for tb in text_blocks:
        new_tb = copy.copy(tb)
        new_tb['blockText'] = []
        # Expected top of the next line; advanced past each kept line.
        next_top = tb['blockTop']
        for t in tb['blockText']:
            if t['top'] < next_top + t['height'] / 2:
                next_top = t['top'] + t['height']
                new_tb['blockText'].append(t)
        if new_tb['blockText']:
            new_text_blocks.append(new_tb)
    return new_text_blocks


def choose_title(text_blocks, config):
    """Return title as UTF-8 from list. Either all non-empty texts with font id
    or just first.

    Only the first text block is considered: the loop returns on its first
    iteration. Returns None when `text_blocks` is empty.
    """
    ## Have to encode output when piping script. See: http://goo.gl/h0ql0
    for tb in text_blocks:
        if config.multiline:
            return ' '.join([t['text'] for t in tb['blockText']]).encode('utf-8')
        else:
            return tb['blockText'][0]['text'].encode('utf-8')
    return None


def format_upper_case(title, _config):
    """Return the title in titlecase if it is mostly uppercase."""
    return title.title() if is_mostly_upper_case(title) else title


def is_mostly_upper_case(string, threshold=0.67):
    """Return True if the share of uppercase or whitespace characters in
    string is at least threshold."""
    n = 0
    for c in string:
        if c.isupper() or c.isspace():
            n = n+1
    # NOTE(review): divides by len(string); an empty string would raise
    # ZeroDivisionError.
    if float(n) / len(string) >= threshold:
        return True
    else:
        # NOTE(review): bare expression, not `return False` -- the function
        # returns None (falsy) on this path.
        False


def format_weird_case(title, _config):
    """Return the title in titlecase if it has "weird" letter casing."""
    return title.title() if is_weird_case(title) else title


def is_weird_case(string):
    """Return True if given String has "weird" cases in case letters, else False.
    Example: isWeirdCase('A FAult-tolerAnt token BAsed Algorithm') == True"""
    # Looks for a letter followed by an upper/lower or lower/upper pair.
    for i in range(len(string) - 2):
        if string[i].isalpha() and (
                string[i+1].isupper() and string[i+2].islower() or
                string[i+1].islower() and string[i+2].isupper()):
            return True
    return False


def format_space_case(title, _config):
    """Return the title removing gaps between letters."""
    if is_space_case(title):
        return unspace(title)
    else:
        return title


def is_space_case(string, threshold=0.2):
    """Return True if given String has many gaps between letters, else False.
    Example: isSpaceCase('A H i gh - L e ve l F r am e w or k f or') == True"""
    n = 0
    for c in string:
        if c.isspace():
            n = n+1
    # NOTE(review): divides by len(string); an empty string would raise
    # ZeroDivisionError.
    if float(n) / len(string) >= threshold:
        return True
    else:
        # NOTE(review): bare expression, not `return False` -- the function
        # returns None (falsy) on this path.
        False


def unspace(string):
    """Return the given string without the many gaps between letters.

    All whitespace is removed, then a space is inserted before every
    uppercase ASCII letter that follows a character other than '-'.
    Example: unspace('A H i gh - L e ve l F r am e') == A High-Level Frame"""
    joined_string = ''.join(string.split())
    return re.sub(r'([^-])([A-Z])', r'\1 \2', joined_string)


def format_multi_spaces(title, _config):
    """Return the title with not more than one space per word separation
    and without a space before a colon."""
    # TODO: These are actually two formatters in one
    return ' '.join(title.split()).replace(' :', ':')


def format_linebreak_dash(title, _config):
    """Return the title without linebreak dash, i.e. with the space after a
    "- " that follows a non-space character removed."""
    return re.sub(r'(\S)- (.+)', r'\1-\2', title)


def format_trailing_period(title, _config):
    """Return the title without trailing period."""
    return re.sub(r'^(.*)\.$', r'\1', title)


def format_trailing_asterik(title, _config):
    """Return the title without trailing asterik."""
    return re.sub(r'^(.*)\*$', r'\1', title)


def format_quotes(title, _config):
    """Return the title with normalized quotes: doubled single quotes,
    doubled backticks and doubled commas become one double-quote character
    each."""
    return title.replace('‘‘', '“') \
                .replace('’’', '”') \
                .replace('``', '‟') \
                .replace(',,', '„')


# TODO: Generalize functionality to convert Unicode NFD->NFC.
def format_ligatures(title, _config):
    """Return the title with typographic ligatures expanded to ASCII letters.

    Handles the Latin ligatures U+FB00..U+FB04 (ff, fi, fl, ffi, ffl).
    `title` may be text or a UTF-8 encoded byte string (which is what
    `choose_title` produces); the result has the same type as the input.
    """
    # For a reference of the list see: http://typophile.com/files/PMEJLigR_6061.GIF
    # and https://github.com/Docear/PDF-Inspector/blob/master/src/org/docear/pdf/util/ReplaceLigaturesFilter.java
    ligatures = (
        (u'\ufb03', u'ffi'),
        (u'\ufb04', u'ffl'),
        (u'\ufb00', u'ff'),
        (u'\ufb01', u'fi'),
        (u'\ufb02', u'fl'),
    )
    # Work on text so the code points can be matched regardless of how the
    # source file or the title happen to be encoded.
    was_bytes = isinstance(title, bytes)
    text = title.decode('utf-8') if was_bytes else title
    for ligature, plain in ligatures:
        text = text.replace(ligature, plain)
    return text.encode('utf-8') if was_bytes else text


def transduce(funs, value, config):
    """Return a value after applying a list of functions until list or value is
    empty.

    Each function is called as `fun(value, config)` and its result is fed
    to the next one. Processing stops early as soon as the value is falsy.
    """
    for fun in funs:
        if not value:
            break
        value = fun(value, config)
    return value


def extract_title(path, config):
    """Return title in PDF article after applying rules and filters.

    The first page is converted to XML, its text is grouped per font (largest
    font first), and the groups are passed through the filter chain, which
    ends by choosing a title, followed by the formatter chain.
    """
    groupers = [
    ]
    filters = [
        filter_empties,
        filter_bottom_half,
        filter_margin,
        filter_vertical,
        filter_shorts,
        filter_longs,
        filter_unrelated_lines,
        choose_title
    ]
    formatters = [
        format_ligatures,
        format_upper_case,
        format_weird_case,
        format_space_case,
        format_multi_spaces,
        format_linebreak_dash,
        format_trailing_period,
        format_trailing_asterik,
        format_quotes
    ]
    xml_data = convert_pdf_to_xml(path)
    font_ids = sorted_font_ids(font_specs(xml_data))
    text_blocks = [textblocks_by_id(xml_data, font_id) for font_id in font_ids]
    return transduce(groupers + filters + formatters, text_blocks, config)


def sanitize_filename(filename):
    """Return filename with ':' replaced by ' -' and '/' replaced by '-'."""
    return filename.replace(':', ' -').replace('/', '-')


def main(args):
    """Find first non-empty text in PDF File with largest size and return as
    unformatted string.

    Prints the title and, with `args.rename`, renames the PDF after it.
    Returns a tuple suitable for `parser.exit(*result)`: `(0,)` on success,
    otherwise `(status, message)` with status
    1 (no title), 4 (pdftohtml missing), 5 (conversion failed),
    6 (XML unparsable), 7 (rename failed) or 8 (unknown error).
    """
    try:
        title = extract_title(args.file, args)

        if not title:
            return 1, 'Error: No title found'

        print(title)

        if args.rename:
            filename = '{}.pdf'.format(sanitize_filename(title))
            target = os.path.join(os.path.dirname(args.file), filename)
            # Handled separately so a failed rename is not misreported as a
            # missing pdftohtml binary by the OSError handler below.
            try:
                os.rename(args.file, target)
            except OSError as e:
                return 7, 'Error: Could not rename file: {}'.format(e)

        return (0,)
    except OSError:
        return 4, 'Error: pdftohtml not found'
    except subprocess.CalledProcessError:
        return 5, 'Error: Could not convert PDF to XML'
    except etree.ParseError:
        return 6, 'Error: Could not parse XML'
    except Exception as e:
        if args.debug:
            traceback.print_exc()
        return 8, 'Error: Unknown error: ' + type(e).__name__


def pos_int(v):
    """argparse type: return `v` as an int if it is strictly positive.

    Raises ValueError when `v` is not an integer literal and
    argparse.ArgumentTypeError when it is zero or negative.
    """
    i = int(v)
    if i > 0:
        return i
    raise argparse.ArgumentTypeError("invalid pos_int value: {!r}".format(v))


def filepath(v):
    """argparse type: return the stripped, user-expanded path `v`.

    Raises argparse.ArgumentTypeError when the path is neither an existing
    file nor a symbolic link.
    """
    f = os.path.expanduser(v.strip())
    if not os.path.isfile(f) and not os.path.islink(f):
        raise argparse.ArgumentTypeError("file not found: {}".format(v))
    return f


if __name__ == '__main__':
    try:
        parser = argparse.ArgumentParser(description='Tries to identify the title of PDF format paper.')
        parser.add_argument('file', metavar='FILE', type=filepath, help='Path to PDF file')
        parser.add_argument('-r', '--rename', action='store_true', help='Rename file with found title')
        parser.add_argument('-m', '--multiline', action='store_true', default=True, help='Concatenate multiple title lines considered (default)')
        parser.add_argument('-s', '--singleline', action='store_false', dest='multiline', help='Only use first title line considered')
        parser.add_argument('-t', '--top-margin', type=pos_int, default=70, help='Top margin start to search for title (default: 70)')
        parser.add_argument('-n', '--min-length', type=pos_int, default=15, help='Min. considerable title length (default: 15)')
        parser.add_argument('-x', '--max-length', type=pos_int, default=250, help='Max. considerable title length (default: 250)')
        parser.add_argument('-d', '--debug', action='store_true', default=False, help='Print error stacktrace for unknown errors')
        parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + VERSION)
        args = parser.parse_args()
        # main() returns (status,) or (status, message).
        parser.exit(*main(args))
    except KeyboardInterrupt:
        sys.exit(1)