├── .coveragerc ├── .gitignore ├── .travis.yml ├── CHANGES.rst ├── CHANGES_RECENT.rst ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── TODO.rst ├── dist └── pypdfocr.exe ├── docs ├── Makefile ├── conf.py ├── index.rst ├── make.bat └── pypdfocr.rst ├── fabfile.py ├── pypdfocr.spec ├── pypdfocr ├── __init__.py ├── pypdfocr.py ├── pypdfocr.spec ├── pypdfocr_filer.py ├── pypdfocr_filer_dirs.py ├── pypdfocr_filer_evernote.py ├── pypdfocr_gs.py ├── pypdfocr_interrupts.py ├── pypdfocr_multiprocessing.py ├── pypdfocr_pdf.py ├── pypdfocr_pdffiler.py ├── pypdfocr_preprocess.py ├── pypdfocr_tesseract.py ├── pypdfocr_util.py ├── pypdfocr_watcher.py └── version.py ├── pypdfocr_windows.spec ├── requirements.txt ├── setup.py └── test ├── pdfs ├── 1.pdf ├── test.pdf ├── test_cinderella.pdf ├── test_patent.pdf ├── test_recipe.pdf ├── test_recipe_sideways.pdf ├── test_sherlock.pdf └── test_super_long_keyword.pdf ├── runtests.py ├── temp └── original │ ├── test_patent.pdf │ ├── test_patent_1.pdf │ ├── test_recipe.pdf │ ├── test_recipe_1.pdf │ ├── test_sherlock.pdf │ └── test_sherlock_1.pdf ├── test_evernote.py ├── test_gs.py ├── test_option_config.yaml ├── test_option_parsing.py ├── test_pdf_filer.py ├── test_pypdfocr.py ├── test_pypdfocr_config.yaml ├── test_pypdfocr_config_filename.yaml ├── test_pypdfocr_config_no_move_original.yaml ├── test_tesseract.py └── test_watcher.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | exclude_lines = 3 | 4 | pragma: no cover 5 | if __name__ == '__main__': 6 | def error(text): 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .* 3 | *~ 4 | *.hocr 5 | *.jpg 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | 
language: python 2 | python: 3 | - "2.7" 4 | install: 5 | - "pip install -r requirements.txt --use-mirrors" 6 | - "pip install pytest mock --use-mirrors" 7 | - "pip install ." 8 | script: 9 | - "python setup.py test" 10 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | ======= ======== ====== 2 | Version Date Changes 3 | ------- -------- ------ 4 | 5 | v0.9.1 10/11/16 Fixes (#43, #41) 6 | v0.9.0 2/29/16 Fixed rotated page text, Mac OS X invisible fonts, and pdf merge slowdown 7 | v0.8.5 2/21/16 Better ctrl-c and cleanup behavior 8 | v0.8.4 2/18/16 Maintenance release 9 | v0.8.3 2/18/16 Bug fix for multiprocessing on windows, ctrl-c interrupt, and integer keywords 10 | v0.8.2 12/8/14 Fixed imagemagick invocation on windows. Parallelized preprocessing and tesseract execution 11 | v0.8.1 12/5/14 Added --skip-preprocess option, scan_interval option, and fixed too many open files bug during page overlay 12 | v0.8.0 10/27/14 Added preprocessing to clean up prior to tesseract, bug fixes on file names with spaces/dots 13 | v0.7.6 9/10/14 Fixed issue 17 rotation bug 14 | v0.7.5 8/18/14 Update for Tesseract 3.03 .hocr filename change 15 | v0.7.4 3/28/14 Bug fix on pdf assembly 16 | v0.7.3 3/27/14 Modified internals to use single image per page (instead of multipage tiff). Also enabled orientation detection 17 | v0.7.2 3/26/14 Switched from Pil to Pillow. Now uses original images from PDF in output pdf (no dpi/color/quality changes!) 
18 | v0.7.1 3/25/14 OCR Language is now an option 19 | v0.7.0 3/25/14 Now honors original pdf resolution 20 | v0.6.1 2/16/14 Bug fix for pdfs with only numbers in the filename 21 | v0.6.0 1/16/14 Added filing based on filename match as fallback, added tesseract version check 22 | v0.5.4 1/12/14 Fixed bug with reordering of text pages on certain platforms(glob) 23 | v0.5.3 12/12/13 Fix to evernote server specification 24 | v0.5.2 12/08/13 Fix to lowercase keywords 25 | v0.5.1 11/02/13 Fixed a bunch of windows critical path handling issues 26 | v0.5.0 10/30/13 Email status added, 90% test coverage 27 | v0.4.1 10/28/13 Made HOCR parsing more robust 28 | v0.4.0 10/28/13 Added early Evernote upload support 29 | v0.3.1 10/24/13 Path fix on windows 30 | v0.3.0 10/23/13 Added filing of converted pdfs using a configuration file to specify target directories based on keyword matches in the pdf text 31 | v0.2.2 10/22/13 Added a console script to put the pypdfocr script into your bin 32 | v0.2.1 10/22/13 Fix to initial packaging problem. 33 | v0.2.0 10/21/13 Initial release. 34 | ======= ======== ====== 35 | -------------------------------------------------------------------------------- /CHANGES_RECENT.rst: -------------------------------------------------------------------------------- 1 | ======= ======== ====== 2 | Version Date Changes 3 | ------- -------- ------ 4 | 5 | v0.9.0 2/29/16 Fixed rotated page text, Mac OS X invisible fonts, and pdf merge slowdown 6 | v0.8.5 2/21/16 Better ctrl-c and cleanup behavior 7 | v0.8.4 2/18/16 Maintenance release 8 | v0.8.3 2/18/16 Bug fix for multiprocessing on windows, ctrl-c interrupt, and integer keywords 9 | v0.8.2 12/8/14 Fixed imagemagick invocation on windows. 
Parallelized preprocessing and tesseract execution 10 | v0.8.1 12/5/14 Added --skip-preprocess option, scan_interval option, and fixed too many open files bug during page overlay 11 | ======= ======== ====== 12 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [2013] [Virantha Ekanayake] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.rst 3 | 4 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | PyPDFOCR - Tesseract-OCR based PDF filing 2 | ========================================= 3 | 4 | |image0| |image1| |image2| |passing| |quality| |Coverage Status| 5 | 6 | This program will help manage your scanned PDFs by doing the following: 7 | 8 | - Take a scanned PDF file and run OCR on it (using the Tesseract OCR 9 | software from Google), generating a searchable PDF 10 | - Optionally, watch a folder for incoming scanned PDFs and 11 | automatically run OCR on them 12 | - Optionally, file the scanned PDFs into directories based on simple 13 | keyword matching that you specify 14 | - Evernote auto-upload and filing based on keyword search 15 | - Email status when it files your PDF 16 | 17 | More links: 18 | 19 | - `Blog @ virantha.com 
`__ 20 | - `Documentation @ gitpages `__ 21 | - `Source @ github `__ 22 | 23 | Usage: 24 | ###### 25 | 26 | Single conversion: 27 | ~~~~~~~~~~~~~~~~~~ 28 | 29 | :: 30 | 31 | pypdfocr filename.pdf 32 | 33 | --> filename_ocr.pdf will be generated 34 | 35 | If you have a language pack installed, then you can specify it with the 36 | ``-l`` option: 37 | 38 | :: 39 | 40 | pypdfocr -l spa filename.pdf 41 | 42 | Folder monitoring: 43 | ~~~~~~~~~~~~~~~~~~ 44 | 45 | :: 46 | 47 | pypdfocr -w watch_directory 48 | 49 | --> Every time a pdf file is added to `watch_directory` it will be OCR'ed 50 | 51 | Automatic filing: 52 | ~~~~~~~~~~~~~~~~~ 53 | 54 | To automatically move the OCR'ed pdf to a directory based on a keyword, 55 | use the -f option and specify a configuration file (described below): 56 | 57 | :: 58 | 59 | pypdfocr filename.pdf -f -c config.yaml 60 | 61 | You can also do this in folder monitoring mode: 62 | 63 | :: 64 | 65 | pypdfocr -w watch_directory -f -c config.yaml 66 | 67 | Filing based on filename match: 68 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 69 | 70 | If no keywords match the contents of the PDF, you can optionally 71 | allow it to fall back to trying to find keyword matches with the PDF 72 | filename using the -n option. For example, you may have receipts always 73 | named as ``receipt_2013_12_2.pdf`` by your scanner, and you want to move 74 | this to a folder called 'receipts'. Assuming you have a keyword 75 | ``receipt`` matching to folder ``receipts`` in your configuration file 76 | as described below, you can run the following and have this filed even 77 | if the content of the pdf does not contain the text 'receipt': 78 | 79 | :: 80 | 81 | pypdfocr filename.pdf -f -c config.yaml -n 82 | 83 | Configuration file for automatic PDF filing 84 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 85 | 86 | The config.yaml file above is a simple folder to keyword matching text 87 | file. 
It determines where your OCR'ed PDFs (and optionally, the original 88 | scanned PDF) are placed after processing. An example is given below: 89 | 90 | :: 91 | 92 | target_folder: "docs/filed" 93 | default_folder: "docs/filed/manual_sort" 94 | original_move_folder: "docs/originals" 95 | 96 | folders: 97 | finances: 98 | - american express 99 | - chase card 100 | - internal revenue service 101 | travel: 102 | - boarding pass 103 | - airlines 104 | - expedia 105 | - orbitz 106 | receipts: 107 | - receipt 108 | 109 | The ``target_folder`` is the root of your filing cabinet. Any PDF moving 110 | will happen in sub-directories under this directory. 111 | 112 | The ``folders`` section defines your filing directories and the keywords 113 | associated with them. In this example, we have three filing directories 114 | (finances, travel, receipts), and some associated keywords for each 115 | filing directory. For example, if your OCR'ed PDF contains the phrase 116 | "american express" (in any upper/lower case), it will be filed into 117 | ``docs/filed/finances``. 118 | 119 | The ``default_folder`` is where the OCR'ed PDF is moved to if there is 120 | no keyword match. 121 | 122 | The ``original_move_folder`` is optional (you can comment it out with 123 | ``#`` in front of that line), but if specified, the original scanned PDF 124 | is moved into this directory after OCR is done. Otherwise, if this field 125 | is not present or commented out, your original PDF will stay where it 126 | was found. 127 | 128 | If there is any naming conflict during filing, the program will add an 129 | underscore followed by a number to each filename, in order to avoid 130 | overwriting files that may already be present. 131 | 132 | Evernote upload: 133 | ~~~~~~~~~~~~~~~~ 134 | 135 | Evernote authentication token 136 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 137 | 138 | To enable Evernote support, you will need to `get a developer token for 139 | your Evernote 140 | account. `__. 
You 141 | should note that this script will never delete or modify existing notes 142 | in your account, and limits itself to creating new Notebooks and Notes. 143 | Once you get that token, you copy and paste it into your configuration 144 | file as shown below. 145 | 146 | Evernote filing usage 147 | ^^^^^^^^^^^^^^^^^^^^^ 148 | 149 | To automatically upload the OCR'ed pdf to a folder based on a keyword, 150 | use the ``-e`` option instead of the ``-f`` auto filing option. 151 | 152 | :: 153 | 154 | pypdfocr filename.pdf -e -c config.yaml 155 | 156 | Similarly, you can also do this in folder monitoring mode: 157 | 158 | :: 159 | 160 | pypdfocr -w watch_directory -e -c config.yaml 161 | 162 | Evernote filing configuration file 163 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 164 | 165 | The config file shown above only needs to change slightly. The folders 166 | section is completely unchanged, but note that ``target_folder`` is the 167 | name of your "Notebook stack" in Evernote, and the ``default_folder`` 168 | should just be the default Evernote upload notebook name. 169 | 170 | :: 171 | 172 | target_folder: "evernote_stack" 173 | default_folder: "default" 174 | original_move_folder: "docs/originals" 175 | evernote_developer_token: "YOUR_TOKEN" 176 | 177 | folders: 178 | finances: 179 | - american express 180 | - chase card 181 | - internal revenue service 182 | travel: 183 | - boarding pass 184 | - airlines 185 | - expedia 186 | - orbitz 187 | receipts: 188 | - receipt 189 | 190 | Auto email 191 | ~~~~~~~~~~ 192 | 193 | You can have PyPDFOCR email you every time it converts a file and files 194 | it. 
You need to first specify the following lines in the configuration 195 | file and then use the ``-m`` option when invoking ``pypdfocr``: 196 | 197 | :: 198 | 199 | mail_smtp_server: "smtp.gmail.com:587" 200 | mail_smtp_login: "virantha@gmail.com" 201 | mail_smtp_password: "PASSWORD" 202 | mail_from_addr: "virantha@gmail.com" 203 | mail_to_list: 204 | - "virantha@gmail.com" 205 | - "person2@gmail.com" 206 | 207 | 208 | Advanced options 209 | ################ 210 | 211 | Fine-tuning Tesseract/Ghostscript/others 212 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 213 | 214 | You can specify Tesseract and Ghostscript executable locations manually, as 215 | well as the number of concurrent processes allowed during preprocessing and 216 | tesseract. Use the following in your configuration file: 217 | 218 | :: 219 | 220 | tesseract: 221 | binary: "/usr/bin/tesseract" 222 | threads: 8 223 | 224 | ghostscript: 225 | binary: "/usr/local/bin/gs" 226 | 227 | preprocess: 228 | threads: 8 229 | 230 | Handling disk time-outs 231 | ~~~~~~~~~~~~~~~~~~~~~~~ 232 | If you need to increase the time interval (default 3 seconds) between new 233 | document scans when pypdfocr is watching a directory, you can specify the following 234 | option in the configuration file: 235 | 236 | :: 237 | 238 | watch: 239 | scan_interval: 6 240 | 241 | Installation 242 | ############ 243 | 244 | Using pip 245 | ~~~~~~~~~ 246 | 247 | PyPDFOCR is available in PyPI, so you can just run: 248 | 249 | :: 250 | 251 | pip install pypdfocr 252 | 253 | Please note that some of the 3rd-party libraries required by PyPDFOCR will 254 | require some build tools, especially on a default Ubuntu system. 
If you run 255 | into any issues using pip install, you may want to install the 256 | following packages on Ubuntu and try again: 257 | 258 | - gcc 259 | - libjpeg-dev 260 | - zlib-bin 261 | - zlib1g-dev 262 | - python-dev 263 | 264 | For those on **Windows**, because it's such a pain to get all the PIL 265 | and PDF dependencies installed, I've gone ahead and made an executable 266 | called 267 | `pypdfocr.exe `__ 268 | 269 | You still need to install Tesseract, GhostScript, etc. as detailed below in 270 | the external dependencies list. 271 | 272 | Manual install 273 | ~~~~~~~~~~~~~~ 274 | 275 | Clone the source directly from github (you need to have git installed): 276 | 277 | :: 278 | 279 | git clone https://github.com/virantha/pypdfocr.git 280 | 281 | Then, install the following third-party python libraries: 282 | 283 | - Pillow (Python Imaging Library) https://pillow.readthedocs.org/en/3.1.x/ 284 | - ReportLab (PDF generation library) 285 | http://www.reportlab.com/opensource/ 286 | - Watchdog (Cross-platform filesystem events monitoring) 287 | https://pypi.python.org/pypi/watchdog 288 | - PyPDF2 (Pure python pdf library) 289 | 290 | These can all be installed via pip: 291 | 292 | :: 293 | 294 | pip install Pillow 295 | pip install reportlab 296 | pip install watchdog 297 | pip install pypdf2 298 | 299 | 300 | You will also need to install the external dependencies listed below. 301 | 302 | External Dependencies 303 | ~~~~~~~~~~~~~~~~~~~~~ 304 | 305 | PyPDFOCR relies on the following (free) programs being installed and in 306 | the path: 307 | 308 | - Tesseract OCR software https://code.google.com/p/tesseract-ocr/ 309 | - GhostScript http://www.ghostscript.com/ 310 | - ImageMagick http://www.imagemagick.org/ 311 | - Poppler http://poppler.freedesktop.org/ (`Windows `__) 312 | 313 | Poppler is only required if you want pypdfocr to figure out the original PDF resolution 314 | automatically; just make sure you have ``pdfimages`` in your path. 
Note that the 315 | `xpdf `__ provided ``pdfimages`` does not work for this, 316 | because it does not support the ``-list`` option to list the table of images in a PDF file. 317 | 318 | On Mac OS X, you can install these using homebrew: 319 | 320 | :: 321 | 322 | brew install tesseract 323 | brew install ghostscript 324 | brew install poppler 325 | brew install imagemagick 326 | 327 | On Windows, please use the installers provided on their download pages. 328 | 329 | \*\* Important \*\* Tesseract version 3.02.02 or newer required 330 | (apparently 3.02.01-6 and possibly others do not work due to a hocr 331 | output format change that I'm not planning to address). On Ubuntu, you 332 | may need to compile and install it manually by following `these 333 | instructions `__ 334 | 335 | Also note that if you want Tesseract to recognize rotated documents (upside down, or rotated 90 degrees) 336 | then you need to find your tessdata directory and do the following: 337 | 338 | :: 339 | 340 | cd /usr/local/share/tessdata 341 | cp eng.traineddata osd.traineddata 342 | 343 | ``osd`` stands for Orientation and Script Detection, so you need to copy the .traineddata 344 | for whatever language you want to scan in as ``osd.traineddata``. If you don't do this step, 345 | then any landscape document will produce garbage 346 | 347 | Disclaimer 348 | ########## 349 | 350 | While test coverage is at 84% right now, Sphinx docs generation is at an 351 | early stage. The software is distributed on an "AS IS" BASIS, WITHOUT 352 | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 353 | 354 | .. |image0| image:: https://badge.fury.io/py/pypdfocr.png 355 | :target: https://pypi.python.org/pypi/pypdfocr 356 | .. |image1| image:: https://pypip.in/d/pypdfocr/badge.png 357 | .. |image2| image:: https://pypip.in/license/pypdfocr/badge.png 358 | .. |passing| image:: https://scrutinizer-ci.com/g/virantha/pypdfocr/badges/build.png?b=master 359 | .. 
|quality| image:: https://scrutinizer-ci.com/g/virantha/pypdfocr/badges/quality-score.png?b=master 360 | .. |Coverage Status| image:: https://coveralls.io/repos/virantha/pypdfocr/badge.png?branch=develop 361 | :target: https://coveralls.io/r/virantha/pypdfocr 362 | -------------------------------------------------------------------------------- /TODO.rst: -------------------------------------------------------------------------------- 1 | Todo list 2 | ========= 3 | 4 | - #43 version check for tesseract 5 | - On windows, search for pdfimages and imagemagick instead of relying on path 6 | - Split up into flow steps 7 | - Run more robustness tests for watching networked shares 8 | - Add more docstrings 9 | - Add more option specifiers to tesseract and ghostscript 10 | -------------------------------------------------------------------------------- /dist/pypdfocr.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/dist/pypdfocr.exe -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = /Users/virantha/dev/githubdocs/pypdfocr 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 
16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build 
finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pypdfocr.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pypdfocr.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/pypdfocr" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pypdfocr" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # pypdfocr documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Oct 23 13:43:29 2013. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | import pkg_resources 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
22 | #sys.path.insert(0, os.path.abspath('.')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | #needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [ 33 | 'sphinx.ext.autodoc', 34 | 'sphinx.ext.viewcode', 35 | ] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # The suffix of source filenames. 41 | source_suffix = '.rst' 42 | 43 | # The encoding of source files. 44 | #source_encoding = 'utf-8-sig' 45 | 46 | # The master toctree document. 47 | master_doc = 'index' 48 | 49 | # General information about the project. 50 | project = u'pypdfocr' 51 | copyright = u'2013, Author' 52 | 53 | # The version info for the project you're documenting, acts as replacement for 54 | # |version| and |release|, also used in various other places throughout the 55 | # built documents. 56 | # 57 | # The short X.Y version. 58 | version = '' 59 | try: 60 | release = pkg_resources.get_distribution('pypdfocr').version 61 | except pkg_resources.DistributionNotFound: 62 | print 'To build the documentation, The distribution information of sandman' 63 | print 'Has to be available. Either install the package into your' 64 | print 'development environment or run "setup.py develop" to setup the' 65 | print 'metadata. A virtualenv is recommended!' 66 | sys.exit(1) 67 | del pkg_resources 68 | 69 | version = '.'.join(release.split('.')[:2]) 70 | # The full version, including alpha/beta/rc tags. 71 | 72 | # The language for content autogenerated by Sphinx. Refer to documentation 73 | # for a list of supported languages. 
74 | #language = None 75 | 76 | # There are two options for replacing |today|: either, you set today to some 77 | # non-false value, then it is used: 78 | #today = '' 79 | # Else, today_fmt is used as the format for a strftime call. 80 | #today_fmt = '%B %d, %Y' 81 | 82 | # List of patterns, relative to source directory, that match files and 83 | # directories to ignore when looking for source files. 84 | exclude_patterns = ['_build'] 85 | 86 | # The reST default role (used for this markup: `text`) to use for all 87 | # documents. 88 | #default_role = None 89 | 90 | # If true, '()' will be appended to :func: etc. cross-reference text. 91 | #add_function_parentheses = True 92 | 93 | # If true, the current module name will be prepended to all description 94 | # unit titles (such as .. function::). 95 | #add_module_names = True 96 | 97 | # If true, sectionauthor and moduleauthor directives will be shown in the 98 | # output. They are ignored by default. 99 | #show_authors = False 100 | 101 | # The name of the Pygments (syntax highlighting) style to use. 102 | pygments_style = 'sphinx' 103 | 104 | # A list of ignored prefixes for module index sorting. 105 | #modindex_common_prefix = [] 106 | 107 | # If true, keep warnings as "system message" paragraphs in the built documents. 108 | #keep_warnings = False 109 | 110 | 111 | # -- Options for HTML output ---------------------------------------------- 112 | 113 | # The theme to use for HTML and HTML Help pages. See the documentation for 114 | # a list of builtin themes. 115 | html_theme = 'sphinxdoc' 116 | 117 | # Theme options are theme-specific and customize the look and feel of a theme 118 | # further. For a list of options available for each theme, see the 119 | # documentation. 120 | #html_theme_options = {} 121 | 122 | # Add any paths that contain custom themes here, relative to this directory. 123 | #html_theme_path = [] 124 | 125 | # The name for this set of Sphinx documents. 
If None, it defaults to 126 | # "<project> v<release> documentation". 127 | #html_title = None 128 | 129 | # A shorter title for the navigation bar. Default is the same as html_title. 130 | #html_short_title = None 131 | 132 | # The name of an image file (relative to this directory) to place at the top 133 | # of the sidebar. 134 | #html_logo = None 135 | 136 | # The name of an image file (within the static path) to use as favicon of the 137 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 138 | # pixels large. 139 | #html_favicon = None 140 | 141 | # Add any paths that contain custom static files (such as style sheets) here, 142 | # relative to this directory. They are copied after the builtin static files, 143 | # so a file named "default.css" will overwrite the builtin "default.css". 144 | html_static_path = ['_static'] 145 | 146 | # Add any extra paths that contain custom files (such as robots.txt or 147 | # .htaccess) here, relative to this directory. These files are copied 148 | # directly to the root of the documentation. 149 | #html_extra_path = [] 150 | 151 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 152 | # using the given strftime format. 153 | #html_last_updated_fmt = '%b %d, %Y' 154 | 155 | # If true, SmartyPants will be used to convert quotes and dashes to 156 | # typographically correct entities. 157 | #html_use_smartypants = True 158 | 159 | # Custom sidebar templates, maps document names to template names. 160 | #html_sidebars = {} 161 | 162 | # Additional templates that should be rendered to pages, maps page names to 163 | # template names. 164 | #html_additional_pages = {} 165 | 166 | # If false, no module index is generated. 167 | #html_domain_indices = True 168 | 169 | # If false, no index is generated. 170 | #html_use_index = True 171 | 172 | # If true, the index is split into individual pages for each letter. 
173 | #html_split_index = False 174 | 175 | # If true, links to the reST sources are added to the pages. 176 | #html_show_sourcelink = True 177 | 178 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 179 | #html_show_sphinx = True 180 | 181 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 182 | #html_show_copyright = True 183 | 184 | # If true, an OpenSearch description file will be output, and all pages will 185 | # contain a <link> tag referring to it. The value of this option must be the 186 | # base URL from which the finished HTML is served. 187 | #html_use_opensearch = '' 188 | 189 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 190 | #html_file_suffix = None 191 | 192 | # Output file base name for HTML help builder. 193 | htmlhelp_basename = 'pypdfocrdoc' 194 | 195 | 196 | # -- Options for LaTeX output --------------------------------------------- 197 | 198 | latex_elements = { 199 | # The paper size ('letterpaper' or 'a4paper'). 200 | #'papersize': 'letterpaper', 201 | 202 | # The font size ('10pt', '11pt' or '12pt'). 203 | #'pointsize': '10pt', 204 | 205 | # Additional stuff for the LaTeX preamble. 206 | #'preamble': '', 207 | } 208 | 209 | # Grouping the document tree into LaTeX files. List of tuples 210 | # (source start file, target name, title, 211 | # author, documentclass [howto, manual, or own class]). 212 | latex_documents = [ 213 | ('index', 'pypdfocr.tex', u'pypdfocr Documentation', 214 | u'Author', 'manual'), 215 | ] 216 | 217 | # The name of an image file (relative to this directory) to place at the top of 218 | # the title page. 219 | #latex_logo = None 220 | 221 | # For "manual" documents, if this is true, then toplevel headings are parts, 222 | # not chapters. 223 | #latex_use_parts = False 224 | 225 | # If true, show page references after internal links. 226 | #latex_show_pagerefs = False 227 | 228 | # If true, show URL addresses after external links. 
229 | #latex_show_urls = False 230 | 231 | # Documents to append as an appendix to all manuals. 232 | #latex_appendices = [] 233 | 234 | # If false, no module index is generated. 235 | #latex_domain_indices = True 236 | 237 | 238 | # -- Options for manual page output --------------------------------------- 239 | 240 | # One entry per manual page. List of tuples 241 | # (source start file, name, description, authors, manual section). 242 | man_pages = [ 243 | ('index', 'pypdfocr', u'pypdfocr Documentation', 244 | [u'Author'], 1) 245 | ] 246 | 247 | # If true, show URL addresses after external links. 248 | #man_show_urls = False 249 | 250 | 251 | # -- Options for Texinfo output ------------------------------------------- 252 | 253 | # Grouping the document tree into Texinfo files. List of tuples 254 | # (source start file, target name, title, author, 255 | # dir menu entry, description, category) 256 | texinfo_documents = [ 257 | ('index', 'pypdfocr', u'pypdfocr Documentation', 258 | u'Author', 'pypdfocr', 'One line description of project.', 259 | 'Miscellaneous'), 260 | ] 261 | 262 | # Documents to append as an appendix to all manuals. 263 | #texinfo_appendices = [] 264 | 265 | # If false, no module index is generated. 266 | #texinfo_domain_indices = True 267 | 268 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 269 | #texinfo_show_urls = 'footnote' 270 | 271 | # If true, do not generate a @detailmenu in the "Top" node's menu. 272 | #texinfo_no_detailmenu = False 273 | 274 | 275 | # -- Options for Epub output ---------------------------------------------- 276 | 277 | # Bibliographic Dublin Core info. 278 | epub_title = u'pypdfocr' 279 | epub_author = u'Author' 280 | epub_publisher = u'Author' 281 | epub_copyright = u'2013, Author' 282 | 283 | # The basename for the epub file. It defaults to the project name. 284 | #epub_basename = u'pypdfocr' 285 | 286 | # The HTML theme for the epub output. 
Since the default themes are not optimized 287 | # for small screen space, using the same theme for HTML and epub output is 288 | # usually not wise. This defaults to 'epub', a theme designed to save visual 289 | # space. 290 | #epub_theme = 'epub' 291 | 292 | # The language of the text. It defaults to the language option 293 | # or en if the language is not set. 294 | #epub_language = '' 295 | 296 | # The scheme of the identifier. Typical schemes are ISBN or URL. 297 | #epub_scheme = '' 298 | 299 | # The unique identifier of the text. This can be a ISBN number 300 | # or the project homepage. 301 | #epub_identifier = '' 302 | 303 | # A unique identification for the text. 304 | #epub_uid = '' 305 | 306 | # A tuple containing the cover image and cover page html template filenames. 307 | #epub_cover = () 308 | 309 | # A sequence of (type, uri, title) tuples for the guide element of content.opf. 310 | #epub_guide = () 311 | 312 | # HTML files that should be inserted before the pages created by sphinx. 313 | # The format is a list of tuples containing the path and title. 314 | #epub_pre_files = [] 315 | 316 | # HTML files that should be inserted after the pages created by sphinx. 317 | # The format is a list of tuples containing the path and title. 318 | #epub_post_files = [] 319 | 320 | # A list of files that should not be packed into the epub file. 321 | #epub_exclude_files = [] 322 | 323 | # The depth of the table of contents in toc.ncx. 324 | #epub_tocdepth = 3 325 | 326 | # Allow duplicate toc entries. 327 | #epub_tocdup = True 328 | 329 | # Choose between 'default' and 'includehidden'. 330 | #epub_tocscope = 'default' 331 | 332 | # Fix unsupported image types using the PIL. 333 | #epub_fix_images = False 334 | 335 | # Scale large images. 336 | #epub_max_image_width = 0 337 | 338 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 339 | #epub_show_urls = 'inline' 340 | 341 | # If false, no index is generated. 
342 | #epub_use_index = True 343 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. pypdfocr documentation master file, created by 2 | sphinx-quickstart on Wed Oct 23 13:43:29 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | PyPDFOCR API Reference (version |release|) 7 | ========================================== 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 4 13 | 14 | pypdfocr 15 | 16 | Recent Changes 17 | ============== 18 | .. include:: ../CHANGES_RECENT.rst 19 | 20 | 21 | Testing 22 | ================ 23 | `Coverage <testing/index.html>`_ 24 | 25 | .. include:: ../README.rst 26 | 27 | Changelog 28 | ========= 29 | .. include:: ../CHANGES.rst 30 | 31 | .. include:: ../TODO.rst 32 | 33 | Indices and tables 34 | ================== 35 | 36 | * :ref:`genindex` 37 | * :ref:`modindex` 38 | * :ref:`search` 39 | 40 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. 
json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 
76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pypdfocr.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pypdfocr.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 
137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 
205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /docs/pypdfocr.rst: -------------------------------------------------------------------------------- 1 | pypdfocr package 2 | ================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | pypdfocr.pypdfocr module 8 | ------------------------ 9 | 10 | .. automodule:: pypdfocr.pypdfocr 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | :private-members: 15 | 16 | pypdfocr.pypdfocr_gs module 17 | --------------------------- 18 | 19 | .. automodule:: pypdfocr.pypdfocr_gs 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | :private-members: 24 | 25 | pypdfocr.pypdfocr_pdf module 26 | ---------------------------- 27 | 28 | .. 
automodule:: pypdfocr.pypdfocr_pdf 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | :private-members: 33 | 34 | pypdfocr.pypdfocr_pdffiler module 35 | --------------------------------- 36 | 37 | .. automodule:: pypdfocr.pypdfocr_pdffiler 38 | :members: 39 | :undoc-members: 40 | :show-inheritance: 41 | :private-members: 42 | 43 | pypdfocr.pypdfocr_tesseract module 44 | ---------------------------------- 45 | 46 | .. automodule:: pypdfocr.pypdfocr_tesseract 47 | :members: 48 | :undoc-members: 49 | :show-inheritance: 50 | :private-members: 51 | 52 | pypdfocr.pypdfocr_watcher module 53 | -------------------------------- 54 | 55 | .. automodule:: pypdfocr.pypdfocr_watcher 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | :private-members: 60 | 61 | pypdfocr.pypdfocr_preprocess module 62 | ----------------------------------- 63 | 64 | .. automodule:: pypdfocr.pypdfocr_preprocess 65 | :members: 66 | :undoc-members: 67 | :show-inheritance: 68 | :private-members: 69 | 70 | pypdfocr.pypdfocr_filer module 71 | -------------------------------- 72 | 73 | .. automodule:: pypdfocr.pypdfocr_filer 74 | :members: 75 | :undoc-members: 76 | :show-inheritance: 77 | :private-members: 78 | 79 | pypdfocr.pypdfocr_filer_dirs module 80 | ------------------------------------ 81 | 82 | .. automodule:: pypdfocr.pypdfocr_filer_dirs 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | :private-members: 87 | 88 | pypdfocr.pypdfocr_filer_evernote module 89 | ---------------------------------------- 90 | 91 | .. automodule:: pypdfocr.pypdfocr_filer_evernote 92 | :members: 93 | :undoc-members: 94 | :show-inheritance: 95 | :private-members: 96 | 97 | .. automethod:: _check_and_make_notebook(self,notebook_name) 98 | 99 | Module contents 100 | --------------- 101 | 102 | .. 
automodule:: pypdfocr 103 | :members: 104 | :undoc-members: 105 | :show-inheritance: 106 | :private-members: 107 | -------------------------------------------------------------------------------- /fabfile.py: -------------------------------------------------------------------------------- 1 | from fabric.api import * 2 | import os 3 | 4 | 5 | def build_windows_dist(): 6 | if os.name == 'nt': 7 | # Call the pyinstaller 8 | local("python ../pyinstaller/pyinstaller.py pypdfocr_windows.spec --onefile") 9 | 10 | 11 | def run_tests(): 12 | test_dir = "test" 13 | with lcd(test_dir): 14 | # Regenerate the test script 15 | local("py.test --genscript=runtests.py") 16 | t = local("py.test --cov-config .coveragerc --cov=pypdfocr --cov-report=term --cov-report=html", capture=False) 17 | t = local("coveralls") 18 | 19 | #with open("test/COVERAGE.rst", "w") as f: 20 | #f.write(t) 21 | 22 | 23 | def push_docs(): 24 | """ Build the sphinx docs from develop 25 | And push it to gh-pages 26 | """ 27 | githubpages = "/Users/virantha/dev/githubdocs/pypdfocr" 28 | # Convert markdown readme to rst 29 | #local("pandoc README.md -f markdown -t rst -o README.rst") 30 | with lcd(githubpages): 31 | local("git checkout gh-pages") 32 | local("git pull origin gh-pages") 33 | local("head CHANGES.rst > CHANGES_RECENT.rst") 34 | local("tail -n 1 CHANGES.rst >> CHANGES_RECENT.rst") 35 | with lcd("docs"): 36 | print("Running sphinx in docs/ and building to ~/dev/githubpages/pypdfocr") 37 | local("make clean") 38 | local("make html") 39 | local("cp -R ../test/htmlcov %s/html/testing" % githubpages) 40 | with lcd(githubpages): 41 | local("git add .") 42 | local('git commit -am "doc update"') 43 | local('git push origin gh-pages') 44 | 45 | -------------------------------------------------------------------------------- /pypdfocr.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python -*- 2 | a = Analysis(['pypdfocr/pypdfocr.py'], 3 | 
pathex=['/Users/virantha/dev/ocr'], 4 | hiddenimports=[], 5 | hookspath=None) 6 | pyz = PYZ(a.pure) 7 | exe = EXE(pyz, 8 | a.scripts, 9 | exclude_binaries=1, 10 | name=os.path.join('build/pyi.darwin/pypdfocr', 'pypdfocr'), 11 | debug=False, 12 | strip=None, 13 | upx=True, 14 | console=True ) 15 | coll = COLLECT(exe, 16 | a.binaries, 17 | a.zipfiles, 18 | a.datas, 19 | strip=None, 20 | upx=True, 21 | name=os.path.join('dist', 'pypdfocr')) 22 | -------------------------------------------------------------------------------- /pypdfocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/pypdfocr/__init__.py -------------------------------------------------------------------------------- /pypdfocr/pypdfocr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # Copyright 2013 Virantha Ekanayake All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
def error(text):
    """Print *text* prefixed with ``ERROR:`` and exit the process with status -1."""
    print("ERROR: %s" % text)
    sys.exit(-1)


# decorator to retry multiple times
def retry(count=5, exc_type=Exception):
    """Build a decorator that retries the wrapped callable.

    The wrapped function is called up to *count* times (at least once).  An
    exception of type *exc_type* triggers another attempt; the exception from
    the final attempt is re-raised to the caller.  Any other exception type
    propagates immediately.

    :param count: Maximum number of attempts
    :type count: int
    :param exc_type: Exception class (or tuple of classes) that triggers a retry
    :returns: decorator
    """
    attempts = max(count, 1)

    def decorator(func):
        @wraps(func)
        def result(*args, **kwargs):
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except exc_type:
                    # Re-raise from inside the handler so the original
                    # traceback is kept (a bare raise after the loop has no
                    # active exception on Python 3).
                    if attempt == attempts - 1:
                        raise
        return result
    return decorator


@retry(count=6, exc_type=IOError)
def open_file_with_timeout(parser, arg):
    """Open *arg* for reading, retrying on IOError.

    :param parser: Unused; kept so the function can be called from the argparse ``type=`` hook
    :param arg: Filename to open
    :returns: open file object (the caller is responsible for closing it)
    """
    f = open(arg, 'r')
    return f

"""
    Make scanned PDFs searchable using Tesseract-OCR and autofile them
.. automodule:: pypdfocr
    :private-members:
"""


class PyPDFOCR(object):
    """
        The main class.  Performs the following functions:

        * Parses command line options
        * Optionally just watches a directory for new PDF's to OCR; once a file appears, it does the next step
        * Runs a single file conversion:
            * Runs ghostscript to get tiff/jpg
            * Runs Tesseract-OCR to do the actual OCR
            * Takes the HOCR from Tesseract and creates a new PDF with the text overlay
        * Files the OCR'ed file in the proper place if specified
        * Files the original file if specified
    """

    def __init__(self):
        """ Start with an empty configuration; the helper objects are created
            later by :func:`_setup_external_tools`.
        """
        self.config = {}

    def _get_config_file(self, config_file):
        """
           Read in the yaml config file

           :param config_file: Configuration file (YAML format)
           :type config_file: file
           :returns: dict of yaml file (empty dict for an empty file)
           :rtype: dict
        """
        with config_file:
            # safe_load only builds plain Python types, so a config file can
            # never instantiate arbitrary objects.
            myconfig = yaml.safe_load(config_file)
        # An empty document parses to None; callers expect a dict they can .get() on
        return myconfig or {}

    def get_options(self, argv):
        """
            Parse the command-line options and set the following object properties:

            :param argv: usually just sys.argv[1:]
            :returns: Nothing

            :ivar debug: Enable logging debug statements
            :ivar verbose: Enable verbose logging
            :ivar enable_filing: Whether to enable post-OCR filing of PDFs
            :ivar pdf_filename: Filename for single conversion mode
            :ivar watch_dir: Directory to watch for files to convert
            :ivar config: Dict of the config file
            :ivar watch: Whether folder watching mode is turned on
            :ivar enable_evernote: Enable filing to evernote
            :ivar skip_preprocess: True unless ``--preprocess`` was given

        """
        p = argparse.ArgumentParser(
            description="Convert scanned PDFs into their OCR equivalent. Depends on GhostScript and Tesseract-OCR being installed.",
            epilog="PyPDFOCR version %s (Copyright 2013 Virantha Ekanayake)" % __version__,
        )

        p.add_argument('-d', '--debug', action='store_true',
                       default=False, dest='debug', help='Turn on debugging')

        p.add_argument('-v', '--verbose', action='store_true',
                       default=False, dest='verbose', help='Turn on verbose mode')

        p.add_argument('-m', '--mail', action='store_true',
                       default=False, dest='mail', help='Send email after conversion')

        p.add_argument('-l', '--lang',
                       default='eng', dest='lang', help='Language(default eng)')

        p.add_argument('--preprocess', action='store_true',
                       default=False, dest='preprocess', help='Enable preprocessing. Not really useful now with improved Tesseract 3.04+')

        p.add_argument('--skip-preprocess', action='store_true',
                       default=False, dest='skip_preprocess', help='DEPRECATED: always skips now.')

        #---------
        # Single or watch mode
        #--------
        single_or_watch_group = p.add_mutually_exclusive_group(required=True)
        # Positional argument for single file conversion
        single_or_watch_group.add_argument("pdf_filename", nargs="?", help="Scanned pdf file to OCR")
        # Watch directory for watch mode
        single_or_watch_group.add_argument('-w', '--watch',
                                           dest='watch_dir', help='Watch given directory and run ocr automatically until terminated')

        #-----------
        # Filing options
        #----------
        filing_group = p.add_argument_group(title="Filing options")
        filing_group.add_argument('-f', '--file', action='store_true',
                                  default=False, dest='enable_filing', help='Enable filing of converted PDFs')
        # The config file is opened through the retrying helper rather than argparse.FileType
        filing_group.add_argument('-c', '--config', type=lambda x: open_file_with_timeout(p, x),
                                  dest='configfile', help='Configuration file for defaults and PDF filing')
        filing_group.add_argument('-e', '--evernote', action='store_true',
                                  default=False, dest='enable_evernote', help='Enable filing to Evernote')
        filing_group.add_argument('-n', action='store_true',
                                  default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder')

        # Add flow option to single mode extract_images,preprocess,ocr,write

        args = p.parse_args(argv)

        self.debug = args.debug
        self.verbose = args.verbose
        self.pdf_filename = args.pdf_filename
        self.lang = args.lang
        self.watch_dir = args.watch_dir
        self.enable_email = args.mail
        self.match_using_filename = args.match_using_filename

        # Deprecating skip_preprocess to make skipping the default (always true).  Tesseract 3.04 is so much better now
        # at handling non-ideal inputs and lines
        if args.skip_preprocess:
            print("Warning: --skip-preprocess is not needed anymore (defaults to skipping preprocessing). If you want to enable preprocessing, use the new --preprocess option")
        self.skip_preprocess = not args.preprocess

        # basicConfig only takes effect on its first call, so --debug wins
        # when both --debug and --verbose are given.
        if self.debug:
            logging.basicConfig(level=logging.DEBUG, format='%(message)s')

        if self.verbose:
            logging.basicConfig(level=logging.INFO, format='%(message)s')

        # Parse configuration file (YAML) if specified
        if args.configfile:
            self.config = self._get_config_file(args.configfile)
            logging.debug("Read in configuration file")
            logging.debug(self.config)

        self.enable_evernote = bool(args.enable_evernote)

        # Filing to Evernote implies filing
        if args.enable_filing or args.enable_evernote:
            self.enable_filing = True
            if not args.configfile:
                p.error("Please specify a configuration file(CONFIGFILE) to enable filing")
        else:
            self.enable_filing = False

        self.watch = False

        if args.watch_dir:
            logging.debug("Starting to watch")
            self.watch = True

        if self.enable_email:
            if not args.configfile:
                p.error("Please specify a configuration file(CONFIGFILE) to enable email")

    def _clean_up_files(self, files):
        """
            Helper function to delete files.  Files that cannot be removed
            (already gone, permission problems, ...) are logged and skipped.

            :param files: List of files to delete
            :type files: list
            :returns: None
        """
        for f in files:
            try:
                os.remove(f)
            except OSError:
                logging.debug("Error removing file %s .... continuing" % f)

    def _setup_filing(self):
        """
            Instance the proper PyFiler object (either
            :class:`pypdfocr.pypdfocr_filer_dirs.PyFilerDirs` or
            :class:`pypdfocr.pypdfocr_filer_evernote.PyFilerEvernote`)

            TODO: Make this more generic to allow third-party plugin filing objects

            :ivar filer: :class:`pypdfocr.pypdfocr_filer.PyFiler` PyFiler subclass object that is instantiated
            :ivar pdf_filer: :class:`pypdfocr.pypdfocr_pdffiler.PyPdfFiler` object to help with PDF reading
            :returns: Nothing

        """
        # Look at self.config and create a self.pdf_filer object

        # --------------------------------------------------
        # Some sanity checks
        # --------------------------------------------------
        assert(self.config and self.enable_filing)
        for required in ['target_folder', 'default_folder']:
            if required not in self.config:
                error("%s must be specified in config file" % required)
            else:
                # Make sure these required folders are in abspath format
                self.config[required] = os.path.abspath(self.config[required])

        orig = 'original_move_folder'
        if orig in self.config:
            # User wants to move the original after filing
            self.config[orig] = os.path.abspath(self.config[orig])
            if not os.path.exists(self.config[orig]):
                os.makedirs(self.config[orig])
            original_move_folder = self.config[orig]
        else:
            original_move_folder = None

        # --------------------------------------------------
        # Start the filing object
        # --------------------------------------------------
        if self.enable_evernote:
            if 'evernote_developer_token' not in self.config:
                error("evernote_developer_token must be specified in config file")
            self.filer = PyFilerEvernote(self.config['evernote_developer_token'])
        else:
            self.filer = PyFilerDirs()

        self.filer.target_folder = self.config['target_folder']
        self.filer.default_folder = self.config['default_folder']
        self.filer.original_move_folder = original_move_folder

        self.pdf_filer = PyPdfFiler(self.filer)
        if self.match_using_filename:
            print("Matching using filename as a fallback to pdf contents")
            self.pdf_filer.file_using_filename = True

        # ------------------------------
        # Add all the folder names with associated keywords
        # to the filer object
        # ------------------------------
        keyword_count = 0
        folder_count = 0
        if 'folders' in self.config:
            for folder, keywords in self.config['folders'].items():
                folder_count += 1
                # A folder listed without keywords parses to None in YAML.
                # Keywords are lower-cased (and stringified, so integer
                # keywords work) before adding.
                keywords = [str(x).lower() for x in (keywords or [])]
                keyword_count += len(keywords)
                self.filer.add_folder_target(folder, keywords)

        print("Filing of PDFs is enabled")
        print("   - %d target filing folders" % (folder_count))
        print("   - %d keywords" % (keyword_count))

    def _setup_external_tools(self):
        """
            Instantiate the external tool wrappers with their config dicts
            (the ``ghostscript``, ``tesseract`` and ``preprocess`` sections of
            the config file, each defaulting to an empty dict).
        """
        self.gs = PyGs(self.config.get('ghostscript', {}))
        self.ts = PyTesseract(self.config.get('tesseract', {}))
        self.pdf = PyPdf(self.gs)
        self.preprocess = PyPreprocess(self.config.get('preprocess', {}))

    def run_conversion(self, pdf_filename):
        """
            Does the following:

            - Convert the PDF using GhostScript to TIFF and JPG
            - Run Tesseract on the TIFF to extract the text into HOCR (html)
            - Use PDF generator to overlay the text on the JPG and output a new PDF
            - Clean up temporary image files

            :param pdf_filename: Scanned PDF
            :type pdf_filename: string
            :returns: OCR'ed PDF
            :rtype: filename string
        """
        print("Starting conversion of %s" % pdf_filename)
        # Make the images for Tesseract.  A failure here propagates before the
        # cleanup section below is entered.
        img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename)
        fns = glob.glob(glob_img_filename)

        # Stays None if preprocessing itself fails, so cleanup can tell
        preprocess_imagefilenames = None
        try:
            # Preprocess
            if not self.skip_preprocess:
                preprocess_imagefilenames = self.preprocess.preprocess(fns)
            else:
                logging.info("Skipping preprocess step")
                preprocess_imagefilenames = fns
            # Run tesseract
            self.ts.lang = self.lang
            hocr_filenames = self.ts.make_hocr_from_pnms(preprocess_imagefilenames)

            # Generate new pdf with overlayed text
            ocr_pdf_filename = self.pdf.overlay_hocr_pages(img_dpi, hocr_filenames, pdf_filename)

        finally:
            # Clean up the files (kept when --debug is on)
            # NOTE(review): the one second pause presumably lets the external
            # tools release their file handles -- confirm before removing.
            time.sleep(1)
            if not self.debug:
                # Need to clean up the original image files before preprocessing
                logging.info("Cleaning up %s" % fns)
                self._clean_up_files(fns)

                if preprocess_imagefilenames is not None:
                    logging.info("Cleaning up %s" % preprocess_imagefilenames)
                    self._clean_up_files(preprocess_imagefilenames)
                    # Tesseract output sits next to each image; newer
                    # versions also write a .txt file with the OCR text.
                    for ext in [".hocr", ".html", ".txt"]:
                        fns_to_remove = [os.path.splitext(fn)[0] + ext for fn in preprocess_imagefilenames]
                        logging.info("Cleaning up %s" % fns_to_remove)
                        self._clean_up_files(fns_to_remove)

        print("Completed conversion successfully to %s" % ocr_pdf_filename)
        return ocr_pdf_filename

    def file_converted_file(self, ocr_pdffilename, original_pdffilename):
        """ Move the converted file to its destination directory.  Optionally also
            moves the original PDF.

            :param ocr_pdffilename: Converted PDF file
            :type ocr_pdffilename: filename string
            :param original_pdffilename: Original scanned PDF file
            :type original_pdffilename: filename string
            :returns: Target folder name
            :rtype: string
        """
        filed_path = self.pdf_filer.move_to_matching_folder(ocr_pdffilename)
        print("Filed %s to %s as %s" % (ocr_pdffilename, os.path.dirname(filed_path), os.path.basename(filed_path)))

        # file_original returns the input name unchanged when nothing was moved
        tgt_path = self.pdf_filer.file_original(original_pdffilename)
        if tgt_path != original_pdffilename:
            print("Filed original file %s to %s as %s" % (original_pdffilename, os.path.dirname(tgt_path), os.path.basename(tgt_path)))
        return os.path.dirname(filed_path)

    def _send_email(self, infilename, outfilename, filing):
        """
            Send a conversion status email using smtp.  The ``mail_*`` keys of
            the config file supply the addresses, server and credentials.

            :param infilename: Original scanned PDF
            :param outfilename: Converted PDF
            :param filing: Folder the converted PDF was filed to (or "None")
        """
        print("Sending email status")
        from_addr = self.config["mail_from_addr"]
        to_addr_list = self.config["mail_to_list"]
        smtpserver = self.config["mail_smtp_server"]
        login = self.config["mail_smtp_login"]
        password = self.config["mail_smtp_password"]

        subject = "PyPDFOCR converted: %s" % (os.path.basename(outfilename))
        # NOTE(review): the From header carries the smtp login while the
        # envelope sender below is mail_from_addr -- confirm this is intended.
        header = 'From: %s\n' % login
        header += 'To: %s\n' % ','.join(to_addr_list)
        header += 'Subject: %s\n\n' % subject
        message = """
        PyPDFOCR Conversion:
        --------------------
        Original file: %s
        Converted file: %s
        Filing: %s
        """ % (infilename, outfilename, filing)
        message = header + message

        server = smtplib.SMTP(smtpserver)
        try:
            server.starttls()
            server.login(login, password)
            problems = server.sendmail(from_addr, to_addr_list, message)
            if problems:
                # sendmail returns a dict of the recipients that were refused
                logging.info("Email was refused for: %s" % problems)
        finally:
            # Always close the connection, even if the handshake or send failed
            server.quit()

    def go(self, argv):
        """
            The main entry point into PyPDFOCR

            #. Parses options
            #. If filing is enabled, call :func:`_setup_filing`
            #. If watch is enabled, start the watcher
            #. :func:`run_conversion`
            #. if filing is enabled, call :func:`file_converted_file`
        """
        # Read the command line options
        self.get_options(argv)

        # Setup tesseract and ghostscript
        self._setup_external_tools()

        # Setup the pdf filing if enabled
        if self.enable_filing:
            self._setup_filing()

        # Do the actual conversion followed by optional filing and email
        if self.watch:
            while True:  # Make sure the watcher doesn't terminate
                py_watcher = None
                try:
                    py_watcher = PyPdfWatcher(self.watch_dir, self.config.get('watch'))
                    for pdf_filename in py_watcher.start():
                        self._convert_and_file_email(pdf_filename)
                except KeyboardInterrupt:
                    break
                except Exception:
                    # Report the failure, stop this watcher and loop around to start a new one
                    traceback.print_exc()
                    if py_watcher is not None:
                        py_watcher.stop()
        else:
            self._convert_and_file_email(self.pdf_filename)

    def _convert_and_file_email(self, pdf_filename):
        """
            Helper function to run the conversion, then do the optional filing, and optional emailing.
        """
        ocr_pdffilename = self.run_conversion(pdf_filename)
        if self.enable_filing:
            filing = self.file_converted_file(ocr_pdffilename, pdf_filename)
        else:
            filing = "None"

        if self.enable_email:
            self._send_email(pdf_filename, ocr_pdffilename, filing)


def main():  # pragma: no cover
    """Console entry point: run PyPDFOCR on ``sys.argv[1:]``."""
    # Required for frozen (pyinstaller) executables that use multiprocessing
    multiprocessing.freeze_support()
    script = PyPDFOCR()
    script.go(sys.argv[1:])


if __name__ == '__main__':
    main()
class PyFiler(object):
    """ Abstract base class for defining filing objects, whether you want to
        save to a file-system/directory structure or to something like Evernote

    """
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def move_to_matching_folder(self, filename, foldername=None):
        """ Move the file given by filename to the proper location.
            The destination is built from :py:attr:`target_folder` and
            *foldername*.  If there is no matching location (*foldername* is
            empty), then use :py:attr:`default_folder`

            :param filename: File to move
            :type filename: string
            :param foldername: Name of the matched folder, or None/empty if nothing matched
            :type foldername: string
            :returns: Full path+filename of destination
            :rtype: string
        """

    @abc.abstractmethod
    def file_original(self, original_filename):
        """ Move the original file given by filename to the proper location.
            You will need to use :py:attr:`original_move_folder`

            :param original_filename: File to move
            :type original_filename: string
            :returns: Full path+filename of destination(original_filename if not moved)
            :rtype: string
        """

    @abc.abstractmethod
    def add_folder_target(self, folder, keywords):
        """ Add a target folder for a list of keywords """

    def _get_unique_filename_by_appending_version_integer(self, tgtfilename):
        """ Return a filename that does not exist yet.

            If *tgtfilename* is free it is returned unchanged; otherwise
            ``_1``, ``_2``, ... is inserted before the extension until an
            unused name is found.

            :param tgtfilename: Desired full path+filename
            :type tgtfilename: string
            :returns: Unused full path+filename
            :rtype: string
        """
        if os.path.exists(tgtfilename):
            logging.info("File %s already exists in target directory %s" % (os.path.basename(tgtfilename), os.path.dirname(tgtfilename)))
            # First, try appending a _1 to it
            num = 1
            dr, fn, ext = self._split_filename_dir_filename_ext(tgtfilename)
            tgtfilename = os.path.join(dr, "%s_%d%s" % (fn, num, ext))
            while os.path.exists(tgtfilename):
                # Add an incrementing integer to the end of the filename and loop until we find a new filename
                num += 1
                tgtfilename = os.path.join(dr, "%s_%d%s" % (fn, num, ext))
                logging.info("Trying %s" % tgtfilename)
            logging.info("Using name %s instead for copying to target directory %s" % (os.path.basename(tgtfilename), os.path.dirname(tgtfilename)))
        return tgtfilename

    def _split_filename_dir_filename_ext(self, filename):
        """ Split *filename* into (directory, name without extension, extension).

            Only the last extension is split off, and it keeps its leading dot.
        """
        dr, fn = os.path.split(filename)  # Get directory and filename
        fn_no_ext, ext = os.path.splitext(fn)  # Get filename plus extension
        return dr, fn_no_ext, ext

    # The accessors are plain methods (not just properties) so that subclasses
    # can override them and rebuild the property from the overridden pair.
    def get_target_folder(self):
        return self._target_folder

    def set_target_folder(self, target_folder):
        self._target_folder = target_folder

    def get_default_folder(self):
        return self._default_folder

    def set_default_folder(self, default_folder):
        self._default_folder = default_folder

    def get_original_move_folder(self):
        return self._original_move_folder

    def set_original_move_folder(self, original_move_folder):
        self._original_move_folder = original_move_folder

    def get_folder_targets(self):
        return self._folder_targets

    def set_folder_targets(self, folder_targets):
        self._folder_targets = folder_targets

    target_folder = property(get_target_folder, set_target_folder)
    default_folder = property(get_default_folder, set_default_folder)
    original_move_folder = property(get_original_move_folder, set_original_move_folder)

    folder_targets = property(get_folder_targets, set_folder_targets)
    """ Data structure for mapping a keyword to a folder target.  Usually just a dict, and new mappings
        are added from :py:func:`add_folder_target`
    """
class PyFilerDirs(PyFiler):
    """ Filer that works on the file system: matched PDFs are moved into
        sub-directories of :py:attr:`target_folder`.
    """

    def __init__(self):
        self.target_folder = None
        self.default_folder = None
        self.original_move_folder = None
        self.folder_targets = {}

    def add_folder_target(self, folder, keywords):
        """ Register *keywords* for *folder*; a folder may only be registered once. """
        assert folder not in self.folder_targets, "Target folder already defined! (%s)" % (folder)
        self.folder_targets[folder] = keywords

    def file_original(self, original_filename):
        """ Move the original file into :py:attr:`original_move_folder`.

            :returns: New path of the original, or *original_filename* itself
                      when no move folder is configured
        """
        destination_dir = self.original_move_folder
        if not destination_dir:
            logging.debug("Leaving original untouched")
            return original_filename

        logging.debug("Moving original %s to %s" % (original_filename, destination_dir))
        candidate = os.path.join(destination_dir, os.path.basename(original_filename))
        # Never overwrite an existing file of the same name
        destination = self._get_unique_filename_by_appending_version_integer(candidate)

        shutil.move(original_filename, destination)
        return destination

    def move_to_matching_folder(self, filename, foldername):
        """ Move *filename* into ``target_folder/foldername`` (or into the
            default folder when *foldername* is empty), creating the directory
            if needed.

            :returns: Full path+filename of the moved file
        """
        assert self.target_folder is not None
        assert self.default_folder is not None

        if foldername:
            logging.info("[MATCH] %s --> %s" % (filename, foldername))
            subfolder = foldername
        else:
            logging.info("[DEFAULT] %s --> %s" % (filename, self.default_folder))
            subfolder = self.default_folder
        destination_dir = os.path.join(self.target_folder, subfolder)

        if not os.path.exists(destination_dir):
            logging.debug("Making path %s" % destination_dir)
            os.makedirs(destination_dir)

        logging.debug("Moving %s to %s" % (filename, destination_dir))
        candidate = os.path.join(destination_dir, os.path.basename(filename))
        # Never overwrite an existing file of the same name
        destination = self._get_unique_filename_by_appending_version_integer(candidate)

        shutil.move(filename, destination)
        return destination
class en_handle(object):
    """ Generic exception handler for Evernote actions.

        Used as a method decorator: the wrapped method is attempted up to
        three times.  When Evernote reports an expired authorization the
        decorated object's ``_connect_to_evernote`` is called again before the
        next attempt.  If every attempt fails, None is returned.
    """
    def __init__(self, f):
        # f is the method being decorated, so save it so we can call it later!
        self.f = f
        functools.update_wrapper(self, f)

    def __get__(self, instance, owner):
        # Remember the most recent owner/instance for direct __call__ use
        self.cls = owner
        self.obj = instance
        # Bind the instance per access.  The decorator object is shared by
        # every instance of the class, so a bound method captured earlier
        # must not be redirected by a later attribute access.
        return functools.partial(self._invoke, instance)

    def __call__(self, *args, **kwargs):
        # Direct call on the decorator: run against the last accessed instance
        return self._invoke(self.obj, *args, **kwargs)

    def _invoke(self, obj, *args, **kwargs):
        """ Call the decorated method on *obj* with retries.

            :returns: Result of the decorated method, or None if all attempts failed
        """
        msg = "EVERNOTE ERROR: %s"
        retry_auth = False
        r = None
        for _ in range(3):
            try:
                if retry_auth:
                    logging.debug("Retrying")
                    obj._connect_to_evernote(obj.dictUserInfo)
                    retry_auth = False
                logging.debug("executing user function")
                r = self.f(obj, *args, **kwargs)
                break
            except EDAMUserException as e:
                err = e.errorCode
                c = EDAMErrorCode
                if err == c.AUTH_EXPIRED or err == c.DATA_REQUIRED:
                    logging.debug(msg % "Authorization expired, retrying...")
                    retry_auth = True
                    time.sleep(3)
                else:
                    # .get so an unknown code cannot raise from inside the handler
                    logging.debug(msg % ("Unhandled error %s:%s" % (c._VALUES_TO_NAMES.get(err, err), e.parameter)))
        return r
self._default_folder = default_folder 108 | 109 | default_folder = property(get_default_folder, set_default_folder) 110 | 111 | def __init__(self, dev_token): 112 | self.target_folder = None 113 | self.default_folder = None 114 | self.original_move_folder = None 115 | self.folder_targets = {} 116 | self.dictUserInfo = { 'dev_token': dev_token } 117 | self._connect_to_evernote(self.dictUserInfo) 118 | 119 | def _connect_to_evernote(self, dictUserInfo): 120 | """ 121 | Establish a connection to evernote and authenticate. 122 | 123 | :param dictUserInfo: Dict of user info like user/passwrod. For now, just the dev token 124 | :returns success: Return wheter connection succeeded 125 | :rtype bool: 126 | """ 127 | print("Authenticating to Evernote") 128 | dev_token = dictUserInfo['dev_token'] 129 | logging.debug("Authenticating using token %s" % dev_token) 130 | user = None 131 | try: 132 | self.client = EvernoteClient(token=dev_token, sandbox=False) 133 | self.user_store = self.client.get_user_store() 134 | user = self.user_store.getUser() 135 | except EDAMUserException as e: 136 | err = e.errorCode 137 | print("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.parameter)) 138 | except EDAMSystemException as e: 139 | err = e.errorCode 140 | print("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.message)) 141 | sys.exit(-1) 142 | 143 | if user: 144 | print("Authenticated to evernote as user %s" % user.username) 145 | return True 146 | 147 | def add_folder_target(self, folder, keywords): 148 | assert folder not in self.folder_targets, "Target folder already defined! 
(%s)" % (folder) 149 | self.folder_targets[folder] = keywords 150 | 151 | def file_original(self, original_filename): 152 | """ 153 | Just file it to the local file system (don't upload to evernote) 154 | """ 155 | if not self.original_move_folder: 156 | logging.debug("Leaving original untouched") 157 | return original_filename 158 | 159 | tgt_path = self.original_move_folder 160 | logging.debug("Moving original %s to %s" % (original_filename, tgt_path)) 161 | tgtfilename = os.path.join(tgt_path, os.path.basename(original_filename)) 162 | tgtfilename = self._get_unique_filename_by_appending_version_integer(tgtfilename) 163 | 164 | shutil.move(original_filename, tgtfilename) 165 | return tgtfilename 166 | 167 | @en_handle 168 | def _get_notebooks(self): 169 | note_store = self.client.get_note_store() 170 | notebooks = note_store.listNotebooks() 171 | return {n.name:n for n in notebooks} 172 | 173 | @en_handle 174 | def _create_notebook(self, notebook): 175 | note_store = self.client.get_note_store() 176 | return note_store.createNotebook(notebook) 177 | 178 | def _update_notebook(self, notebook): 179 | note_store = self.client.get_note_store() 180 | note_store.updateNotebook(notebook) 181 | return 182 | 183 | @en_handle 184 | def _check_and_make_notebook(self, notebook_name): 185 | """ 186 | Weird. 
187 | :returns notebook: New or existing notebook object 188 | :rtype Types.Notebook: 189 | """ 190 | # Get the noteStore 191 | #note_store = self.client.get_note_store() 192 | #notebooks = note_store.listNotebooks() 193 | #notebooks = {n.name:n for n in notebooks} 194 | notebooks = self._get_notebooks() 195 | if notebook_name in notebooks: 196 | notebook = notebooks[notebook_name] 197 | if notebook.stack != self.target_folder: 198 | notebook.stack = self.target_folder 199 | self._update_notebook(notebook) 200 | return notebook 201 | else: 202 | # Need to create a new notebook 203 | notebook = Types.Notebook() 204 | notebook.name = notebook_name 205 | notebook.stack = self.target_folder 206 | notebook = self._create_notebook(notebook) 207 | #notebook = note_store.createNotebook(notebook) 208 | return notebook 209 | 210 | @en_handle 211 | def _create_evernote_note(self, notebook, filename): 212 | # Create the new note 213 | note = Types.Note() 214 | note.title = os.path.basename(filename) 215 | note.notebookGuid = notebook.guid 216 | note.content = '' 217 | note.content += 'Uploaded by PyPDFOCR
' 218 | 219 | 220 | logging.debug("Loading PDF") 221 | md5 = hashlib.md5() 222 | with open(filename,'rb') as f: 223 | pdf_bytes = f.read() 224 | 225 | logging.debug("Calculating md5 checksum of pdf") 226 | md5.update(pdf_bytes) 227 | md5hash = md5.hexdigest() 228 | 229 | logging.debug("Uploading note") 230 | 231 | # Create the Data type for evernote that goes into a resource 232 | pdf_data = Types.Data() 233 | pdf_data.bodyHash = md5hash 234 | pdf_data.size = len(pdf_bytes) 235 | pdf_data.body = pdf_bytes 236 | 237 | # Add a link in the evernote boy for this content 238 | link = '' % md5hash 239 | logging.debug(link) 240 | note.content += link 241 | note.content += '
' 242 | 243 | resource_list = [] 244 | pdf_resource = Types.Resource() 245 | pdf_resource.data = pdf_data 246 | pdf_resource.mime = "application/pdf" 247 | # TODO: Enable filename 248 | # Make a attributes for this resource 249 | pdf_resource.attributes = Types.ResourceAttributes() 250 | pdf_resource.attributes.fileName = os.path.basename(filename) 251 | resource_list.append(pdf_resource) 252 | 253 | note.resources = resource_list 254 | 255 | return note 256 | 257 | 258 | def move_to_matching_folder(self, filename, foldername): 259 | """ 260 | Use the evernote API to create a new note: 261 | 262 | #. Make the notebook if it doesn't exist (:func:`_check_and_make_notebook`) 263 | #. Create the note (:func:`_create_evernote_note`) 264 | #. Upload note using API 265 | 266 | """ 267 | assert self.target_folder != None 268 | assert self.default_folder != None 269 | 270 | if not foldername: 271 | logging.info("[DEFAULT] %s --> %s" % (filename, self.default_folder)) 272 | foldername = self.default_folder 273 | else: 274 | logging.info("[MATCH] %s --> %s" % (filename, foldername)) 275 | 276 | # Check if the evernote notebook exists 277 | print ("Checking for notebook named %s" % foldername) 278 | notebook = self._check_and_make_notebook(foldername) 279 | print("Uploading %s to %s" % (filename, foldername)) 280 | 281 | note = self._create_evernote_note(notebook, filename) 282 | 283 | # Store the note in evernote 284 | note_store = self.client.get_note_store() 285 | note = note_store.createNote(note) 286 | os.remove(filename) 287 | 288 | return "%s/%s" % (notebook.name, note.title) 289 | 290 | 291 | if __name__ == '__main__': # pragma: no cover 292 | logging.basicConfig(level=logging.DEBUG, format='%(message)s') 293 | logging.basicConfig(level=logging.INFO, format='%(message)s') 294 | p = PyFilerEvernote() 295 | p.add_folder_target("auto", ['dmv']) 296 | p.target_folder = 'myuploads' 297 | p.default_folder = 'default' 298 | p.original_move_folder = None 299 | 300 | 
# (last statement of the previous file's ``__main__`` demo block)
#     p.move_to_matching_folder('../dmv/dmv_ocr.pdf', 'auto')
# -------- /pypdfocr/pypdfocr_gs.py: --------
#!/usr/bin/env python2.7

# Copyright 2013 Virantha Ekanayake All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.



"""
    Wrap ghostscript calls. Yes, this is ugly.
"""

import subprocess
import sys, os
import logging
import glob

def error(text):
    """Print ``text`` as an error and terminate with exit status -1."""
    print("ERROR: %s" % text)
    # sys.exit rather than the bare ``exit`` helper: the latter is injected by
    # the ``site`` module and is not guaranteed to exist (e.g. frozen builds).
    sys.exit(-1)

class PyGs(object):
    """Class to wrap all the ghostscript calls"""

    def __init__(self, config):
        """Pick the Ghostscript binary and set up the per-format options.

        :param config: dict; ``threads`` (default 4) and ``binary`` (explicit
                       executable location) are the keys read here
        """
        self.msgs = {
            'GS_FAILED': 'Ghostscript execution failed',
            'GS_MISSING_PDF': 'Cannot find specified pdf file',
            'GS_OUTDATED': 'Your Ghostscript version is probably out of date. Please upgrade to the latest version',
            'GS_MISSING_BINARY': 'Could not find Ghostscript in the usual place; please specify it using your config file',
        }
        self.threads = config.get('threads',4)

        if "binary" in config:  # Override location of binary
            binary = config['binary']
            if os.name == 'nt':
                # The command is run through the shell, so quote and escape it
                binary = '"%s"' % binary
                binary = binary.replace("\\", "\\\\")
            logging.info("Setting location for executable to %s" % (binary))
        else:
            if str(os.name) == 'nt':
                win_binary = self._find_windows_gs()
                binary = '"%s"' % win_binary
                logging.info("Using Ghostscript: %s" % binary)
            else:
                binary = "gs"
        self.binary = binary

        #self.tiff_dpi = 300
        self.output_dpi = 300
        self.greyscale = True
        # Tiff is used for the ocr, so just fix it at 300dpi
        #  The other formats will be used to create the final OCR'ed image, so determine
        #  the DPI by using pdfimages if available, o/w default to 200
        self.gs_options = {'tiff': ['tiff', ['-sDEVICE=tiff24nc','-r%(dpi)s' ]],
                           'jpg': ['jpg', ['-sDEVICE=jpeg','-dJPEGQ=75', '-r%(dpi)s']],
                           'jpggrey': ['jpg', ['-sDEVICE=jpeggray', '-dJPEGQ=75', '-r%(dpi)s']],
                           'png': ['png', ['-sDEVICE=png16m', '-r%(dpi)s']],
                           'pnggrey': ['png', ['-sDEVICE=pngmono', '-r%(dpi)s']],
                           'tifflzw': ['tiff', ['-sDEVICE=tifflzw', '-r%(dpi)s']],
                           'tiffg4': ['tiff', ['-sDEVICE=tiffg4', '-r%(dpi)s']],
                           'pnm': ['pnm', ['-sDEVICE=pnmraw', '-r%(dpi)s']],
                           'pgm': ['pgm', ['-sDEVICE=pgm', '-r%(dpi)s']],
                           }

    def _find_windows_gs(self, windirs=None):
        """
            Searches through the Windows program files directories to find Ghostscript.
            If it finds multiple versions, it does a naive sort for now to find the most
            recent.

            The search uses full paths and never changes the working directory,
            so a version directory without an executable no longer breaks the
            lookup of the remaining ones.

            :param windirs: directories to search; defaults to the two
                            ``Program Files`` Ghostscript locations
            :rval: The ghostscript binary location (absolute path).  When
                   nothing is found, :func:`error` terminates the process.
        """
        if windirs is None:
            windirs = ["c:\\Program Files\\gs", "c:\\Program Files (x86)\\gs"]

        for d in windirs:
            if not os.path.exists(d):
                continue

            # Find all possible gs* sub-directories
            # TODO: Make this a natural sort
            versions = sorted((x for x in os.listdir(d) if x.startswith('gs')),
                              reverse=True)
            for bindir in versions:
                binpath = os.path.join(d, bindir, 'bin')
                if not os.path.exists(binpath):
                    continue
                # Look for gswin64c.exe or gswin32c.exe (the c is for the command-line version)
                gswin = glob.glob(os.path.join(binpath, 'gswin*c.exe'))
                if not gswin:
                    continue
                # Just use the first found .exe
                return os.path.abspath(gswin[0])

        error(self.msgs['GS_MISSING_BINARY'])

    def _warn(self, msg):
        """Print ``msg`` as a warning."""
        print("WARNING: %s" % msg)

    def _get_dpi(self, pdf_filename):
        """Update ``self.output_dpi`` / ``self.greyscale`` from the PDF's first image.

        Runs ``pdfimages -list`` and ImageMagick ``identify``.  On any failure
        a warning is printed and the current values are left in place.  A
        missing ``pdf_filename`` terminates the process via :func:`error`.
        """
        if not os.path.exists(pdf_filename):
            error(self.msgs['GS_MISSING_PDF'] + " %s" % pdf_filename)

        cmd = 'pdfimages -list "%s"' % pdf_filename
        logging.info("Running pdfimages to figure out DPI...")
        logging.debug(cmd)
        try:
            # universal_newlines: read the output as text so the string
            # comparisons below hold regardless of interpreter version
            out = subprocess.check_output(cmd, shell=True, universal_newlines=True)
        except subprocess.CalledProcessError:
            self._warn ("Could not execute pdfimages to calculate DPI (try installing xpdf or poppler?), so defaulting to %sdpi" % self.output_dpi)
            return

        # Need the third line of output (first image row after the two header lines)
        # Make sure it exists (in case this is an empty pdf)
        lines = out.splitlines()
        if len(lines)<3:
            self._warn("Empty pdf, cannot determine dpi using pdfimages")
            return
        logging.debug(lines[2])
        fields = lines[2].split()
        unparseable = "Could not understand output of pdfimages, please rerun with -d option and file an issue at http://github.com/virantha/pypdfocr/issues"
        if len(fields) < 6 or fields[2] != 'image':
            self._warn(unparseable)
            return
        try:
            x_pt, y_pt = int(fields[3]), int(fields[4])
        except ValueError:
            self._warn(unparseable)
            return
        self.greyscale = fields[5]=='gray'

        # Now, run imagemagick identify to get pdf width/height/density
        cmd = 'identify -format "%%w %%x %%h %%y\n" "%s"' % pdf_filename
        try:
            out = subprocess.check_output(cmd, shell=True, universal_newlines=True)
            results = out.splitlines()[0]
            results = results.replace("Undefined", "")
            width, xdensity, height, ydensity = [float(x) for x in results.split()]
            xdpi = round(x_pt/width*xdensity)
            ydpi = round(y_pt/height*ydensity)
            # Use the larger of the two, but never go below 300
            self.output_dpi = xdpi
            if ydpi>xdpi: self.output_dpi = ydpi
            if self.output_dpi < 300: self.output_dpi = 300
            if abs(xdpi-ydpi) > xdpi*.05: # Make sure the two dpi's are within 5%
                self._warn("X-dpi is %d, Y-dpi is %d, defaulting to %d" % (xdpi, ydpi, self.output_dpi))
            else:
                print("Using %d DPI" % self.output_dpi)

        except Exception as e:
            logging.debug(str(e))
            self._warn ("Could not execute identify to calculate DPI (try installing imagemagick?), so defaulting to %sdpi" % self.output_dpi)
        return

    def _run_gs(self, options, output_filename, pdf_filename):
        """Run Ghostscript; any failure terminates the process via :func:`error`.

        :param options: device/resolution switches, already joined into one string
        :param output_filename: ``-sOutputFile`` pattern
        :param pdf_filename: input PDF
        """
        cmd = '%s -q -dNOPAUSE %s -sOutputFile="%s" "%s" -c quit' % (self.binary, options, output_filename, pdf_filename)
        logging.info(cmd)
        try:
            subprocess.check_output(cmd, shell=True, universal_newlines=True)
        except subprocess.CalledProcessError as e:
            print(e.output)
            if e.output and "undefined in .getdeviceparams" in e.output:
                error(self.msgs['GS_OUTDATED'])
            else:
                error (self.msgs['GS_FAILED'])

    def make_img_from_pdf(self, pdf_filename):
        """Render every page of ``pdf_filename`` to ``<name>_<n>.jpg``.

        :returns: ``(output_dpi, glob_pattern)`` where the pattern matches the
                  images that were written
        """
        self._get_dpi(pdf_filename) # No need to bother anymore

        if not os.path.exists(pdf_filename):
            error(self.msgs['GS_MISSING_PDF'] + " %s" % pdf_filename)

        filename, filext = os.path.splitext(pdf_filename)

        # Create ancillary jpeg files to use later to calculate image dpi etc
        # We no longer use these for the final image. Instead the text is merged
        # directly with the original PDF.  Yay!
        if self.greyscale:
            self.img_format = 'jpggrey'
            #self.img_format = 'pnggrey'
            logging.info("Detected greyscale")
        else:
            self.img_format = 'jpg'
            #self.img_format = 'png'
            logging.info("Detected color")

        self.img_file_ext = self.gs_options[self.img_format][0]

        # The possible output files glob
        globable_filename = '%s_*.%s' % (filename, self.img_file_ext)
        # Delete any img files already existing
        for fn in glob.glob(globable_filename):
            os.remove(fn)

        options = ' '.join(self.gs_options[self.img_format][1]) % {'dpi':self.output_dpi}
        output_filename = '%s_%%d.%s' % (filename, self.img_file_ext)
        self._run_gs(options, output_filename, pdf_filename)
        for fn in glob.glob(globable_filename):
            logging.info("Created image %s" % fn)
        return (self.output_dpi, globable_filename)

# -------- /pypdfocr/pypdfocr_interrupts.py: --------

# Copyright 2015 Virantha Ekanayake All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import signal

# Used for handling keyboard interrupts in Pools.
# Basically, throw an Exception when we see the ctrl-c, so that it actually is
# propagated to the parent class.


class KeyboardInterruptError(Exception):
    """Raised by :func:`signal_handle` in place of ``KeyboardInterrupt``."""


def signal_handle(_signal, frame):
    """Signal handler: log that the job is stopping and raise.

    :param _signal: signal number (unused)
    :param frame: interrupted stack frame (unused)
    :raises KeyboardInterruptError: always
    """
    logging.debug("Stopping job")
    raise KeyboardInterruptError()


def init_worker():
    """ used for catching ctrl-c

        Installs :func:`signal_handle` as the ``SIGINT`` handler of the
        calling process.
    """
    signal.signal(signal.SIGINT, signal_handle)

# -------- /pypdfocr/pypdfocr_multiprocessing.py: --------
#!/usr/bin/env python2.7
# Copyright 2013 Virantha Ekanayake All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Special work-around to support multiprocessing and pyinstaller --onefile on
# windows systems
#
# https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing

import logging
import multiprocessing
import os
import sys

try:
    # Interpreters that split the old ``forking`` module per start method
    if sys.platform.startswith('win'):
        import multiprocessing.popen_spawn_win32 as forking
    else:
        import multiprocessing.popen_fork as forking
except ImportError:
    # Python 2.7 layout
    import multiprocessing.forking as forking


class _Popen(forking.Popen):
    """``Popen`` replacement that exposes ``_MEIPASS2`` while a child starts.

    Only does anything when running frozen (``sys.frozen`` is set); otherwise
    it simply delegates to the stock implementation.
    """

    def __init__(self, *args, **kw):
        if hasattr(sys, 'frozen'):
            # We have to set original _MEIPASS2 value from sys._MEIPASS
            # to get --onefile mode working.
            os.putenv('_MEIPASS2', sys._MEIPASS)
        try:
            super(_Popen, self).__init__(*args, **kw)
        finally:
            if hasattr(sys, 'frozen'):
                # On some platforms (e.g. AIX) 'os.unsetenv()' is not
                # available. In those cases we cannot delete the variable
                # but only set it to the empty string. The bootloader
                # can handle this case.
                if hasattr(os, 'unsetenv'):
                    os.unsetenv('_MEIPASS2')
                else:
                    os.putenv('_MEIPASS2', '')

# Importing this module installs the replacement as a side effect
forking.Popen = _Popen

#class Process(multiprocessing.Process):
    #_Popen = _Popen

# ...

if __name__ == '__main__':
    # On Windows calling this function is necessary.
    multiprocessing.freeze_support()

# -------- /pypdfocr/pypdfocr_pdf.py: --------
#!/usr/bin/env python2.7
# Copyright 2013 Virantha Ekanayake All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Following code is adapted and modified from hocr-pdf.py released under
# Apache License, Version 2.0 available at
# https://code.google.com/p/hocr-tools/source/browse/hocr-pdf
#  - Code was improved to allow multi-page hocr files
"""
    Wrap pdf generation and text addition code
"""

from optparse import OptionParser
import sys, os
import re
import logging
import shutil
import time
import tempfile
import glob

import cStringIO
import base64
import zlib
import math

from cgi import escape
# Pkg to read multiple image tiffs
from PIL import Image
from reportlab.pdfgen.canvas import Canvas
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from xml.etree.ElementTree import ElementTree, ParseError
import xml.etree

# Import Pypdf2
from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter, utils

from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.enums import TA_LEFT
from reportlab.platypus.paragraph import Paragraph

from pypdfocr_util import Retry
from functools import partial

class RotatedPara(Paragraph):
    """
        Used for rotating text, since the low-level rotate method in textobject's don't seem to
        do anything
    """

    def __init__ (self, text, style, angle):
        Paragraph.__init__(self, text, style)
        self.angle = angle

    def draw(self):
        """Draw the paragraph rotated by ``self.angle`` on the canvas."""
        self.canv.saveState()
        self.canv.translate(0,0)
        self.canv.rotate(self.angle)
        Paragraph.draw(self)
        self.canv.restoreState()

    def beginText(self, x, y):
        """Start a text object at (x, y) using render mode 3."""
        t = self.canv.beginText(x,y)
        t.setTextRenderMode(3)  # Set to zero if you want the text to appear
        #t.setTextRenderMode(0)  # Set to zero if you want the text to appear
        return t

class PyPdf(object):
    """Class to create pdfs from images"""
    # Some regexes to compile once
    regex_bbox = re.compile(r'bbox((\s+\d+){4})')
    regex_baseline = re.compile(r'baseline((\s+[\d\.\-]+){2})')
    regex_fontspec = re.compile(r'x_font\s+(.+);\s+x_fsize\s+(\d+)')
    regex_textangle = re.compile(r'textangle\s+(\d+)')

    def __init__(self, gs):
        self.gs = gs # Pointer to ghostscript object


    def get_transform(self, rotation, tx, ty):
        """Return the six matrix entries for a rotation about the point (tx, ty).

        :param rotation: angle in degrees
        :returns: ``(a, b, c, d, e, f)`` taken from the 3x3 matrix
                  translate(-tx,-ty) * rotate * translate(tx,ty)
        """
        # Code taken from here:
        # http://stackoverflow.com/questions/6041244/how-to-merge-two-landscape-pdf-pages-using-pypdf/17392824#17392824
        # Unclear why PyPDF2 builtin page rotation functions don't work
        translation = [[1, 0, 0],
                       [0, 1, 0],
                       [-tx,-ty,1]]
        rotation = math.radians(rotation)
        rotating = [[math.cos(rotation), math.sin(rotation),0],
                    [-math.sin(rotation),math.cos(rotation), 0],
                    [0,                  0,                  1]]
        rtranslation = [[1, 0, 0],
                        [0, 1, 0],
                        [tx,ty,1]]
        ctm = utils.matrixMultiply(translation, rotating)
        ctm = utils.matrixMultiply(ctm, rtranslation)

        return ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]

    def mergeRotateAroundPointPage(self,page, page2, rotation, tx, ty):
        """Merge ``page2`` onto ``page`` after rotating it about (tx, ty).

        Uses the same matrix as :func:`get_transform`.
        """
        ctm = self.get_transform(rotation, tx, ty)
        return page.mergeTransformedPage(page2, list(ctm))

    def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
        """Create ``<name>_ocr.pdf``: the original pages with the OCR text merged in.

        :param dpi: resolution the page images were rendered at
        :param hocr_filenames: list of ``(image filename, hocr filename)``
                               pairs; sorted in place by image filename
        :param orig_pdf_filename: the PDF the text is merged onto
        :returns: path of the written PDF
        """
        logging.debug("Going to overlay following files onto %s" % orig_pdf_filename)
        # Sort the hocr_filenames into natural keys!
        hocr_filenames.sort(key=lambda x: self.natural_keys(x[0] ))
        logging.debug(hocr_filenames)

        pdf_dir, pdf_basename = os.path.split(orig_pdf_filename)
        basename = os.path.splitext(pdf_basename)[0]
        pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % (basename))

        text_pdf_filenames = []
        for img_filename, hocr_filename in hocr_filenames:
            text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename)
            logging.info("Created temp OCR'ed pdf containing only the text as %s" % (text_pdf_filename))
            text_pdf_filenames.append(text_pdf_filename)

        # Now, concatenate this text_pdfs into one single file.
        # This is a hack to save memory/running time when we have to do the actual merge with a writer

        all_text_filename = os.path.join(pdf_dir, "%s_text.pdf" % (basename))
        merger = PdfFileMerger()
        # Keep the handles so they can be closed explicitly once the merged
        # file has been written; they used to be left to the garbage collector.
        text_handles = []
        try:
            for text_pdf_filename in text_pdf_filenames:
                handle = open(text_pdf_filename, 'rb')
                text_handles.append(handle)
                merger.append(PdfFileReader(handle))
            merger.write(all_text_filename)
        finally:
            merger.close()
            for handle in text_handles:
                handle.close()
        del merger

        writer = PdfFileWriter()
        # Both inputs must stay open until the writer has been flushed
        with open(orig_pdf_filename, 'rb') as orig, open(all_text_filename, 'rb') as text_file:
            for orig_pg, text_pg in zip(self.iter_pdf_page(orig), self.iter_pdf_page(text_file)):
                orig_pg = self._get_merged_single_page(orig_pg, text_pg)
                writer.addPage(orig_pg)

            with open(pdf_filename, 'wb') as f:
                # Flush out this page merge so we can close the text_file
                writer.write(f)

        # Windows sometimes locks the temp text file for no reason, so we need to retry a few times to delete
        for fn in text_pdf_filenames:
            Retry(partial(os.remove, fn), tries=10, pause=3).call_with_retry()

        os.remove(all_text_filename)
        logging.info("Created OCR'ed pdf as %s" % (pdf_filename))

        return pdf_filename

    def _get_merged_single_page(self, original_page, ocr_text_page):
        """
            Take two page objects, rotate the text page if necessary, and return the merged page
        """
        orig_rotation_angle = int(original_page.get('/Rotate', 0))

        if orig_rotation_angle != 0:
            logging.info("Original Rotation: %s" % orig_rotation_angle)
            # NOTE(review): both pivot coordinates are width/2 -- looks
            # deliberate, confirm before changing.
            self.mergeRotateAroundPointPage(original_page, ocr_text_page, orig_rotation_angle, ocr_text_page.mediaBox.getWidth()/2, ocr_text_page.mediaBox.getWidth()/2)
            # None of these commands worked for me:
                #orig_pg.rotateCounterClockwise(orig_rotation_angle)
                #orig_pg.mergeRotatedPage(text_pg,orig_rotation_angle)
        else:
            original_page.mergePage(ocr_text_page)
        original_page.compressContentStreams()
        return original_page


    def _get_img_dims(self, img_filename):
        """
            :rval: (width, height, dpi), width and height in points (1/72 in)
        """
        img = Image.open(img_filename)
        w,h = img.size
        dpi = img.info['dpi']
        width = w*72.0/dpi[0]
        height = h*72.0/dpi[1]
        del img
        return (width, height, dpi)

    def overlay_hocr_page(self, dpi, hocr_filename, img_filename):
        """Write ``text_<hocr basename>_ocr.pdf`` holding only the page's text layer.

        The image and the hocr file must live in the same directory.

        :returns: path of the text-only PDF (inside the hocr directory)
        """
        hocr_dir, hocr_basename = os.path.split(hocr_filename)
        img_dir, img_basename = os.path.split(img_filename)
        logging.debug("hocr_filename:%s, hocr_dir:%s, hocr_basename:%s" % (hocr_filename, hocr_dir, hocr_basename))
        assert(img_dir == hocr_dir)

        #basename = hocr_basename.split('.')[0]
        basename = os.path.splitext(hocr_basename)[0]
        pdf_filename = os.path.join("text_%s_ocr.pdf" % (basename))

        # Switch to the hocr directory to make this easier
        cwd = os.getcwd()
        if hocr_dir != "":
            os.chdir(hocr_dir)

        try:
            with open(pdf_filename, "wb") as f:
                logging.info("Overlaying hocr and creating text pdf %s" % pdf_filename)
                pdf = Canvas(f, pageCompression=1)
                pdf.setCreator('pypdfocr')
                pdf.setTitle(os.path.basename(hocr_filename))
                pdf.setPageCompression(1)

                width, height, dpi_jpg = self._get_img_dims(img_basename)
                pdf.setPageSize((width,height))
                logging.info("Page width=%f, height=%f" % (width, height))

                pg_num = 1

                logging.info("Adding text to page %s" % pdf_filename)
                self.add_text_layer(pdf,hocr_basename,pg_num,height,dpi)
                pdf.showPage()
                pdf.save()
        finally:
            # Always go back, even when building the page failed
            os.chdir(cwd)
        return os.path.join(hocr_dir, pdf_filename)

    def iter_pdf_page(self, f):
        """Yield every page object of the PDF readable from ``f``."""
        reader = PdfFileReader(f)
        for pgnum in range(reader.getNumPages()):
            pg = reader.getPage(pgnum)
            yield pg

    def _atoi(self,text):
        """Return ``int(text)`` for an all-digit string, else ``text`` itself."""
        return int(text) if text.isdigit() else text

    def natural_keys(self, text):
        '''
        alist.sort(key=natural_keys) sorts in human order
        http://nedbatchelder.com/blog/200712/human_sorting.html
        (See Toothy's implementation in the comments)
        '''
        return [ self._atoi(c) for c in re.split(r'(\d+)', text) ]

    def add_text_layer(self,pdf, hocrfile, page_num,height, dpi):
        """Draw an invisible text layer for OCR data.

        :param pdf: canvas to draw on
        :param hocrfile: hocr file to read; if it cannot be parsed nothing is drawn
        :param page_num: the element with id ``page_<page_num>`` is used, or
                         the last page element when no id matches
        :param height: page height in points
        :param dpi: resolution the hocr pixel coordinates refer to
        """
        hocr = ElementTree()
        try:
            # It's possible tesseract has failed and written garbage to this hocr file, so we need to catch any exceptions
            hocr.parse(hocrfile)
        except Exception:
            logging.info("Error loading hocr, not adding any text")
            return

        logging.debug(xml.etree.ElementTree.tostring(hocr.getroot()))

        # Find the body tag (namespace-qualified or not); fall back to the
        # last child of the root element.
        body = None
        for child in hocr.getroot():
            body = child
            tag = str(child.tag)
            if tag == 'body' or tag.endswith('}body'):
                break
        if body is None:
            logging.info("Error loading hocr, not adding any text")
            return

        page = None
        for candidate in body: # Each child in the body is a page tag
            page = candidate
            if candidate.attrib.get('class') != "ocr_page":
                logging.warning("Why is this hocr not paging properly??")
            if candidate.attrib.get('id') == 'page_%d' %(page_num):
                break
        if page is None:
            logging.info("No page found in hocr, not adding any text")
            return

        for line in page.findall(".//{http://www.w3.org/1999/xhtml}span"):
        #for line in page.findall(".//span"):
            if line.attrib.get('class') != 'ocr_line':
                continue
            linebox = self.regex_bbox.search(line.attrib['title']).group(1).split()
            textangle = self.regex_textangle.search(line.attrib['title'])
            if textangle:
                textangle = self._atoi(textangle.group(1))
            else:
                textangle = 0

            try:
                baseline = self.regex_baseline.search(line.attrib['title']).group(1).split()
            except AttributeError:
                baseline = [ 0, 0 ]

            linebox = [float(i) for i in linebox]
            baseline = [float(i) for i in baseline]

            for word in line:
                if word.attrib.get('class') != 'ocrx_word':
                    continue
                # The word's text may be nested in child tags; gather all of it
                text = ' '.join(child.text for child in word.iter() if child.text)
                logging.debug("word: %s, angle: %d" % ( text.strip(), textangle))

                box = self.regex_bbox.search(word.attrib['title']).group(1).split()
                #b = self.polyval(baseline, (box[0] + box[2]) / 2 - linebox[0]) + linebox[3]
                box = [float(i) for i in box]

                # Transform angle to x,y co-ords needed for proper text placement
                # We only support 0, 90, 180, 270!.  Anything else, we'll just use the normal orientation for now

                coords = { 0: (box[0], box[1]),
                           90: (box[0], box[3]),  # facing right
                           180: (box[2], box[3]), # upside down
                           270: (box[2], box[1]), # facing left
                           }
                x,y = coords.get(textangle, (box[0], box[1]))

                style = getSampleStyleSheet()
                normal = style["BodyText"]
                normal.alignment = TA_LEFT
                normal.leading = 0
                font_name, font_size = self._get_font_spec(word.attrib['title'])
                normal.fontName = "Helvetica"
                normal.fontSize = font_size

                para = RotatedPara(escape(text.strip()), normal, textangle)
                para.wrapOn(pdf, para.minWidth(), 100)  # Not sure what to use as the height here
                # hocr pixels -> points, with the y axis flipped
                para.drawOn(pdf, x*72/dpi, height - y*72/dpi)



    def polyval(self,poly, x):
        """Evaluate the line ``poly[0]*x + poly[1]``."""
        return x * poly[0] + poly[1]


    def _get_font_spec(self, tag):
        """Return ``(font name, font size)`` parsed from an hocr ``title`` string.

        Falls back to ``("", 8)`` when no font information is present.
        """
        try:
            fontspec = self.regex_fontspec.search(tag).groups()
            fontname, fontsize = fontspec
        except Exception:
            fontname = ""
            fontsize = "8"
        return (fontname, self._atoi(fontsize))

# -------- /pypdfocr/pypdfocr_pdffiler.py: --------

# Copyright 2013 Virantha Ekanayake All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
    Provides capability to search PDFs and file to a specific folder based
    on keywords
"""

from sets import Set
import sys, os
import re
import logging
import shutil

from PyPDF2 import PdfFileReader
from pypdfocr_filer import PyFiler
from pypdfocr_filer_dirs import PyFilerDirs

class PyPdfFiler(object):
    """Pick a target folder for a PDF by searching its text for keywords."""

    def __init__(self, filer):
        """
            :param filer: object that does the actual filing; must be a
                          ``PyFiler`` instance
        """
        assert isinstance(filer, PyFiler)
        self.filer = filer  # Must be a subclass of PyFiler

        # Whether to fall back on filename for matching keywords against
        # if there is no match in the text
        self.file_using_filename = False

    def iter_pdf_page_text(self, filename):
        """Yield the extracted text of each page of ``filename``.

        Non-ASCII characters are dropped and newlines replaced by spaces.
        """
        self.filename = filename
        reader = PdfFileReader(filename)
        logging.info("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename))
        for pgnum in range(reader.getNumPages()):
            text = reader.getPage(pgnum).extractText()
            text = text.encode('ascii', 'ignore')
            text = text.replace('\n', ' ')
            yield text

    def _get_matching_folder(self, pdfText):
        """Return the first folder with a keyword contained in ``pdfText``.

        The text is lower-cased before the comparison.

        :returns: folder name, or None when no keyword matches
        """
        searchText = pdfText.lower()
        for folder,strings in self.filer.folder_targets.items():
            for s in strings:
                logging.debug("Checking string %s" % s)
                if s in searchText:
                    logging.info("Matched keyword '%s'" % s)
                    return folder
        # No match found, so return
        return None

    def file_original (self, original_filename):
        """Hand the original file to the filer; returns what the filer returns."""
        return self.filer.file_original(original_filename)

    def move_to_matching_folder(self, filename):
        """File ``filename`` according to the first keyword match in its pages.

        When no page matches and ``file_using_filename`` is set, the filename
        itself is searched.  A PDF without pages is treated as "no match".

        :returns: whatever the filer's ``move_to_matching_folder`` returns
        """
        # Must be bound even when the PDF yields no pages at all
        tgt_folder = None
        for page_text in self.iter_pdf_page_text(filename):
            tgt_folder = self._get_matching_folder(page_text)
            if tgt_folder: break  # Stop searching through pdf pages as soon as we find a match

        if not tgt_folder and self.file_using_filename:
            tgt_folder = self._get_matching_folder(filename)

        tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder)
        return tgt_file

if __name__ == '__main__':
    p = PyPdfFiler(PyFilerDirs())
    for page_text in p.iter_pdf_page_text("scan_ocr.pdf"):
        print (page_text)

# -------- /pypdfocr/pypdfocr_preprocess.py: --------
#!/usr/bin/env python2.7

# Copyright 2013 Virantha Ekanayake All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# (module docstring of pypdfocr_preprocess.py opens here and closes on the next line of the dump)
# """
#    Wrap ImageMagick calls. Yes, this is ugly.
class PyPreprocess(object):
    """Class to wrap all the ImageMagick convert calls"""

    def __init__(self, config):
        """
        :param config: Options dict; ``threads`` sets the worker pool size (default 4)
        :type config: dict
        """
        self.msgs = {
            'CV_FAILED': 'convert execution failed',
        }
        self.threads = config.get('threads', 4)

    def _warn(self, msg):  # pragma: no cover
        """Print a warning message to stdout."""
        print("WARNING: %s" % msg)

    def cmd(self, cmd_list):
        """Run a command through the shell and return its combined output.

        :param cmd_list: Command as a string, or a list of parts joined with spaces
        :returns: Output (stdout and stderr) of the command, or None when the
                  command exits with a non-zero status
        """
        if isinstance(cmd_list, list):
            cmd_list = ' '.join(cmd_list)
        logging.debug("Running cmd: %s" % cmd_list)
        try:
            # NOTE: runs through the shell; filenames are only wrapped in double
            # quotes by the caller, so names containing quotes or other shell
            # metacharacters are not handled safely.
            out = subprocess.check_output(cmd_list, stderr=subprocess.STDOUT, shell=True)
            logging.debug(out)
            return out
        except subprocess.CalledProcessError as e:
            print(e.output)
            self._warn("Could not run command %s" % cmd_list)
            return None

    def _run_preprocess(self, in_filename):
        """Clean up one image with ImageMagick ``convert``.

        The result is written next to the input as ``<name>_preprocess<ext>``.

        :param in_filename: Image file to preprocess
        :type in_filename: string
        :returns: Name of the preprocessed file, or ``in_filename`` unchanged
                  when the convert command failed
        """
        basename, filext = os.path.splitext(in_filename)
        out_filename = '%s_preprocess%s' % (basename, filext)
        #-respect-parenthesis \( -clone 0 -colorspace gray -negate -lat 15x5+5% -contrast-stretch 0 \) -compose copy_opacity -composite -opaque none +matte -modulate 100,50 -adaptive-blur 2.0 -sharpen 0x1
        # When using Windows, can't use backslash parenthesis in the shell, so omit the backslash
        if str(os.name) == 'nt':
            backslash = ''
        else:
            backslash = '\\'

        c = ['convert',
             '"%s"' % in_filename,
             '-respect-parenthesis',
             #'\\( $setcspace -colorspace gray -type grayscale \\)',
             backslash+'(',
             '-clone 0',
             '-colorspace gray -negate -lat 15x15+5% -contrast-stretch 0',
             backslash+') -compose copy_opacity -composite -opaque none +matte -modulate 100,100',
             #'-adaptive-blur 1.0',
             '-blur 1x1',
             #'-selective-blur 4x4+5%',
             '-adaptive-sharpen 0x2',
             # Removes vertical lines >= 60 pixels and reduces the width of those > 30
             # (otherwise tesseract < 3.03 completely ignores text close to vertical
             # lines in a table)
             '-negate -define morphology:compose=darken -morphology Thinning Rectangle:1x30+0+0 -negate ',
             '"%s"' % (out_filename)
             ]
        logging.info("Preprocessing image %s for better OCR" % in_filename)
        res = self.cmd(c)
        if res is None:
            return in_filename
        else:
            return out_filename

    def preprocess(self, in_filenames):
        """Preprocess all given images in parallel.

        :param in_filenames: Image files to preprocess
        :type in_filenames: list
        :returns: One filename per input, in the same order (see :func:`_run_preprocess`)
        :raises: Re-raises whatever interrupted the parallel run, after
                 terminating the worker pool
        """
        fns = in_filenames

        pool = Pool(processes=self.threads, initializer=init_worker)
        try:
            logging.info("Starting preprocessing parallel execution")
            preprocessed_filenames = pool.map(unwrap_self, zip([self]*len(fns), fns))
            pool.close()
        except KeyboardInterrupt:
            print("Caught keyboard interrupt... terminating")
            pool.terminate()
            raise
        except Exception:
            # The pool must be terminated (or closed) before join() is allowed;
            # otherwise join() itself raises and hides the real error.
            pool.terminate()
            raise
        finally:
            pool.join()
            logging.info("Completed preprocessing")

        return preprocessed_filenames
def error(text):
    """Report a fatal problem and stop the program.

    Prints ``text`` to stdout with an ``ERROR: `` prefix and then exits the
    interpreter with status -1.

    :param text: Message describing the failure
    :type text: string
    """
    message = "ERROR: %s" % text
    print(message)
    sys.exit(-1)
46 | """ 47 | self.lang = 'eng' 48 | self.required = "3.02.02" 49 | self.threads = config.get('threads',4) 50 | 51 | if "binary" in config: # Override location of binary 52 | binary = config['binary'] 53 | if os.name == 'nt': 54 | binary = '"%s"' % binary 55 | binary = binary.replace("\\", "\\\\") 56 | logging.info("Setting location for tesseracdt executable to %s" % (binary)) 57 | else: 58 | if str(os.name) == 'nt': 59 | # Explicit str here to get around some MagicMock stuff for testing that I don't quite understand 60 | binary = '"c:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"' 61 | else: 62 | binary = "tesseract" 63 | 64 | self.binary = binary 65 | 66 | self.msgs = { 67 | 'TS_MISSING': """ 68 | Could not execute %s 69 | Please make sure you have Tesseract installed correctly 70 | """ % self.binary, 71 | 'TS_VERSION':'Tesseract version is too old', 72 | 'TS_img_MISSING':'Cannot find specified tiff file', 73 | 'TS_FAILED': 'Tesseract-OCR execution failed!', 74 | } 75 | 76 | 77 | def _is_version_uptodate(self): 78 | """ 79 | Make sure the version is current 80 | """ 81 | logging.info("Checking tesseract version") 82 | cmd = '%s -v' % (self.binary) 83 | logging.info(cmd) 84 | try: 85 | ret_output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) 86 | except CalledProcessError: 87 | # Could not run tesseract 88 | error(self.msgs['TS_MISSING']) 89 | 90 | ver_str = '0.0.0' 91 | for line in ret_output.splitlines(): 92 | if 'tesseract' in line: 93 | ver_str = line.split(' ')[1] 94 | if ver_str.endswith('dev'): # Fix for version strings that end in 'dev' 95 | ver_str = ver_str[:-3] 96 | 97 | # Iterate through the version dots 98 | ver = [int(x) for x in ver_str.split('.')] 99 | req = [int(x) for x in self.required.split('.')] 100 | 101 | # Aargh, in windows 3.02.02 is reported as version 3.02 102 | # SFKM 103 | if str(os.name) == 'nt': 104 | req = req[:2] 105 | 106 | version_good = False 107 | for i,num in enumerate(req): 108 | if len(ver) < 
class ExecutableSearcher(object):
    """Base class for executable searchers; defines no behaviour of its own."""

    pass


class WindowsExecutableSearcher(ExecutableSearcher):
    """Look for a Windows executable somewhere below a root directory."""

    def __init__(self, possible_dir_names, possible_exe_names):
        """
        :param possible_dir_names: Directory name or list of names; stored as
                                   ``self.dir_names`` (not used to restrict the search)
        :param possible_exe_names: Executable name or list of names; ``.exe`` is
                                   appended to any name that lacks it
        """
        self.dir_names = self._as_list(possible_dir_names)
        self.exe_names = [self._with_exe_suffix(name)
                          for name in self._as_list(possible_exe_names)]
        # Single-name attribute read by find(); the first candidate wins
        self.exe_name = self.exe_names[0] if self.exe_names else ''

    @staticmethod
    def _as_list(value):
        """Wrap a single value in a list; pass collections through as a list."""
        if value is None:
            return []
        if isinstance(value, (list, tuple, set, frozenset)):
            return list(value)
        return [value]

    @staticmethod
    def _with_exe_suffix(name):
        """Return ``name`` with a ``.exe`` extension, adding it when missing."""
        if name.lower().endswith('.exe'):
            return name
        return name + '.exe'

    def find(self, root):
        """
        Search below root for the given executable

        :param root: Directory to start searching from
        :type root: string
        :returns: Full path of the first matching file found, or the bare
                  executable name when root does not exist or nothing matches
        :rtype: string
        """
        found_exe = self.exe_name

        if os.path.exists(root):
            # Windows filenames are case-insensitive, so compare lower-cased
            wanted = set(name.lower() for name in self.exe_names)
            # Walk from root directly instead of chdir-ing into it, so the
            # process working directory is left untouched.
            for dirpath, dirs, files in os.walk(root, topdown=True):
                for filename in files:
                    if filename.lower() in wanted:
                        return os.path.join(dirpath, filename)

        return found_exe
def rename_file_with_spaces(self, pdf_filename):
    """
    Replace spaces in the basename of a file with underscores, renaming
    the file on disk.  Spaces in the directory part are left alone.

    :param pdf_filename: Filename to remove spaces
    :type pdf_filename: string
    :returns: Modified filename (the input unchanged when the basename has no spaces)
    :rtype: string
    """
    directory, basename = os.path.split(pdf_filename)

    # Nothing to do when the basename is already free of spaces
    if ' ' not in basename:
        return pdf_filename

    renamed = os.path.join(directory, basename.replace(' ', '_'))
    logging.debug("Renaming spaces")
    logging.debug("---> %s \n ------> %s" % (pdf_filename, renamed))
    shutil.move(pdf_filename, renamed)
    return renamed
def check_queue(self):
    """
    This function is called at regular intervals by :func:`start`.

    Iterate through the events, and if there is any whose timestamp is
    older than the scan_interval, return it and set its timestamp to -1
    for purging later.  Entries already marked -1 are purged as they are
    encountered.

    :returns: Filename if available to process, otherwise None.
    """
    now = time.time()
    # Hold the lock via a context manager so it is released even when
    # renaming the file fails with an exception.
    with PyPdfWatcher.events_lock:
        # Iterate over a snapshot: entries are deleted/added inside the loop,
        # which is not allowed on a live dict view.
        for monitored_file, timestamp in list(PyPdfWatcher.events.items()):
            if timestamp == -1:
                del PyPdfWatcher.events[monitored_file]
            elif now - timestamp > self.scan_interval:
                logging.info("Processing new file %s" % (monitored_file))
                # Remove this file from the dict
                del PyPdfWatcher.events[monitored_file]
                monitored_file = self.rename_file_with_spaces(monitored_file)
                # Add back into queue and mark as not needing further action in the event handler
                PyPdfWatcher.events[monitored_file] = -1
                return monitored_file
    return None
'reportlab.pdfbase._fontdata_widths_courierbold', 14 | 'reportlab.pdfbase._fontdata_widths_courierboldoblique', 15 | 'reportlab.pdfbase._fontdata_widths_courieroblique', 16 | 'reportlab.pdfbase._fontdata_widths_helvetica', 17 | 'reportlab.pdfbase._fontdata_widths_helveticabold', 18 | 'reportlab.pdfbase._fontdata_widths_helveticaboldoblique', 19 | 'reportlab.pdfbase._fontdata_widths_helveticaoblique', 20 | 'reportlab.pdfbase._fontdata_widths_symbol', 21 | 'reportlab.pdfbase._fontdata_widths_timesbold', 22 | 'reportlab.pdfbase._fontdata_widths_timesbolditalic', 23 | 'reportlab.pdfbase._fontdata_widths_timesitalic', 24 | 'reportlab.pdfbase._fontdata_widths_timesroman', 25 | 'reportlab.pdfbase._fontdata_widths_zapfdingbats', 26 | 'reportlab.rl_settings'], 27 | hookspath=None, 28 | runtime_hooks=None) 29 | pyz = PYZ(a.pure) 30 | exe = EXE(pyz, 31 | a.scripts, 32 | a.binaries, 33 | a.zipfiles, 34 | a.datas, 35 | name='pypdfocr.exe', 36 | debug=False, 37 | strip=None, 38 | upx=True, 39 | console=True ) 40 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pillow>=2.2 2 | reportlab>=2.7 3 | watchdog>=0.6.0 4 | pypdf2>=1.23 5 | evernote 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from setuptools import setup, find_packages 3 | 4 | import pypdfocr 5 | import io 6 | from pypdfocr.version import __version__ 7 | from setuptools import Command 8 | import os 9 | 10 | class PyTest(Command): 11 | user_options = [] 12 | def initialize_options(self): 13 | pass 14 | def finalize_options(self): 15 | pass 16 | def run(self): 17 | import sys,subprocess 18 | cwd = os.getcwd() 19 | os.chdir('test') 20 | errno = subprocess.call([sys.executable, 'runtests.py']) 21 | os.chdir(cwd) 22 
def read(*filenames, **kwargs):
    """Return the contents of all given files joined into one string.

    :param filenames: Paths of the files to read, in order
    :param encoding: (keyword) Text encoding used to open the files, default ``utf-8``
    :param sep: (keyword) Separator placed between file contents, default newline
    :returns: Concatenated file contents
    """
    enc = kwargs.get('encoding', 'utf-8')
    joiner = kwargs.get('sep', '\n')

    def _slurp(path):
        # Read one file completely, closing it afterwards
        with io.open(path, encoding=enc) as handle:
            return handle.read()

    return joiner.join([_slurp(path) for path in filenames])
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/pdfs/test_cinderella.pdf -------------------------------------------------------------------------------- /test/pdfs/test_patent.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/pdfs/test_patent.pdf -------------------------------------------------------------------------------- /test/pdfs/test_recipe.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/pdfs/test_recipe.pdf -------------------------------------------------------------------------------- /test/pdfs/test_recipe_sideways.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/pdfs/test_recipe_sideways.pdf -------------------------------------------------------------------------------- /test/pdfs/test_sherlock.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/pdfs/test_sherlock.pdf -------------------------------------------------------------------------------- /test/pdfs/test_super_long_keyword.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/pdfs/test_super_long_keyword.pdf -------------------------------------------------------------------------------- /test/temp/original/test_patent.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/temp/original/test_patent.pdf -------------------------------------------------------------------------------- /test/temp/original/test_patent_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/temp/original/test_patent_1.pdf -------------------------------------------------------------------------------- /test/temp/original/test_recipe.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/temp/original/test_recipe.pdf -------------------------------------------------------------------------------- /test/temp/original/test_recipe_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/temp/original/test_recipe_1.pdf -------------------------------------------------------------------------------- /test/temp/original/test_sherlock.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/temp/original/test_sherlock.pdf -------------------------------------------------------------------------------- /test/temp/original/test_sherlock_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/temp/original/test_sherlock_1.pdf -------------------------------------------------------------------------------- /test/test_evernote.py: -------------------------------------------------------------------------------- 1 | #from pypdfocr import 
class TestEvernote:
    """Tests for ``PyFilerEvernote`` with the ``EvernoteClient`` replaced by a mock."""

    def test_connecct(self):
        """Constructing the filer should request the user store from the client."""
        # Tricky mocking. Need to mock the EvernoteClient import in pypdfocr_filer_evernote.py file
        with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client:
            p = P.PyFilerEvernote("TOKEN")
            inst = mock_evernote_client.return_value
            assert(inst.get_user_store.called)

    @patch('shutil.move')
    def test_file_original(self, mock_move):
        """The original is only moved once an original-move folder has been set."""
        with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client:
            p = P.PyFilerEvernote("TOKEN")
            filename = os.path.join("pdfs","test_recipe.pdf")

            # First, test code that does not move original
            p.file_original(filename)
            assert (not mock_move.called)

            # Now test moving
            p.set_original_move_folder(os.path.join("temp", "original"))
            p.file_original(filename)
            # NOTE(review): the "_2" suffix presumably comes from test_recipe.pdf and
            # test_recipe_1.pdf already existing in temp/original -- confirm.
            mock_move.assert_called_with(filename, os.path.join("temp","original", "test_recipe_2.pdf"))

    @patch('os.remove')
    def test_move_to_folder(self, mock_remove):
        """Moving asserts until target and default folders are set, then creates a note."""
        with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client:
            p = P.PyFilerEvernote("TOKEN")
            filename = os.path.join("pdfs", "test_recipe.pdf")
            foldername = 'recipe'
            # No target folder set yet
            with pytest.raises(AssertionError):
                p.move_to_matching_folder(filename, foldername)
            p.set_target_folder('target')
            # Target folder set, but still no default folder
            with pytest.raises(AssertionError):
                p.move_to_matching_folder(filename, foldername)
            p.set_default_folder('default')
            p.move_to_matching_folder(filename, None)
            p.move_to_matching_folder(filename, foldername)

            mock_client = mock_evernote_client.return_value
            assert(mock_client.get_note_store.called)
            assert(mock_client.get_note_store.return_value.createNote.called)
            # os.remove is patched, so the pdf on disk is not actually deleted
            mock_remove.assert_called_with(filename)




    def test_create_note(self):
        """The created note embeds the pdf and references it by its md5 hash."""
        with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client:
            p = P.PyFilerEvernote("TOKEN")
            notebook = Types.Notebook()
            notebook.name = "recipe"
            filename = "pdfs/test_recipe.pdf"
            note = p._create_evernote_note(notebook, filename)
            # NOTE(review): with an empty prefix the startswith check below always
            # passes -- confirm the XML header this was meant to test for.
            xml = ''
            assert(note.content.startswith(xml))

            # Compute the md5 of the pdf independently of the code under test
            md5 = hashlib.md5()
            with open(filename,'rb') as f:
                pdf_bytes = f.read()
                md5.update(pdf_bytes)

            md5hash = md5.hexdigest()

            assert(md5hash in note.content)
            assert(note.resources[0].data.bodyHash == md5hash)


    def test_check_notebook(self):
        """Missing notebooks are created; an existing one gets its stack updated."""
        with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client:
            p = P.PyFilerEvernote("TOKEN")
            p._check_and_make_notebook("new_notebook")
            # Let's assert that we tried to create a new notebook
            mock_client = mock_evernote_client.return_value
            assert(mock_client.get_note_store.called)
            create_func = mock_client.get_note_store.return_value.createNotebook
            update_func = mock_client.get_note_store.return_value.updateNotebook
            assert(create_func.called)
            assert(not update_func.called)
            # First positional argument of the createNotebook call
            notebook = create_func.call_args[0][0]
            assert(notebook.name == 'new_notebook')

            # Now, let's setup a value for the notebooks, so we test the code for
            # a "pre-existing" notebook
            test_notebook = Types.Notebook()
            test_notebook.name = "new_notebook"
            mock_client.get_note_store.return_value.listNotebooks.return_value = [test_notebook]
            p._check_and_make_notebook("new_notebook")

            # Now check that the code to update a notebook stack is correct
            test_notebook.stack = "new_stack"
            update_func = mock_client.get_note_store.return_value.updateNotebook
            p.set_target_folder("Boogie")
            p._check_and_make_notebook("new_notebook")
            # Check that the update call was called with correct arguments
            assert(update_func.called)
            notebook = update_func.call_args[0][0]
            assert(notebook.stack == 'Boogie')


    def test_add_folder_target(self):
        """Adding the same folder twice asserts; distinct folders are both recorded."""
        with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client:
            p = P.PyFilerEvernote("TOKEN")
            p.add_folder_target("folder1", ["target1", "target2"])
            with pytest.raises(AssertionError):
                p.add_folder_target("folder1", ["target1", "target2"])
            p.add_folder_target("folder2", ["target1", "target2"])
            assert("folder1" in p.folder_targets.keys())
            assert("folder2" in p.folder_targets.keys())
class TestOptions:
    """Tests for command line option parsing in ``PyPDFOCR.get_options``."""

    def setup(self):
        # Fresh PyPDFOCR object for every test method
        self.p = P.PyPDFOCR()


    def test_standalone(self):
        """Single-pdf mode: flags accumulate on the same option list between calls."""
        opts = ["blah.pdf"]
        self.p.get_options(opts)

        opts.append('-d')
        self.p.get_options(opts)
        assert(self.p.debug)

        opts.append('-v')
        self.p.get_options(opts)
        assert(self.p.verbose)

        opts.append('--preprocess')
        self.p.get_options(opts)
        assert(not self.p.skip_preprocess)

        # Without -f/-e and without --config, filing stays off and config is empty
        assert(not self.p.enable_filing)
        assert(self.p.config == {})

    def test_standalone_filing(self):
        """Filing (-f) exits without a config file and is enabled with one."""
        opts = ["blah.pdf"]
        opts.append('-f')

        # Assert that filing option requires a config file
        with pytest.raises(SystemExit):
            self.p.get_options(opts)

        # Assert that it checks that the config file is present
        opts.append('--config=test_option_config.yaml')
        self.p.get_options(opts)
        assert(self.p.enable_filing)
        assert(self.p.config)

    def test_standalone_filing_evernote(self):
        """Evernote (-e) needs a config file and turns on filing as well."""
        # Check when evernote is enabled
        opts = ["blah.pdf"]
        opts.append('-e')
        # Assert that it checks that the config file is present
        with pytest.raises(SystemExit):
            self.p.get_options(opts)

        opts.append('--config=test_option_config.yaml')
        self.p.get_options(opts)
        # Enabling -e should turn on filing too
        assert(self.p.enable_filing)
        assert(self.p.enable_evernote)
        assert(self.p.config)
        assert(not self.p.watch)

        # Adding -f explicitly must not change the outcome
        opts.append('-f')
        self.p.get_options(opts)
        assert(self.p.enable_filing)
        assert(self.p.enable_evernote)
        assert(self.p.config)
        assert(not self.p.watch)

    def test_standalone_watch_conflict(self):
        """A pdf filename combined with -w is rejected."""
        # When pdf file is specified, we don't want to allow watch option
        opts = ["blah.pdf", '-w']
        with pytest.raises(SystemExit):
            self.p.get_options(opts)

    def test_watch_filing(self):
        """Watch mode needs a directory; a config file alone does not enable filing."""
        opts = ['-w']
        # Catch watch without a dir
        with pytest.raises(SystemExit):
            self.p.get_options(opts)

        opts = ['-w temp']
        self.p.get_options(opts)
        assert(self.p.watch_dir)

        opts.append('--config=test_option_config.yaml')
        self.p.get_options(opts)
        assert(self.p.watch)
        assert(self.p.config)
        assert(not self.p.enable_filing)
        assert(not self.p.enable_evernote)

    def test_watch_filing_evernote(self):
        """Watch mode with -e (with or without -f) enables both filing and evernote."""
        opts = ['-w temp', '-e', '--config=test_option_config.yaml']
        self.p.get_options(opts)
        assert(self.p.watch)
        assert(self.p.config)
        assert(self.p.enable_filing)
        assert(self.p.enable_evernote)

        opts = ['-w temp', '-f', '-e', '--config=test_option_config.yaml']
        self.p.get_options(opts)
        assert(self.p.watch)
        assert(self.p.config)
        assert(self.p.enable_filing)
        assert(self.p.enable_evernote)
pypdfocr.pypdfocr as P 3 | import pytest 4 | import os 5 | 6 | import hashlib 7 | 8 | from mock import patch, call 9 | from pytest import skip 10 | 11 | class TestPDFFiler: 12 | 13 | @patch('shutil.move') 14 | def test_file_by_filename(self, mock_move): 15 | """ 16 | Test filing of single pdf based on filename. 17 | """ 18 | 19 | # Mock the move function so we don't actually end up filing 20 | p = P.PyPDFOCR() 21 | cwd = os.getcwd() 22 | filename = os.path.join("pdfs", "test_super_long_keyword.pdf") 23 | out_filename = filename.replace(".pdf", "_ocr.pdf") 24 | 25 | if os.path.exists(out_filename): 26 | os.remove(out_filename) 27 | 28 | print("Current directory: %s" % os.getcwd()) 29 | #opts = [filename, "--config=test_pypdfocr_config.yaml", "-f"] 30 | opts = [filename, "--config=test_pypdfocr_config_filename.yaml", "-f", "-n"] 31 | p.go(opts) 32 | 33 | assert(os.path.exists(out_filename)) 34 | os.remove(out_filename) 35 | 36 | calls = [call(out_filename, os.path.abspath(os.path.join('temp', 'target','recipe', os.path.basename(out_filename))))] 37 | mock_move.assert_has_calls(calls) 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /test/test_pypdfocr.py: -------------------------------------------------------------------------------- 1 | #from pypdfocr import PyPDFOCR as P 2 | import pypdfocr.pypdfocr as P 3 | import pytest 4 | import os 5 | import logging 6 | 7 | from PyPDF2 import PdfFileReader 8 | import smtplib 9 | from mock import Mock 10 | from mock import patch, call 11 | from mock import MagicMock 12 | from mock import PropertyMock 13 | 14 | 15 | class TestPydfocr: 16 | 17 | def setup(self): 18 | self.p = P.PyPDFOCR() 19 | 20 | def _iter_pdf(self, filename): 21 | with open(filename, 'rb') as f: 22 | reader = PdfFileReader(f) 23 | logging.debug("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename)) 24 | for pgnum in range(reader.getNumPages()): 25 | text = 
reader.getPage(pgnum).extractText() 26 | text = text.encode('ascii', 'ignore') 27 | text = text.replace('\n', ' ') 28 | yield text 29 | 30 | pdf_tests = [ 31 | (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "test_recipe.pdf"), [ ["Simply Recipes"], 32 | ]), 33 | (".", os.path.join("temp","target","patents"), os.path.join("pdfs","test_patent.pdf"), [ 34 | ["asynchronous", "subject to", "20 Claims"], # Page 1 35 | ["FOREIGN PATENT" ], # Page 2 36 | ]), 37 | (".", os.path.join("temp","target", "default"), os.path.join("pdfs","test_sherlock.pdf"), [ ["Bohemia", "Trincomalee"], # Page 1 38 | ["hundreds of times" ], # Page 2 39 | ]), 40 | ("pdfs", os.path.join("temp","target","default"), "test_sherlock.pdf", [ ["Bohemia", "Trincomalee"], # Page 1 41 | ["hundreds of times" ], # Page 2 42 | ]), 43 | (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "1.pdf"), [ ["Simply","Recipes"], 44 | ]), 45 | (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "test_recipe_sideways.pdf"), [ ["Simply","Recipes", 'spinach'], 46 | ]), 47 | ] 48 | 49 | #@pytest.mark.skipif(True, reason="Just testing") 50 | @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", pdf_tests) 51 | def test_standalone(self, dirname, tgt_folder, filename, expected): 52 | """ 53 | Test the single file conversion with no filing. 54 | Tests relative paths (".."), files in subdirs, and files in current dir 55 | Checks that the _ocr file is created and that the keywords are found in the pdf. 56 | Modify :attribute:`pdf_tests` for changing keywords, etc 57 | 58 | :param expected: List of keyword lists, one per page. 
expected[0][1] is the second keyword to assert on page 1 59 | """ 60 | # Run a single file conversion 61 | 62 | # First redo the unix-style paths, in case we're running on windows 63 | # Assume paths in unix-style 64 | dirname = os.path.join(*(dirname.split("/"))) 65 | tgt_folder = os.path.join(*(tgt_folder.split("/"))) 66 | filename = os.path.join(*(filename.split("/"))) 67 | 68 | 69 | cwd = os.getcwd() 70 | os.chdir(dirname) 71 | opts = [filename, '--skip-preprocess'] 72 | self.p.go(opts) 73 | 74 | out_filename = filename.replace(".pdf", "_ocr.pdf") 75 | assert(os.path.exists(out_filename)) 76 | for i,t in enumerate(self._iter_pdf(out_filename)): 77 | if len(expected) > i: 78 | for keyword in expected[i]: 79 | assert(keyword in t) 80 | print ("\n----------------------\nPage %d\n" % i) 81 | print t 82 | os.remove(out_filename) 83 | os.chdir(cwd) 84 | 85 | #@pytest.mark.skipif(True, reason="just testing") 86 | @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", [pdf_tests[0]]) 87 | def test_standalone_email(self, dirname, tgt_folder, filename, expected): 88 | """ 89 | Get coverage on the email after conversion of a single file. 
90 | Use mock to stub out smtplib 91 | """ 92 | # Run a single file conversion 93 | 94 | # Mock the smtplib to test the email functions 95 | with patch("smtplib.SMTP") as mock_smtp: 96 | cwd = os.getcwd() 97 | os.chdir(dirname) 98 | opts = [filename, "--preprocess", "--config=test_pypdfocr_config.yaml", "-m"] 99 | self.p.go(opts) 100 | 101 | out_filename = filename.replace(".pdf", "_ocr.pdf") 102 | assert(os.path.exists(out_filename)) 103 | for i,t in enumerate(self._iter_pdf(out_filename)): 104 | if len(expected) > i: 105 | for keyword in expected[i]: 106 | assert(keyword in t) 107 | print ("\n----------------------\nPage %d\n" % i) 108 | print t 109 | os.remove(out_filename) 110 | os.chdir(cwd) 111 | 112 | # Assert the smtp calls 113 | instance = mock_smtp.return_value 114 | assert(instance.starttls.called) 115 | instance.login.assert_called_once_with("someone@gmail.com", "blah") 116 | assert(instance.sendmail.called) 117 | 118 | @patch('shutil.move') 119 | @pytest.mark.parametrize("config", [("test_pypdfocr_config.yaml"), ("test_pypdfocr_config_no_move_original.yaml")]) 120 | @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", pdf_tests[0:3]) 121 | def test_standalone_filing(self, mock_move, config, dirname, tgt_folder, filename, expected): 122 | """ 123 | Test filing of a single pdf. Also test moving of the original file. 
124 | 125 | Kind of hacked up right now, but it tries to test a lot of things (maybe too many) 126 | """ 127 | 128 | # shutil.move is mocked by the decorator so nothing is actually filed; first remove any stale target dirs under temp 129 | cwd = os.getcwd() 130 | if os.path.exists("temp"): 131 | os.chdir("temp") 132 | for d in [os.path.join('target', 'patents'), os.path.join('target','recipe')]: 133 | if os.path.exists(d): 134 | os.removedirs(d) 135 | os.chdir(cwd) 136 | 137 | os.chdir(dirname) 138 | print("Current direcxtory: %s" % os.getcwd()) 139 | #opts = [filename, "--config=test_pypdfocr_config.yaml", "-f"] 140 | opts = [filename, '--skip-preprocess', "--config=%s" % config, "-f"] 141 | self.p.go(opts) 142 | 143 | out_filename = filename.replace(".pdf", "_ocr.pdf") 144 | assert(os.path.exists(out_filename)) 145 | for i,t in enumerate(self._iter_pdf(out_filename)): 146 | if len(expected) > i: 147 | for keyword in expected[i]: 148 | assert(keyword in t) 149 | print ("\n----------------------\nPage %d\n" % i) 150 | print t 151 | os.remove(out_filename) 152 | os.chdir(cwd) 153 | 154 | # Assert the expected shutil.move calls: the filed output, plus the original unless the config is a no_move_original one 155 | calls = [call(out_filename, 156 | os.path.abspath(os.path.join(tgt_folder,os.path.basename(out_filename))))] 157 | if not "no_move_original" in config: 158 | new_file_name = os.path.basename(filename).replace(".pdf", "_2.pdf") 159 | calls.append(call(filename, 160 | os.path.abspath(os.path.join("temp","original", new_file_name)))) 161 | mock_move.assert_has_calls(calls) 162 | 163 | def test_set_binaries(self): 164 | """ Test that _setup_external_tools picks up the tesseract and ghostscript binaries from the config 165 | """ 166 | self.p.config = {} 167 | self.p.config["tesseract"] = {"binary":"/usr/bin/tesseract"} 168 | self.p.config["ghostscript"] = {"binary":"/usr/bin/ghostscript"} 169 | self.p._setup_external_tools() 170 | if not os.name == 'nt': 171 | assert(self.p.ts.binary == "/usr/bin/tesseract") 172 | assert(self.p.gs.binary == "/usr/bin/ghostscript") 173 | else: 174 | assert(self.p.ts.binary == '"/usr/bin/tesseract"') 175 | assert(self.p.gs.binary == 
'"/usr/bin/ghostscript"') 176 | 177 | 178 | -------------------------------------------------------------------------------- /test/test_pypdfocr_config.yaml: -------------------------------------------------------------------------------- 1 | target_folder: "temp/target" 2 | default_folder: "temp/target/default" 3 | original_move_folder: "temp/original" 4 | 5 | mail_smtp_server: "smtp.gmail.com:587" 6 | mail_smtp_login: "someone@gmail.com" 7 | mail_smtp_password: "blah" 8 | mail_from_addr: "someone#gmail.com" 9 | mail_to_list: 10 | - "someone@gmail.com" 11 | 12 | folders: 13 | recipe: 14 | - recipes 15 | patents: 16 | - patent 17 | - 2003 18 | 19 | -------------------------------------------------------------------------------- /test/test_pypdfocr_config_filename.yaml: -------------------------------------------------------------------------------- 1 | target_folder: "temp/target" 2 | default_folder: "temp/target/default" 3 | 4 | mail_smtp_server: "smtp.gmail.com:587" 5 | mail_smtp_login: "someone@gmail.com" 6 | mail_smtp_password: "blah" 7 | mail_from_addr: "someone#gmail.com" 8 | mail_to_list: 9 | - "someone@gmail.com" 10 | 11 | folders: 12 | recipe: 13 | - super_long_keyword 14 | 15 | -------------------------------------------------------------------------------- /test/test_pypdfocr_config_no_move_original.yaml: -------------------------------------------------------------------------------- 1 | target_folder: "temp/target" 2 | default_folder: "temp/target/default" 3 | 4 | mail_smtp_server: "smtp.gmail.com:587" 5 | mail_smtp_login: "someone@gmail.com" 6 | mail_smtp_password: "blah" 7 | mail_from_addr: "someone#gmail.com" 8 | mail_to_list: 9 | - "someone@gmail.com" 10 | 11 | folders: 12 | recipe: 13 | - recipes 14 | patents: 15 | - patent 16 | 17 | -------------------------------------------------------------------------------- /test/test_tesseract.py: -------------------------------------------------------------------------------- 1 | #from pypdfocr import 
PyPDFOCR as P 2 | import pypdfocr.pypdfocr_tesseract as P 3 | import pytest 4 | import os 5 | 6 | import hashlib 7 | 8 | from mock import patch, call 9 | 10 | class TestTesseract: 11 | 12 | @pytest.mark.skipif(os.name=='nt', reason='Does not work on Windows') 13 | def test_version_shorter_older(self): 14 | with patch("subprocess.check_output") as mock_subprocess: 15 | p = P.PyTesseract({}) 16 | p.required = "3.02.02" 17 | mock_subprocess.return_value = """tesseract 3.02""" 18 | uptodate,ver = p._is_version_uptodate() 19 | assert (not uptodate) 20 | 21 | def test_version_minor_older(self): 22 | with patch("subprocess.check_output") as mock_subprocess: 23 | p = P.PyTesseract({}) 24 | p.required = "3.02.02" 25 | mock_subprocess.return_value = """tesseract 3.02.01""" 26 | uptodate,ver = p._is_version_uptodate() 27 | assert (not uptodate) 28 | 29 | def test_version_major_older(self): 30 | with patch("subprocess.check_output") as mock_subprocess: 31 | p = P.PyTesseract({}) 32 | p.required = "3.02.02" 33 | mock_subprocess.return_value = """tesseract 2.03.03""" 34 | uptodate,ver = p._is_version_uptodate() 35 | assert (not uptodate) 36 | 37 | @pytest.mark.skipif(os.name=='nt', reason='Does not work on Windows') 38 | def test_version_major_equal(self): 39 | with patch("subprocess.check_output") as mock_subprocess: 40 | p = P.PyTesseract({}) 41 | p.required = "3.02.02" 42 | mock_subprocess.return_value = """tesseract 3.02.02""" 43 | uptodate,ver = p._is_version_uptodate() 44 | assert (uptodate) 45 | 46 | def test_version_major_newer(self): 47 | with patch("subprocess.check_output") as mock_subprocess: 48 | p = P.PyTesseract({}) 49 | p.required = "3.02.02" 50 | 51 | mock_subprocess.return_value = """tesseract 4.01""" 52 | uptodate,ver = p._is_version_uptodate() 53 | assert (uptodate) 54 | 55 | def test_version_minor_newer(self): 56 | with patch("subprocess.check_output") as mock_subprocess: 57 | p = P.PyTesseract({}) 58 | p.required = "3.01.02" 59 | 60 | 
mock_subprocess.return_value = """tesseract 3.02""" 61 | uptodate,ver = p._is_version_uptodate() 62 | assert (uptodate) 63 | 64 | 65 | def test_tesseract_presence(self, capsys): 66 | p = P.PyTesseract({}) 67 | p.binary = "tesserac" # Misspell it and make sure we get an error 68 | with pytest.raises(SystemExit): 69 | p._is_version_uptodate() 70 | out, err = capsys.readouterr() 71 | assert p.msgs['TS_MISSING'] in out 72 | 73 | def test_tesseract_version(self, capsys): 74 | p = P.PyTesseract({}) 75 | p.required = "100" 76 | with pytest.raises(SystemExit): 77 | p.make_hocr_from_pnms("") 78 | out, err = capsys.readouterr() 79 | assert p.msgs['TS_VERSION'] in out 80 | 81 | def test_tiff_file_check(self, capsys): 82 | p = P.PyTesseract({}) 83 | with pytest.raises(SystemExit): 84 | p.make_hocr_from_pnm("DUMMY_NOTPRESENT.tiff") 85 | out, err = capsys.readouterr() 86 | assert p.msgs['TS_img_MISSING'] in out 87 | 88 | @patch('os.name') 89 | @patch('subprocess.check_output') 90 | def test_tesseract_version_nt(self, mock_subprocess, mock_os_name): 91 | """ 92 | Stupid test because Windows Tesseract only returns 3.02 instead of 3.02.02 93 | """ 94 | mock_os_name.__str__.return_value = 'nt' 95 | p = P.PyTesseract({}) 96 | p.required = "3.02.02" 97 | 98 | mock_subprocess.return_value = """tesseract 3.02""" 99 | uptodate,ver = p._is_version_uptodate() 100 | assert (uptodate) 101 | 102 | @patch('pypdfocr.pypdfocr_tesseract.PyTesseract._is_version_uptodate') 103 | @patch('pypdfocr.pypdfocr_tesseract.os.name') 104 | @patch('pypdfocr.pypdfocr_tesseract.os.path.exists') 105 | def test_force_Nt(self, mock_os_path_exists, mock_os_name, mock_uptodate, capsys): 106 | mock_os_name.__str__.return_value = 'nt' 107 | p = P.PyTesseract({}) 108 | assert ('tesseract.exe' in p.binary) 109 | 110 | mock_os_path_exists.return_value = True 111 | mock_uptodate.return_value = (True,"") 112 | # force a bad tesseract on windows 113 | p.binary = "blah" 114 | print("here") 115 | with 
pytest.raises(SystemExit): 116 | p.make_hocr_from_pnm('blah.tiff') 117 | 118 | @patch('pypdfocr.pypdfocr_tesseract.subprocess.call') 119 | @patch('pypdfocr.pypdfocr_tesseract.PyTesseract._is_version_uptodate') 120 | @patch('pypdfocr.pypdfocr_tesseract.os.name') 121 | @patch('pypdfocr.pypdfocr_tesseract.os.path.exists') 122 | def test_tesseract_fail(self, mock_os_path_exists, mock_os_name, mock_uptodate, mock_subprocess_call,capsys): 123 | """ 124 | Get all the checks past and make sure we report the case where tesseract returns a non-zero status 125 | """ 126 | mock_os_name.__str__.return_value = 'nt' 127 | p = P.PyTesseract({}) 128 | assert ('tesseract.exe' in p.binary) 129 | 130 | mock_os_path_exists.return_value = True 131 | mock_uptodate.return_value = (True,"") 132 | mock_subprocess_call.return_value = -1 133 | with pytest.raises(SystemExit): 134 | p.make_hocr_from_pnm('blah.tiff') 135 | 136 | out, err = capsys.readouterr() 137 | assert p.msgs['TS_FAILED'] in out 138 | 139 | -------------------------------------------------------------------------------- /test/test_watcher.py: -------------------------------------------------------------------------------- 1 | #from pypdfocr import PyPDFOCR as P 2 | import pypdfocr.pypdfocr_watcher as P 3 | import pytest 4 | 5 | import evernote.api.client 6 | import evernote.edam.type.ttypes as Types 7 | import hashlib 8 | import time 9 | import os 10 | from collections import namedtuple 11 | 12 | from mock import patch, call 13 | 14 | class TestWatching: 15 | 16 | 17 | filenames = [ ("test_recipe.pdf", "test_recipe.pdf"), 18 | (os.path.join("..","test_recipe.pdf"), os.path.join("..","test_recipe.pdf")), 19 | (os.path.join("/", "Volumes","Media", "test_recipe.pdf"), os.path.join("/","Volumes", "Media", "test_recipe.pdf")), 20 | (os.path.join("/", "Volumes", "Media", "test recipe.pdf"), os.path.join("/","Volumes","Media","test_recipe.pdf")), 21 | (os.path.join("..","V olumes","Media", "test recipe.pdf"), os.path.join("..", "V 
olumes","Media", "test_recipe.pdf")), 22 | ] 23 | 24 | @patch('shutil.move') 25 | @pytest.mark.parametrize(("filename, expected"), filenames) 26 | def test_rename(self, mock_move, filename, expected): 27 | 28 | if expected == None: 29 | expected = filename 30 | 31 | p = P.PyPdfWatcher('temp',{}) 32 | 33 | # First, test code that does not move original 34 | ret = p.rename_file_with_spaces(filename) 35 | assert (ret==expected) 36 | 37 | def test_check_for_new_pdf(self): 38 | 39 | p = P.PyPdfWatcher('temp', {}) 40 | p.check_for_new_pdf("blah_ocr.pdf") 41 | assert("blah_ocr.pdf" not in p.events) 42 | p.check_for_new_pdf("blah.pdf") 43 | assert("blah.pdf" in p.events) 44 | p.events['blah.pdf'] = -1 45 | p.check_for_new_pdf("blah.pdf") 46 | assert("blah.pdf" not in p.events) 47 | p.check_for_new_pdf("blah.pdf") 48 | time.sleep(p.scan_interval+1) 49 | p.check_for_new_pdf("blah.pdf") 50 | assert(p.events['blah.pdf']-time.time() <=1) # Check that time stamp was updated 51 | 52 | def test_events(self): 53 | p = P.PyPdfWatcher('temp', {}) 54 | 55 | event = namedtuple('event', 'src_path, dest_path') 56 | 57 | p.on_created(event(src_path='temp_recipe.pdf', dest_path=None)) 58 | assert('temp_recipe.pdf' in p.events) 59 | 60 | p.on_moved(event(src_path=None, dest_path='temp_recipe2.pdf')) 61 | assert('temp_recipe2.pdf' in p.events) 62 | 63 | p.on_modified(event(src_path='temp_recipe3.pdf', dest_path=None)) 64 | assert('temp_recipe3.pdf' in p.events) 65 | 66 | def test_check_queue(self): 67 | p = P.PyPdfWatcher('temp', {}) 68 | now = time.time() 69 | p.events['blah.pdf'] = now 70 | f = p.check_queue() 71 | assert (not f) 72 | assert ('blah.pdf' in p.events) 73 | time.sleep(p.scan_interval+1) 74 | f = p.check_queue() 75 | assert (f=='blah.pdf') 76 | assert ('blah.pdf' in p.events) 77 | assert (p.events['blah.pdf'] == -1) 78 | f = p.check_queue() 79 | assert ('blah.pdf' not in p.events) 80 | 81 | --------------------------------------------------------------------------------