├── .coveragerc
├── .gitignore
├── .travis.yml
├── CHANGES.rst
├── CHANGES_RECENT.rst
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── TODO.rst
├── dist
    └── pypdfocr.exe
├── docs
    ├── Makefile
    ├── conf.py
    ├── index.rst
    ├── make.bat
    └── pypdfocr.rst
├── fabfile.py
├── pypdfocr.spec
├── pypdfocr
    ├── __init__.py
    ├── pypdfocr.py
    ├── pypdfocr.spec
    ├── pypdfocr_filer.py
    ├── pypdfocr_filer_dirs.py
    ├── pypdfocr_filer_evernote.py
    ├── pypdfocr_gs.py
    ├── pypdfocr_interrupts.py
    ├── pypdfocr_multiprocessing.py
    ├── pypdfocr_pdf.py
    ├── pypdfocr_pdffiler.py
    ├── pypdfocr_preprocess.py
    ├── pypdfocr_tesseract.py
    ├── pypdfocr_util.py
    ├── pypdfocr_watcher.py
    └── version.py
├── pypdfocr_windows.spec
├── requirements.txt
├── setup.py
└── test
    ├── pdfs
        ├── 1.pdf
        ├── test.pdf
        ├── test_cinderella.pdf
        ├── test_patent.pdf
        ├── test_recipe.pdf
        ├── test_recipe_sideways.pdf
        ├── test_sherlock.pdf
        └── test_super_long_keyword.pdf
    ├── runtests.py
    ├── temp
        └── original
        │   ├── test_patent.pdf
        │   ├── test_patent_1.pdf
        │   ├── test_recipe.pdf
        │   ├── test_recipe_1.pdf
        │   ├── test_sherlock.pdf
        │   └── test_sherlock_1.pdf
    ├── test_evernote.py
    ├── test_gs.py
    ├── test_option_config.yaml
    ├── test_option_parsing.py
    ├── test_pdf_filer.py
    ├── test_pypdfocr.py
    ├── test_pypdfocr_config.yaml
    ├── test_pypdfocr_config_filename.yaml
    ├── test_pypdfocr_config_no_move_original.yaml
    ├── test_tesseract.py
    └── test_watcher.py


/.coveragerc:
--------------------------------------------------------------------------------
1 | [report]
2 | exclude_lines = 
3 | 
4 |     pragma: no cover
5 |     if __name__ == '__main__':
6 |     def error(text):
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .*
3 | *~
4 | *.hocr
5 | *.jpg
6 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |     - "2.7"
 4 | install: 
 5 |     - "pip install -r requirements.txt --use-mirrors"
 6 |     - "pip install pytest mock --use-mirrors"
 7 |     - "pip install ."
 8 | script: 
 9 |     - "python setup.py test"
10 | 


--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
 1 | =======  ========   ======
 2 | Version  Date       Changes
 3 | -------  --------   ------
 4 | 
 5 | v0.9.1   10/11/16   Fixes (#43, #41)
 6 | v0.9.0   2/29/16    Fixed rotated page text, Mac OS X invisible fonts, and pdf merge slowdown
 7 | v0.8.5   2/21/16    Better ctrl-c and cleanup behavior
 8 | v0.8.4   2/18/16    Maintenance release
 9 | v0.8.3   2/18/16    Bug fix for multiprocessing on windows, ctrl-c interrupt, and integer keywords
10 | v0.8.2   12/8/14    Fixed imagemagick invocation on windows.  Parallelized preprocessing and tesseract execution
11 | v0.8.1   12/5/14    Added --skip-preprocess option, scan_interval option, and fixed too many open files bug during page overlay
12 | v0.8.0   10/27/14   Added preprocessing to clean up prior to tesseract, bug fixes on file names with spaces/dots
13 | v0.7.6   9/10/14    Fixed issue 17 rotation bug
14 | v0.7.5   8/18/14    Update for Tesseract 3.03 .hocr filename change
15 | v0.7.4   3/28/14    Bug fix on pdf assembly
16 | v0.7.3   3/27/14    Modified internals to use single image per page (instead of multipage tiff). Also enabled orientation detection
17 | v0.7.2   3/26/14    Switched from Pil to Pillow. Now uses original images from PDF in output pdf (no dpi/color/quality changes!)
18 | v0.7.1   3/25/14    OCR Language is now an option
19 | v0.7.0   3/25/14    Now honors original pdf resolution
20 | v0.6.1   2/16/14    Bug fix for pdfs with only numbers in the filename
21 | v0.6.0   1/16/14    Added filing based on filename match as fallback, added tesseract version check
22 | v0.5.4   1/12/14    Fixed bug with reordering of text pages on certain platforms(glob)
23 | v0.5.3   12/12/13   Fix to evernote server specification
24 | v0.5.2   12/08/13   Fix to lowercase keywords
25 | v0.5.1   11/02/13   Fixed a bunch of windows critical path handling issues
26 | v0.5.0   10/30/13   Email status added, 90% test coverage
27 | v0.4.1   10/28/13   Made HOCR parsing more robust
28 | v0.4.0   10/28/13   Added early Evernote upload support
29 | v0.3.1   10/24/13   Path fix on windows
30 | v0.3.0   10/23/13   Added filing of converted pdfs using a configuration file to specify target directories based on keyword matches in the pdf text
31 | v0.2.2   10/22/13   Added a console script to put the pypdfocr script into your bin
32 | v0.2.1   10/22/13   Fix to initial packaging problem.
33 | v0.2.0   10/21/13   Initial release.
34 | =======  ========   ======
35 | 


--------------------------------------------------------------------------------
/CHANGES_RECENT.rst:
--------------------------------------------------------------------------------
 1 | =======  ========   ======
 2 | Version  Date       Changes
 3 | -------  --------   ------
 4 | 
 5 | v0.9.0   2/29/16    Fixed rotated page text, Mac OS X invisible fonts, and pdf merge slowdown
 6 | v0.8.5   2/21/16    Better ctrl-c and cleanup behavior
 7 | v0.8.4   2/18/16    Maintenance release
 8 | v0.8.3   2/18/16    Bug fix for multiprocessing on windows, ctrl-c interrupt, and integer keywords
 9 | v0.8.2   12/8/14    Fixed imagemagick invocation on windows.  Parallelized preprocessing and tesseract execution
10 | v0.8.1   12/5/14    Added --skip-preprocess option, scan_interval option, and fixed too many open files bug during page overlay
11 | =======  ========   ======
12 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [2013] [Virantha Ekanayake]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | include *.rst
3 | 
4 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | PyPDFOCR - Tesseract-OCR based PDF filing
  2 | =========================================
  3 | 
  4 | |image0| |image1| |image2| |passing| |quality| |Coverage Status|
  5 | 
  6 | This program will help manage your scanned PDFs by doing the following:
  7 | 
  8 | -  Take a scanned PDF file and run OCR on it (using the Tesseract OCR
  9 |    software from Google), generating a searchable PDF
 10 | -  Optionally, watch a folder for incoming scanned PDFs and
 11 |    automatically run OCR on them
 12 | -  Optionally, file the scanned PDFs into directories based on simple
 13 |    keyword matching that you specify
 14 | -  Evernote auto-upload and filing based on keyword search
 15 | -  Email status when it files your PDF
 16 | 
 17 | More links:
 18 | 
 19 | -  `Blog @ virantha.com <http://virantha.com/category/pypdfocr.html>`__
 20 | -  `Documentation @ gitpages <http://virantha.github.com/pypdfocr/html>`__
 21 | -  `Source @ github <https://www.github.com/virantha/pypdfocr>`__
 22 | 
 23 | Usage:
 24 | ######
 25 | 
 26 | Single conversion:
 27 | ~~~~~~~~~~~~~~~~~~
 28 | 
 29 | ::
 30 | 
 31 |     pypdfocr filename.pdf
 32 | 
 33 |     --> filename_ocr.pdf will be generated
 34 | 
 35 | If you have a language pack installed, then you can specify it with the
 36 | ``-l`` option:
 37 | 
 38 | ::
 39 | 
 40 |     pypdfocr -l spa filename.pdf
 41 | 
 42 | Folder monitoring:
 43 | ~~~~~~~~~~~~~~~~~~
 44 | 
 45 | ::
 46 | 
 47 |     pypdfocr -w watch_directory
 48 | 
 49 |     --> Every time a pdf file is added to `watch_directory` it will be OCR'ed
 50 | 
 51 | Automatic filing:
 52 | ~~~~~~~~~~~~~~~~~
 53 | 
 54 | To automatically move the OCR'ed pdf to a directory based on a keyword,
 55 | use the -f option and specify a configuration file (described below):
 56 | 
 57 | ::
 58 | 
 59 |     pypdfocr filename.pdf -f -c config.yaml
 60 | 
 61 | You can also do this in folder monitoring mode:
 62 | 
 63 | ::
 64 | 
 65 |     pypdfocr -w watch_directory -f -c config.yaml
 66 | 
 67 | Filing based on filename match:
 68 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 69 | 
 70 | If no keywords match the contents of the PDF, you can optionally
 71 | allow it to fall back to trying to find keyword matches with the PDF
 72 | filename using the -n option. For example, you may have receipts always
 73 | named as ``receipt_2013_12_2.pdf`` by your scanner, and you want to move
 74 | this to a folder called 'receipts'. Assuming you have a keyword
 75 | ``receipt`` matching to folder ``receipts`` in your configuration file
 76 | as described below, you can run the following and have this filed even
 77 | if the content of the pdf does not contain the text 'receipt':
 78 | 
 79 | ::
 80 | 
 81 |     pypdfocr filename.pdf -f -c config.yaml -n
 82 | 
 83 | Configuration file for automatic PDF filing
 84 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 85 | 
 86 | The config.yaml file above is a simple folder-to-keyword matching text
 87 | file. It determines where your OCR'ed PDFs (and optionally, the original
 88 | scanned PDF) are placed after processing. An example is given below:
 89 | 
 90 | ::
 91 | 
 92 |     target_folder: "docs/filed"
 93 |     default_folder: "docs/filed/manual_sort"
 94 |     original_move_folder: "docs/originals"
 95 | 
 96 |     folders:
 97 |         finances:
 98 |             - american express
 99 |             - chase card
100 |             - internal revenue service
101 |         travel:
102 |             - boarding pass
103 |             - airlines
104 |             - expedia
105 |             - orbitz
106 |         receipts:
107 |             - receipt
108 | 
109 | The ``target_folder`` is the root of your filing cabinet. All PDF filing
110 | will happen in sub-directories under this directory.
111 | 
112 | The ``folders`` section defines your filing directories and the keywords
113 | associated with them. In this example, we have three filing directories
114 | (finances, travel, receipts), and some associated keywords for each
115 | filing directory. For example, if your OCR'ed PDF contains the phrase
116 | "american express" (in any upper/lower case), it will be filed into
117 | ``docs/filed/finances``
118 | 
119 | The ``default_folder`` is where the OCR'ed PDF is moved to if there is
120 | no keyword match.
121 | 
122 | The ``original_move_folder`` is optional (you can comment it out with
123 | ``#`` in front of that line), but if specified, the original scanned PDF
124 | is moved into this directory after OCR is done. Otherwise, if this field
125 | is not present or commented out, your original PDF will stay where it
126 | was found.
127 | 
128 | If there is any naming conflict during filing, the program will add an
129 | underscore followed by a number to each filename, in order to avoid
130 | overwriting files that may already be present.
131 | 
132 | Evernote upload:
133 | ~~~~~~~~~~~~~~~~
134 | 
135 | Evernote authentication token
136 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
137 | 
138 | To enable Evernote support, you will need to `get a developer token for
139 | your Evernote
140 | account <https://www.evernote.com/api/DeveloperToken.action>`__. You
141 | should note that this script will never delete or modify existing notes
142 | in your account, and limits itself to creating new Notebooks and Notes.
143 | Once you get that token, you copy and paste it into your configuration
144 | file as shown below.
145 | 
146 | Evernote filing usage
147 | ^^^^^^^^^^^^^^^^^^^^^
148 | 
149 | To automatically upload the OCR'ed pdf to a folder based on a keyword,
150 | use the ``-e`` option instead of the ``-f`` auto filing option.
151 | 
152 | ::
153 | 
154 |     pypdfocr filename.pdf -e -c config.yaml
155 | 
156 | Similarly, you can also do this in folder monitoring mode:
157 | 
158 | ::
159 | 
160 |     pypdfocr -w watch_directory -e -c config.yaml
161 | 
162 | Evernote filing configuration file
163 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
164 | 
165 | The config file shown above only needs to change slightly. The folders
166 | section is completely unchanged, but note that ``target_folder`` is the
167 | name of your "Notebook stack" in Evernote, and the ``default_folder``
168 | should just be the default Evernote upload notebook name.
169 | 
170 | ::
171 | 
172 |     target_folder: "evernote_stack"
173 |     default_folder: "default"
174 |     original_move_folder: "docs/originals"
175 |     evernote_developer_token: "YOUR_TOKEN"
176 | 
177 |     folders:
178 |         finances:
179 |             - american express
180 |             - chase card
181 |             - internal revenue service
182 |         travel:
183 |             - boarding pass
184 |             - airlines
185 |             - expedia
186 |             - orbitz
187 |         receipts:
188 |             - receipt
189 | 
190 | Auto email
191 | ~~~~~~~~~~
192 | 
193 | You can have PyPDFOCR email you every time it converts a file and files
194 | it. You need to first specify the following lines in the configuration
195 | file and then use the ``-m`` option when invoking ``pypdfocr``:
196 | 
197 | ::
198 | 
199 |     mail_smtp_server: "smtp.gmail.com:587"
200 |     mail_smtp_login: "virantha@gmail.com"
201 |     mail_smtp_password: "PASSWORD"
202 |     mail_from_addr: "virantha@gmail.com"
203 |     mail_to_list: 
204 |         - "virantha@gmail.com"
205 |         - "person2@gmail.com"
206 | 
207 | 
208 | Advanced options
209 | ################
210 | 
211 | Fine-tuning Tesseract/Ghostscript/others
212 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
213 | 
214 | You can specify Tesseract and Ghostscript executable locations manually, as
215 | well as the number of concurrent processes allowed during preprocessing and
216 | tesseract.  Use the following in your configuration file:
217 | 
218 | ::
219 | 
220 |     tesseract:
221 |         binary: "/usr/bin/tesseract"
222 |         threads: 8
223 | 
224 |     ghostscript:
225 |         binary: "/usr/local/bin/gs"
226 | 
227 |     preprocess:
228 |         threads: 8
229 | 
230 | Handling disk time-outs
231 | ~~~~~~~~~~~~~~~~~~~~~~~
232 | If you need to increase the time interval (default 3 seconds) between new
233 | document scans when pypdfocr is watching a directory, you can specify the following
234 | option in the configuration file:
235 | 
236 | ::
237 |     
238 |     watch:
239 |         scan_interval: 6
240 | 
241 | Installation
242 | ############
243 | 
244 | Using pip
245 | ~~~~~~~~~
246 | 
247 | PyPDFOCR is available in PyPI, so you can just run:
248 | 
249 | ::
250 | 
251 |     pip install pypdfocr
252 | 
253 | Please note that some of the 3rd-party libraries required by PyPDFOCR will
254 | require some build tools, especially on a default Ubuntu system.  If you run
255 | into any issues using pip install, you may want to install the
256 | following packages on Ubuntu and try again:
257 | 
258 | - gcc
259 | - libjpeg-dev
260 | - zlib-bin
261 | - zlib1g-dev
262 | - python-dev
263 | 
264 | For those on **Windows**, because it's such a pain to get all the PIL
265 | and PDF dependencies installed, I've gone ahead and made an executable
266 | called
267 | `pypdfocr.exe <https://github.com/virantha/pypdfocr/blob/master/dist/pypdfocr.exe?raw=true>`__.
268 | 
269 | You still need to install Tesseract, GhostScript, etc. as detailed below in
270 | the external dependencies list.
271 | 
272 | Manual install
273 | ~~~~~~~~~~~~~~
274 | 
275 | Clone the source directly from github (you need to have git installed):
276 | 
277 | ::
278 | 
279 |     git clone https://github.com/virantha/pypdfocr.git
280 | 
281 | Then, install the following third-party python libraries:
282 | 
283 | -  Pillow (Python Imaging Library) https://pillow.readthedocs.org/en/3.1.x/
284 | -  ReportLab (PDF generation library)
285 |    http://www.reportlab.com/opensource/
286 | -  Watchdog (Cross-platform filesystem events monitoring)
287 |    https://pypi.python.org/pypi/watchdog
288 | -  PyPDF2 (Pure python pdf library)
289 | 
290 | These can all be installed via pip:
291 | 
292 | ::
293 | 
294 |     pip install Pillow
295 |     pip install reportlab
296 |     pip install watchdog
297 |     pip install pypdf2
298 | 
299 | 
300 | You will also need to install the external dependencies listed below.
301 | 
302 | External Dependencies
303 | ~~~~~~~~~~~~~~~~~~~~~
304 | 
305 | PyPDFOCR relies on the following (free) programs being installed and in
306 | the path:
307 | 
308 | -  Tesseract OCR software https://code.google.com/p/tesseract-ocr/
309 | -  GhostScript http://www.ghostscript.com/
310 | -  ImageMagick http://www.imagemagick.org/
311 | -  Poppler http://poppler.freedesktop.org/  (`Windows <http://sourceforge.net/projects/poppler-win32/>`__)
312 | 
313 | Poppler is only required if you want pypdfocr to figure out the original PDF resolution
314 | automatically; just make sure you have ``pdfimages`` in your path.   Note that the 
315 | `xpdf <http://www.foolabs.com/xpdf/download.html>`__ provided ``pdfimages`` does not work for this, 
316 | because it does not support the ``-list`` option to list the table of images in a PDF file.
317 | 
318 | On Mac OS X, you can install these using homebrew:
319 | 
320 | ::
321 | 
322 |     brew install tesseract
323 |     brew install ghostscript
324 |     brew install poppler
325 |     brew install imagemagick
326 | 
327 | On Windows, please use the installers provided on their download pages.
328 | 
329 | \*\* Important \*\* Tesseract version 3.02.02 or newer is required
330 | (apparently 3.02.01-6 and possibly others do not work due to a hocr
331 | output format change that I'm not planning to address). On Ubuntu, you
332 | may need to compile and install it manually by following `these
333 | instructions <http://miphol.com/muse/2013/05/install-tesseract-ocr-on-ubunt.html>`__.
334 | 
335 | Also note that if you want Tesseract to recognize rotated documents (upside down, or rotated 90 degrees)
336 | then you need to find your tessdata directory and do the following:
337 | 
338 | ::
339 | 
340 |     cd /usr/local/share/tessdata 
341 |     cp eng.traineddata osd.traineddata 
342 | 
343 | ``osd`` stands for Orientation and Script Detection, so you need to copy the .traineddata
344 | for whatever language you want to scan in as ``osd.traineddata``.  If you don't do this step, 
345 | then any landscape document will produce garbage.
346 | 
347 | Disclaimer
348 | ##########
349 | 
350 | While test coverage is at 84% right now, Sphinx docs generation is at an
351 | early stage. The software is distributed on an "AS IS" BASIS, WITHOUT
352 | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
353 | 
354 | .. |image0| image:: https://badge.fury.io/py/pypdfocr.png
355 |    :target: https://pypi.python.org/pypi/pypdfocr
356 | .. |image1| image:: https://pypip.in/d/pypdfocr/badge.png
357 | .. |image2| image:: https://pypip.in/license/pypdfocr/badge.png
358 | .. |passing| image:: https://scrutinizer-ci.com/g/virantha/pypdfocr/badges/build.png?b=master
359 | .. |quality| image:: https://scrutinizer-ci.com/g/virantha/pypdfocr/badges/quality-score.png?b=master
360 | .. |Coverage Status| image:: https://coveralls.io/repos/virantha/pypdfocr/badge.png?branch=develop
361 |    :target: https://coveralls.io/r/virantha/pypdfocr
362 | 


--------------------------------------------------------------------------------
/TODO.rst:
--------------------------------------------------------------------------------
 1 | Todo list
 2 | =========
 3 | 
 4 | - #43 version check for tesseract
 5 | - On Windows, search for pdfimages and imagemagick instead of relying on the PATH
 6 | - Split up into flow steps  
 7 | - Run more robustness tests for watching networked shares
 8 | - Add more docstrings
 9 | - Add more option specifiers to tesseract and ghostscript
10 | 


--------------------------------------------------------------------------------
/dist/pypdfocr.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/dist/pypdfocr.exe


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = /Users/virantha/dev/githubdocs/pypdfocr
  9 | 
 10 | # User-friendly check for sphinx-build
 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
 13 | endif
 14 | 
 15 | # Internal variables.
 16 | PAPEROPT_a4     = -D latex_paper_size=a4
 17 | PAPEROPT_letter = -D latex_paper_size=letter
 18 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 19 | # the i18n builder cannot share the environment and doctrees with the others
 20 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 21 | 
 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf latexpdfja text man texinfo info gettext changes xml pseudoxml linkcheck doctest
 23 | 
 24 | help:
 25 | 	@echo "Please use \`make <target>' where <target> is one of"
 26 | 	@echo "  html       to make standalone HTML files"
 27 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 28 | 	@echo "  singlehtml to make a single large HTML file"
 29 | 	@echo "  pickle     to make pickle files"
 30 | 	@echo "  json       to make JSON files"
 31 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 32 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 33 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 34 | 	@echo "  epub       to make an epub"
 35 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 36 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 37 | 	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
 38 | 	@echo "  text       to make text files"
 39 | 	@echo "  man        to make manual pages"
 40 | 	@echo "  texinfo    to make Texinfo files"
 41 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 42 | 	@echo "  gettext    to make PO message catalogs"
 43 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 44 | 	@echo "  xml        to make Docutils-native XML files"
 45 | 	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
 46 | 	@echo "  linkcheck  to check all external links for integrity"
 47 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 48 | 
 49 | clean:
 50 | 	rm -rf $(BUILDDIR)/*
 51 | 
 52 | html:
 53 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 54 | 	@echo
 55 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 56 | 
 57 | dirhtml:
 58 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 59 | 	@echo
 60 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 61 | 
 62 | singlehtml:
 63 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 64 | 	@echo
 65 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 66 | 
 67 | pickle:
 68 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 69 | 	@echo
 70 | 	@echo "Build finished; now you can process the pickle files."
 71 | 
 72 | json:
 73 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 74 | 	@echo
 75 | 	@echo "Build finished; now you can process the JSON files."
 76 | 
 77 | htmlhelp:
 78 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 79 | 	@echo
 80 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 81 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 82 | 
 83 | qthelp:
 84 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 85 | 	@echo
 86 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 87 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 88 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pypdfocr.qhcp"
 89 | 	@echo "To view the help file:"
 90 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pypdfocr.qhc"
 91 | 
 92 | devhelp:
 93 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 94 | 	@echo
 95 | 	@echo "Build finished."
 96 | 	@echo "To view the help file:"
 97 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/pypdfocr"
 98 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pypdfocr"
 99 | 	@echo "# devhelp"
100 | 
101 | epub:
102 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
103 | 	@echo
104 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
105 | 
106 | latex:
107 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
108 | 	@echo
109 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
110 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
111 | 	      "(use \`make latexpdf' here to do that automatically)."
112 | 
113 | latexpdf:
114 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
115 | 	@echo "Running LaTeX files through pdflatex..."
116 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
117 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
118 | 
119 | latexpdfja:
120 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
121 | 	@echo "Running LaTeX files through platex and dvipdfmx..."
122 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
123 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
124 | 
125 | text:
126 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
127 | 	@echo
128 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
129 | 
130 | man:
131 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
132 | 	@echo
133 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
134 | 
135 | texinfo:
136 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
137 | 	@echo
138 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
139 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
140 | 	      "(use \`make info' here to do that automatically)."
141 | 
142 | info:
143 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
144 | 	@echo "Running Texinfo files through makeinfo..."
145 | 	$(MAKE) -C $(BUILDDIR)/texinfo info
146 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
147 | 
148 | gettext:
149 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
150 | 	@echo
151 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
152 | 
153 | changes:
154 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
155 | 	@echo
156 | 	@echo "The overview file is in $(BUILDDIR)/changes."
157 | 
158 | linkcheck:
159 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
160 | 	@echo
161 | 	@echo "Link check complete; look for any errors in the above output " \
162 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
163 | 
164 | doctest:
165 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
166 | 	@echo "Testing of doctests in the sources finished, look at the " \
167 | 	      "results in $(BUILDDIR)/doctest/output.txt."
168 | 
169 | xml:
170 | 	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
171 | 	@echo
172 | 	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
173 | 
174 | pseudoxml:
175 | 	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
176 | 	@echo
177 | 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
178 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # pypdfocr documentation build configuration file, created by
  4 | # sphinx-quickstart on Wed Oct 23 13:43:29 2013.
  5 | #
  6 | # This file is execfile()d with the current directory set to its
  7 | # containing dir.
  8 | #
  9 | # Note that not all possible configuration values are present in this
 10 | # autogenerated file.
 11 | #
 12 | # All configuration values have a default; values that are commented out
 13 | # serve to show the default.
 14 | 
 15 | import sys
 16 | import os
 17 | import pkg_resources
 18 | 
 19 | # If extensions (or modules to document with autodoc) are in another directory,
 20 | # add these directories to sys.path here. If the directory is relative to the
 21 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 22 | #sys.path.insert(0, os.path.abspath('.'))
 23 | 
 24 | # -- General configuration ------------------------------------------------
 25 | 
 26 | # If your documentation needs a minimal Sphinx version, state it here.
 27 | #needs_sphinx = '1.0'
 28 | 
 29 | # Add any Sphinx extension module names here, as strings. They can be
 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 31 | # ones.
 32 | extensions = [
 33 |     'sphinx.ext.autodoc',
 34 |     'sphinx.ext.viewcode',
 35 | ]
 36 | 
 37 | # Add any paths that contain templates here, relative to this directory.
 38 | templates_path = ['_templates']
 39 | 
 40 | # The suffix of source filenames.
 41 | source_suffix = '.rst'
 42 | 
 43 | # The encoding of source files.
 44 | #source_encoding = 'utf-8-sig'
 45 | 
 46 | # The master toctree document.
 47 | master_doc = 'index'
 48 | 
 49 | # General information about the project.
 50 | project = u'pypdfocr'
 51 | copyright = u'2013, Author'
 52 | 
 53 | # The version info for the project you're documenting, acts as replacement for
 54 | # |version| and |release|, also used in various other places throughout the
 55 | # built documents.
 56 | #
 57 | # The short X.Y version.
 58 | version = ''
 59 | try:
 60 |     release = pkg_resources.get_distribution('pypdfocr').version
 61 | except pkg_resources.DistributionNotFound:
 62 |     print 'To build the documentation, The distribution information of sandman'
 63 |     print 'Has to be available.  Either install the package into your'
 64 |     print 'development environment or run "setup.py develop" to setup the'
 65 |     print 'metadata.  A virtualenv is recommended!'
 66 |     sys.exit(1)
 67 | del pkg_resources
 68 | 
 69 | version = '.'.join(release.split('.')[:2])
 70 | # The full version, including alpha/beta/rc tags.
 71 | 
 72 | # The language for content autogenerated by Sphinx. Refer to documentation
 73 | # for a list of supported languages.
 74 | #language = None
 75 | 
 76 | # There are two options for replacing |today|: either, you set today to some
 77 | # non-false value, then it is used:
 78 | #today = ''
 79 | # Else, today_fmt is used as the format for a strftime call.
 80 | #today_fmt = '%B %d, %Y'
 81 | 
 82 | # List of patterns, relative to source directory, that match files and
 83 | # directories to ignore when looking for source files.
 84 | exclude_patterns = ['_build']
 85 | 
 86 | # The reST default role (used for this markup: `text`) to use for all
 87 | # documents.
 88 | #default_role = None
 89 | 
 90 | # If true, '()' will be appended to :func: etc. cross-reference text.
 91 | #add_function_parentheses = True
 92 | 
 93 | # If true, the current module name will be prepended to all description
 94 | # unit titles (such as .. function::).
 95 | #add_module_names = True
 96 | 
 97 | # If true, sectionauthor and moduleauthor directives will be shown in the
 98 | # output. They are ignored by default.
 99 | #show_authors = False
100 | 
101 | # The name of the Pygments (syntax highlighting) style to use.
102 | pygments_style = 'sphinx'
103 | 
104 | # A list of ignored prefixes for module index sorting.
105 | #modindex_common_prefix = []
106 | 
107 | # If true, keep warnings as "system message" paragraphs in the built documents.
108 | #keep_warnings = False
109 | 
110 | 
111 | # -- Options for HTML output ----------------------------------------------
112 | 
113 | # The theme to use for HTML and HTML Help pages.  See the documentation for
114 | # a list of builtin themes.
115 | html_theme = 'sphinxdoc'
116 | 
117 | # Theme options are theme-specific and customize the look and feel of a theme
118 | # further.  For a list of options available for each theme, see the
119 | # documentation.
120 | #html_theme_options = {}
121 | 
122 | # Add any paths that contain custom themes here, relative to this directory.
123 | #html_theme_path = []
124 | 
125 | # The name for this set of Sphinx documents.  If None, it defaults to
126 | # "<project> v<release> documentation".
127 | #html_title = None
128 | 
129 | # A shorter title for the navigation bar.  Default is the same as html_title.
130 | #html_short_title = None
131 | 
132 | # The name of an image file (relative to this directory) to place at the top
133 | # of the sidebar.
134 | #html_logo = None
135 | 
136 | # The name of an image file (within the static path) to use as favicon of the
137 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
138 | # pixels large.
139 | #html_favicon = None
140 | 
141 | # Add any paths that contain custom static files (such as style sheets) here,
142 | # relative to this directory. They are copied after the builtin static files,
143 | # so a file named "default.css" will overwrite the builtin "default.css".
144 | html_static_path = ['_static']
145 | 
146 | # Add any extra paths that contain custom files (such as robots.txt or
147 | # .htaccess) here, relative to this directory. These files are copied
148 | # directly to the root of the documentation.
149 | #html_extra_path = []
150 | 
151 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
152 | # using the given strftime format.
153 | #html_last_updated_fmt = '%b %d, %Y'
154 | 
155 | # If true, SmartyPants will be used to convert quotes and dashes to
156 | # typographically correct entities.
157 | #html_use_smartypants = True
158 | 
159 | # Custom sidebar templates, maps document names to template names.
160 | #html_sidebars = {}
161 | 
162 | # Additional templates that should be rendered to pages, maps page names to
163 | # template names.
164 | #html_additional_pages = {}
165 | 
166 | # If false, no module index is generated.
167 | #html_domain_indices = True
168 | 
169 | # If false, no index is generated.
170 | #html_use_index = True
171 | 
172 | # If true, the index is split into individual pages for each letter.
173 | #html_split_index = False
174 | 
175 | # If true, links to the reST sources are added to the pages.
176 | #html_show_sourcelink = True
177 | 
178 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
179 | #html_show_sphinx = True
180 | 
181 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
182 | #html_show_copyright = True
183 | 
184 | # If true, an OpenSearch description file will be output, and all pages will
185 | # contain a <link> tag referring to it.  The value of this option must be the
186 | # base URL from which the finished HTML is served.
187 | #html_use_opensearch = ''
188 | 
189 | # This is the file name suffix for HTML files (e.g. ".xhtml").
190 | #html_file_suffix = None
191 | 
192 | # Output file base name for HTML help builder.
193 | htmlhelp_basename = 'pypdfocrdoc'
194 | 
195 | 
196 | # -- Options for LaTeX output ---------------------------------------------
197 | 
198 | latex_elements = {
199 | # The paper size ('letterpaper' or 'a4paper').
200 | #'papersize': 'letterpaper',
201 | 
202 | # The font size ('10pt', '11pt' or '12pt').
203 | #'pointsize': '10pt',
204 | 
205 | # Additional stuff for the LaTeX preamble.
206 | #'preamble': '',
207 | }
208 | 
209 | # Grouping the document tree into LaTeX files. List of tuples
210 | # (source start file, target name, title,
211 | #  author, documentclass [howto, manual, or own class]).
212 | latex_documents = [
213 |   ('index', 'pypdfocr.tex', u'pypdfocr Documentation',
214 |    u'Author', 'manual'),
215 | ]
216 | 
217 | # The name of an image file (relative to this directory) to place at the top of
218 | # the title page.
219 | #latex_logo = None
220 | 
221 | # For "manual" documents, if this is true, then toplevel headings are parts,
222 | # not chapters.
223 | #latex_use_parts = False
224 | 
225 | # If true, show page references after internal links.
226 | #latex_show_pagerefs = False
227 | 
228 | # If true, show URL addresses after external links.
229 | #latex_show_urls = False
230 | 
231 | # Documents to append as an appendix to all manuals.
232 | #latex_appendices = []
233 | 
234 | # If false, no module index is generated.
235 | #latex_domain_indices = True
236 | 
237 | 
238 | # -- Options for manual page output ---------------------------------------
239 | 
240 | # One entry per manual page. List of tuples
241 | # (source start file, name, description, authors, manual section).
242 | man_pages = [
243 |     ('index', 'pypdfocr', u'pypdfocr Documentation',
244 |      [u'Author'], 1)
245 | ]
246 | 
247 | # If true, show URL addresses after external links.
248 | #man_show_urls = False
249 | 
250 | 
251 | # -- Options for Texinfo output -------------------------------------------
252 | 
253 | # Grouping the document tree into Texinfo files. List of tuples
254 | # (source start file, target name, title, author,
255 | #  dir menu entry, description, category)
256 | texinfo_documents = [
257 |   ('index', 'pypdfocr', u'pypdfocr Documentation',
258 |    u'Author', 'pypdfocr', 'One line description of project.',
259 |    'Miscellaneous'),
260 | ]
261 | 
262 | # Documents to append as an appendix to all manuals.
263 | #texinfo_appendices = []
264 | 
265 | # If false, no module index is generated.
266 | #texinfo_domain_indices = True
267 | 
268 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
269 | #texinfo_show_urls = 'footnote'
270 | 
271 | # If true, do not generate a @detailmenu in the "Top" node's menu.
272 | #texinfo_no_detailmenu = False
273 | 
274 | 
275 | # -- Options for Epub output ----------------------------------------------
276 | 
277 | # Bibliographic Dublin Core info.
278 | epub_title = u'pypdfocr'
279 | epub_author = u'Author'
280 | epub_publisher = u'Author'
281 | epub_copyright = u'2013, Author'
282 | 
283 | # The basename for the epub file. It defaults to the project name.
284 | #epub_basename = u'pypdfocr'
285 | 
286 | # The HTML theme for the epub output. Since the default themes are not optimized
287 | # for small screen space, using the same theme for HTML and epub output is
288 | # usually not wise. This defaults to 'epub', a theme designed to save visual
289 | # space.
290 | #epub_theme = 'epub'
291 | 
292 | # The language of the text. It defaults to the language option
293 | # or en if the language is not set.
294 | #epub_language = ''
295 | 
296 | # The scheme of the identifier. Typical schemes are ISBN or URL.
297 | #epub_scheme = ''
298 | 
299 | # The unique identifier of the text. This can be a ISBN number
300 | # or the project homepage.
301 | #epub_identifier = ''
302 | 
303 | # A unique identification for the text.
304 | #epub_uid = ''
305 | 
306 | # A tuple containing the cover image and cover page html template filenames.
307 | #epub_cover = ()
308 | 
309 | # A sequence of (type, uri, title) tuples for the guide element of content.opf.
310 | #epub_guide = ()
311 | 
312 | # HTML files that should be inserted before the pages created by sphinx.
313 | # The format is a list of tuples containing the path and title.
314 | #epub_pre_files = []
315 | 
316 | # HTML files that should be inserted after the pages created by sphinx.
317 | # The format is a list of tuples containing the path and title.
318 | #epub_post_files = []
319 | 
320 | # A list of files that should not be packed into the epub file.
321 | #epub_exclude_files = []
322 | 
323 | # The depth of the table of contents in toc.ncx.
324 | #epub_tocdepth = 3
325 | 
326 | # Allow duplicate toc entries.
327 | #epub_tocdup = True
328 | 
329 | # Choose between 'default' and 'includehidden'.
330 | #epub_tocscope = 'default'
331 | 
332 | # Fix unsupported image types using the PIL.
333 | #epub_fix_images = False
334 | 
335 | # Scale large images.
336 | #epub_max_image_width = 0
337 | 
338 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
339 | #epub_show_urls = 'inline'
340 | 
341 | # If false, no index is generated.
342 | #epub_use_index = True
343 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. pypdfocr documentation master file, created by
 2 |    sphinx-quickstart on Wed Oct 23 13:43:29 2013.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | PyPDFOCR API Reference (version |release|)
 7 | ==========================================
 8 | 
 9 | Contents:
10 | 
11 | .. toctree::
12 |    :maxdepth: 4
13 | 
14 |    pypdfocr
15 | 
16 | Recent Changes
17 | ==============
18 | .. include:: ../CHANGES_RECENT.rst
19 | 
20 | 
21 | Testing
22 | ================
23 |     `Coverage <http://virantha.github.io/pypdfocr/html/testing/index.html>`_
24 | 
25 | .. include:: ../README.rst
26 | 
27 | Changelog
28 | =========
29 | .. include:: ../CHANGES.rst
30 | 
31 | .. include:: ../TODO.rst
32 | 
33 | Indices and tables
34 | ==================
35 | 
36 | * :ref:`genindex`
37 | * :ref:`modindex`
38 | * :ref:`search`
39 | 
40 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
  1 | @ECHO OFF
  2 | 
  3 | REM Command file for Sphinx documentation
  4 | 
  5 | if "%SPHINXBUILD%" == "" (
  6 | 	set SPHINXBUILD=sphinx-build
  7 | )
  8 | set BUILDDIR=_build
  9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
 10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
 11 | if NOT "%PAPER%" == "" (
 12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
 13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
 14 | )
 15 | 
 16 | if "%1" == "" goto help
 17 | 
 18 | if "%1" == "help" (
 19 | 	:help
 20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
 21 | 	echo.  html       to make standalone HTML files
 22 | 	echo.  dirhtml    to make HTML files named index.html in directories
 23 | 	echo.  singlehtml to make a single large HTML file
 24 | 	echo.  pickle     to make pickle files
 25 | 	echo.  json       to make JSON files
 26 | 	echo.  htmlhelp   to make HTML files and a HTML help project
 27 | 	echo.  qthelp     to make HTML files and a qthelp project
 28 | 	echo.  devhelp    to make HTML files and a Devhelp project
 29 | 	echo.  epub       to make an epub
 30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
 31 | 	echo.  text       to make text files
 32 | 	echo.  man        to make manual pages
 33 | 	echo.  texinfo    to make Texinfo files
 34 | 	echo.  gettext    to make PO message catalogs
 35 | 	echo.  changes    to make an overview over all changed/added/deprecated items
 36 | 	echo.  xml        to make Docutils-native XML files
 37 | 	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
 38 | 	echo.  linkcheck  to check all external links for integrity
 39 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
 40 | 	goto end
 41 | )
 42 | 
 43 | if "%1" == "clean" (
 44 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
 45 | 	del /q /s %BUILDDIR%\*
 46 | 	goto end
 47 | )
 48 | 
 49 | 
 50 | %SPHINXBUILD% 2> nul
 51 | if errorlevel 9009 (
 52 | 	echo.
 53 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
 54 | 	echo.installed, then set the SPHINXBUILD environment variable to point
 55 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
 56 | 	echo.may add the Sphinx directory to PATH.
 57 | 	echo.
 58 | 	echo.If you don't have Sphinx installed, grab it from
 59 | 	echo.http://sphinx-doc.org/
 60 | 	exit /b 1
 61 | )
 62 | 
 63 | if "%1" == "html" (
 64 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
 65 | 	if errorlevel 1 exit /b 1
 66 | 	echo.
 67 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
 68 | 	goto end
 69 | )
 70 | 
 71 | if "%1" == "dirhtml" (
 72 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
 73 | 	if errorlevel 1 exit /b 1
 74 | 	echo.
 75 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
 76 | 	goto end
 77 | )
 78 | 
 79 | if "%1" == "singlehtml" (
 80 | 	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
 81 | 	if errorlevel 1 exit /b 1
 82 | 	echo.
 83 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
 84 | 	goto end
 85 | )
 86 | 
 87 | if "%1" == "pickle" (
 88 | 	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
 89 | 	if errorlevel 1 exit /b 1
 90 | 	echo.
 91 | 	echo.Build finished; now you can process the pickle files.
 92 | 	goto end
 93 | )
 94 | 
 95 | if "%1" == "json" (
 96 | 	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
 97 | 	if errorlevel 1 exit /b 1
 98 | 	echo.
 99 | 	echo.Build finished; now you can process the JSON files.
100 | 	goto end
101 | )
102 | 
103 | if "%1" == "htmlhelp" (
104 | 	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
105 | 	if errorlevel 1 exit /b 1
106 | 	echo.
107 | 	echo.Build finished; now you can run HTML Help Workshop with the ^
108 | .hhp project file in %BUILDDIR%/htmlhelp.
109 | 	goto end
110 | )
111 | 
112 | if "%1" == "qthelp" (
113 | 	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
114 | 	if errorlevel 1 exit /b 1
115 | 	echo.
116 | 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
117 | .qhcp project file in %BUILDDIR%/qthelp, like this:
118 | 	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pypdfocr.qhcp
119 | 	echo.To view the help file:
120 | 	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pypdfocr.qhc
121 | 	goto end
122 | )
123 | 
124 | if "%1" == "devhelp" (
125 | 	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
126 | 	if errorlevel 1 exit /b 1
127 | 	echo.
128 | 	echo.Build finished.
129 | 	goto end
130 | )
131 | 
132 | if "%1" == "epub" (
133 | 	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
134 | 	if errorlevel 1 exit /b 1
135 | 	echo.
136 | 	echo.Build finished. The epub file is in %BUILDDIR%/epub.
137 | 	goto end
138 | )
139 | 
140 | if "%1" == "latex" (
141 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
142 | 	if errorlevel 1 exit /b 1
143 | 	echo.
144 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
145 | 	goto end
146 | )
147 | 
148 | if "%1" == "latexpdf" (
149 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
150 | 	cd %BUILDDIR%/latex
151 | 	make all-pdf
152 | 	cd %~dp0
153 | 	echo.
154 | 	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
155 | 	goto end
156 | )
157 | 
158 | if "%1" == "latexpdfja" (
159 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
160 | 	cd %BUILDDIR%/latex
161 | 	make all-pdf-ja
162 | 	cd %~dp0
163 | 	echo.
164 | 	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
165 | 	goto end
166 | )
167 | 
168 | if "%1" == "text" (
169 | 	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
170 | 	if errorlevel 1 exit /b 1
171 | 	echo.
172 | 	echo.Build finished. The text files are in %BUILDDIR%/text.
173 | 	goto end
174 | )
175 | 
176 | if "%1" == "man" (
177 | 	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
178 | 	if errorlevel 1 exit /b 1
179 | 	echo.
180 | 	echo.Build finished. The manual pages are in %BUILDDIR%/man.
181 | 	goto end
182 | )
183 | 
184 | if "%1" == "texinfo" (
185 | 	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
186 | 	if errorlevel 1 exit /b 1
187 | 	echo.
188 | 	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
189 | 	goto end
190 | )
191 | 
192 | if "%1" == "gettext" (
193 | 	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
194 | 	if errorlevel 1 exit /b 1
195 | 	echo.
196 | 	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
197 | 	goto end
198 | )
199 | 
200 | if "%1" == "changes" (
201 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
202 | 	if errorlevel 1 exit /b 1
203 | 	echo.
204 | 	echo.The overview file is in %BUILDDIR%/changes.
205 | 	goto end
206 | )
207 | 
208 | if "%1" == "linkcheck" (
209 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
210 | 	if errorlevel 1 exit /b 1
211 | 	echo.
212 | 	echo.Link check complete; look for any errors in the above output ^
213 | or in %BUILDDIR%/linkcheck/output.txt.
214 | 	goto end
215 | )
216 | 
217 | if "%1" == "doctest" (
218 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
219 | 	if errorlevel 1 exit /b 1
220 | 	echo.
221 | 	echo.Testing of doctests in the sources finished, look at the ^
222 | results in %BUILDDIR%/doctest/output.txt.
223 | 	goto end
224 | )
225 | 
226 | if "%1" == "xml" (
227 | 	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
228 | 	if errorlevel 1 exit /b 1
229 | 	echo.
230 | 	echo.Build finished. The XML files are in %BUILDDIR%/xml.
231 | 	goto end
232 | )
233 | 
234 | if "%1" == "pseudoxml" (
235 | 	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
236 | 	if errorlevel 1 exit /b 1
237 | 	echo.
238 | 	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
239 | 	goto end
240 | )
241 | 
242 | :end
243 | 


--------------------------------------------------------------------------------
/docs/pypdfocr.rst:
--------------------------------------------------------------------------------
  1 | pypdfocr package
  2 | ================
  3 | 
  4 | Submodules
  5 | ----------
  6 | 
  7 | pypdfocr.pypdfocr module
  8 | ------------------------
  9 | 
 10 | .. automodule:: pypdfocr.pypdfocr
 11 |     :members:
 12 |     :undoc-members:
 13 |     :show-inheritance:
 14 |     :private-members:
 15 | 
 16 | pypdfocr.pypdfocr_gs module
 17 | ---------------------------
 18 | 
 19 | .. automodule:: pypdfocr.pypdfocr_gs
 20 |     :members:
 21 |     :undoc-members:
 22 |     :show-inheritance:
 23 |     :private-members:
 24 | 
 25 | pypdfocr.pypdfocr_pdf module
 26 | ----------------------------
 27 | 
 28 | .. automodule:: pypdfocr.pypdfocr_pdf
 29 |     :members:
 30 |     :undoc-members:
 31 |     :show-inheritance:
 32 |     :private-members:
 33 | 
 34 | pypdfocr.pypdfocr_pdffiler module
 35 | ---------------------------------
 36 | 
 37 | .. automodule:: pypdfocr.pypdfocr_pdffiler
 38 |     :members:
 39 |     :undoc-members:
 40 |     :show-inheritance:
 41 |     :private-members:
 42 | 
 43 | pypdfocr.pypdfocr_tesseract module
 44 | ----------------------------------
 45 | 
 46 | .. automodule:: pypdfocr.pypdfocr_tesseract
 47 |     :members:
 48 |     :undoc-members:
 49 |     :show-inheritance:
 50 |     :private-members:
 51 | 
 52 | pypdfocr.pypdfocr_watcher module
 53 | --------------------------------
 54 | 
 55 | .. automodule:: pypdfocr.pypdfocr_watcher
 56 |     :members:
 57 |     :undoc-members:
 58 |     :show-inheritance:
 59 |     :private-members:
 60 | 
 61 | pypdfocr.pypdfocr_preprocess module
 62 | -----------------------------------
 63 | 
 64 | .. automodule:: pypdfocr.pypdfocr_preprocess
 65 |     :members:
 66 |     :undoc-members:
 67 |     :show-inheritance:
 68 |     :private-members:
 69 | 
 70 | pypdfocr.pypdfocr_filer module
 71 | --------------------------------
 72 | 
 73 | .. automodule:: pypdfocr.pypdfocr_filer
 74 |     :members:
 75 |     :undoc-members:
 76 |     :show-inheritance:
 77 |     :private-members:
 78 | 
 79 | pypdfocr.pypdfocr_filer_dirs module
 80 | ------------------------------------
 81 | 
 82 | .. automodule:: pypdfocr.pypdfocr_filer_dirs
 83 |     :members:
 84 |     :undoc-members:
 85 |     :show-inheritance:
 86 |     :private-members:
 87 | 
 88 | pypdfocr.pypdfocr_filer_evernote module
 89 | ----------------------------------------
 90 | 
 91 | .. automodule:: pypdfocr.pypdfocr_filer_evernote
 92 |     :members:
 93 |     :undoc-members:
 94 |     :show-inheritance:
 95 |     :private-members:
 96 | 
 97 |     .. automethod:: _check_and_make_notebook(self,notebook_name)
 98 | 
 99 | Module contents
100 | ---------------
101 | 
102 | .. automodule:: pypdfocr
103 |     :members:
104 |     :undoc-members:
105 |     :show-inheritance:
106 |     :private-members:
107 | 


--------------------------------------------------------------------------------
/fabfile.py:
--------------------------------------------------------------------------------
 1 | from fabric.api import *
 2 | import os
 3 |  
 4 |   
 5 | def build_windows_dist():
 6 |     if os.name == 'nt':
 7 |         # Call the pyinstaller
 8 |         local("python ../pyinstaller/pyinstaller.py pypdfocr_windows.spec --onefile")
 9 | 
10 | 
def run_tests():
    """Run the test suite from inside the ``test`` directory.

    Three shell commands are issued in order: regenerate ``runtests.py``
    via ``py.test --genscript``, run ``py.test`` with terminal and HTML
    coverage reports, then run the ``coveralls`` command.
    """
    with lcd("test"):
        # Regenerate the test script
        local("py.test --genscript=runtests.py")
        # Coverage run; output is not captured
        local("py.test --cov-config .coveragerc --cov=pypdfocr --cov-report=term --cov-report=html", capture=False)
        local("coveralls")
21 | 
22 | 
def push_docs():
    """ Build the sphinx docs from develop
        And push it to gh-pages

        Steps, in order:

        #. Check out and pull ``gh-pages`` in the local docs checkout
        #. Regenerate ``CHANGES_RECENT.rst`` from the head and the last line
           of ``CHANGES.rst``
        #. Run ``make clean`` and ``make html`` in ``docs/`` and copy the
           HTML coverage report next to the built docs
        #. Commit and push the docs checkout
    """
    # NOTE(review): developer-specific absolute path to the gh-pages checkout
    githubpages = "/Users/virantha/dev/githubdocs/pypdfocr"
    with lcd(githubpages):
        local("git checkout gh-pages")
        local("git pull origin gh-pages")
    local("head CHANGES.rst > CHANGES_RECENT.rst")
    local("tail -n 1 CHANGES.rst >> CHANGES_RECENT.rst")
    with lcd("docs"):
        # Report the directory that is actually used below instead of a
        # hard-coded (and different) path
        print("Running sphinx in docs/ and building to %s" % githubpages)
        local("make clean")
        local("make html")
        local("cp -R ../test/htmlcov %s/html/testing" % githubpages)
    with lcd(githubpages):
        local("git add .")
        local('git commit -am "doc update"')
        local('git push origin gh-pages')
44 | 
45 | 


--------------------------------------------------------------------------------
/pypdfocr.spec:
--------------------------------------------------------------------------------
 1 | # -*- mode: python -*-
 2 | a = Analysis(['pypdfocr/pypdfocr.py'],
 3 |              pathex=['/Users/virantha/dev/ocr'],
 4 |              hiddenimports=[],
 5 |              hookspath=None)
 6 | pyz = PYZ(a.pure)
 7 | exe = EXE(pyz,
 8 |           a.scripts,
 9 |           exclude_binaries=1,
10 |           name=os.path.join('build/pyi.darwin/pypdfocr', 'pypdfocr'),
11 |           debug=False,
12 |           strip=None,
13 |           upx=True,
14 |           console=True )
15 | coll = COLLECT(exe,
16 |                a.binaries,
17 |                a.zipfiles,
18 |                a.datas,
19 |                strip=None,
20 |                upx=True,
21 |                name=os.path.join('dist', 'pypdfocr'))
22 | 


--------------------------------------------------------------------------------
/pypdfocr/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/pypdfocr/__init__.py


--------------------------------------------------------------------------------
/pypdfocr/pypdfocr.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2.7
  2 | # Copyright 2013 Virantha Ekanayake All Rights Reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #    http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | import smtplib
 17 | import argparse
 18 | import sys, os, traceback, time
 19 | import logging
 20 | import shutil, glob
 21 | import itertools
 22 | from functools import wraps
 23 | 
 24 | from version import __version__
 25 | from PIL import Image
 26 | import yaml
 27 | 
 28 | import multiprocessing
 29 | # Replace the Popen routine to allow win32 pyinstaller to build
 30 | from multiprocessing import forking
 31 | from pypdfocr_multiprocessing import _Popen
 32 | forking.Popen = _Popen
 33 | 
 34 | from pypdfocr_pdf import PyPdf
 35 | from pypdfocr_tesseract import PyTesseract
 36 | from pypdfocr_gs import PyGs
 37 | from pypdfocr_watcher import PyPdfWatcher
 38 | from pypdfocr_pdffiler import PyPdfFiler
 39 | from pypdfocr_filer_dirs import PyFilerDirs
 40 | from pypdfocr_filer_evernote import PyFilerEvernote
 41 | from pypdfocr_preprocess import PyPreprocess
 42 | 
 43 | def error(text):
 44 |     print("ERROR: %s" % text)
 45 |     sys.exit(-1)
 46 | 
 47 | # decorator to retry multiple times
 48 | def retry(count=5, exc_type = Exception):
 49 |     def decorator(func):
 50 |         @wraps(func)
 51 |         def result(*args, **kwargs):
 52 |             for _ in range(count):
 53 |                 try:
 54 |                     return func(*args, **kwargs)
 55 |                 except exc_type:
 56 |                     pass
 57 |                 raise
 58 |         return result
 59 |     return decorator
 60 | 
 61 | @retry(count=6, exc_type=IOError)
 62 | def open_file_with_timeout(parser, arg):
 63 |     f = open(arg, 'r')
 64 |     return f
 65 | 
 66 | """
 67 |     Make scanned PDFs searchable using Tesseract-OCR and autofile them
 68 | .. automodule:: pypdfocr
 69 |     :private-members:
 70 | """
 71 | 
 72 | class PyPDFOCR(object):
 73 |     """
 74 |         The main clas.  Performs the following functions:
 75 | 
 76 |         * Parses command line options
 77 |         * Optionally just watches a directory for new PDF's to OCR; once a file appears, it does the next step
 78 |         * Runs a single file conversion:
 79 |             * Runs ghostscript to get tiff/jpg
 80 |             * Runs Tesseract-OCR to do the actual OCR
 81 |             * Takes the HOCR from Tesseract and creates a new PDF with the text overlay
 82 |         * Files the OCR'ed file in the proper place if specified
 83 |         * Files the original file if specified
 84 |         * 
 85 |     """
 86 | 
 87 |     def __init__ (self):
 88 |         """ Initializes the GhostScript, Tesseract, and PDF helper classes.
 89 |         """
 90 |         self.config = {}
 91 | 
 92 |     def _get_config_file(self, config_file):
 93 |         """
 94 |            Read in the yaml config file
 95 | 
 96 |            :param config_file: Configuration file (YAML format)
 97 |            :type config_file: file
 98 |            :returns: dict of yaml file
 99 |            :rtype: dict
100 |         """
101 |         with config_file:
102 |             myconfig = yaml.load(config_file)
103 |         return myconfig
104 | 
105 | 
106 | 
107 |     def get_options(self, argv):
108 |         """
109 |             Parse the command-line options and set the following object properties:
110 | 
111 |             :param argv: usually just sys.argv[1:]
112 |             :returns: Nothing
113 | 
114 |             :ivar debug: Enable logging debug statements
115 |             :ivar verbose: Enable verbose logging
116 |             :ivar enable_filing: Whether to enable post-OCR filing of PDFs
117 |             :ivar pdf_filename: Filename for single conversion mode
118 |             :ivar watch_dir: Directory to watch for files to convert
119 |             :ivar config: Dict of the config file
120 |             :ivar watch: Whether folder watching mode is turned on
121 |             :ivar enable_evernote: Enable filing to evernote
122 | 
123 |         """
124 |         p = argparse.ArgumentParser(
125 |                 description = "Convert scanned PDFs into their OCR equivalent.  Depends on GhostScript and Tesseract-OCR being installed.",
126 |                 epilog = "PyPDFOCR version %s (Copyright 2013 Virantha Ekanayake)" % __version__,
127 |                 )
128 | 
129 |         p.add_argument('-d', '--debug', action='store_true',
130 |             default=False, dest='debug', help='Turn on debugging')
131 | 
132 |         p.add_argument('-v', '--verbose', action='store_true',
133 |             default=False, dest='verbose', help='Turn on verbose mode')
134 | 
135 |         p.add_argument('-m', '--mail', action='store_true',
136 |             default=False, dest='mail', help='Send email after conversion')
137 | 
138 |         p.add_argument('-l', '--lang',
139 |             default='eng', dest='lang', help='Language(default eng)')
140 | 
141 | 
142 |         p.add_argument('--preprocess', action='store_true',
143 |                 default=False, dest='preprocess', help='Enable preprocessing.  Not really useful now with improved Tesseract 3.04+')
144 |         
145 |         p.add_argument('--skip-preprocess', action='store_true',
146 |                 default=False, dest='skip_preprocess', help='DEPRECATED: always skips now.')
147 | 
148 |         #---------
149 |         # Single or watch mode
150 |         #--------
151 |         single_or_watch_group = p.add_mutually_exclusive_group(required=True)
152 |         # Positional argument for single file conversion
153 |         single_or_watch_group.add_argument("pdf_filename", nargs="?", help="Scanned pdf file to OCR")
154 |         # Watch directory for watch mode
155 |         single_or_watch_group.add_argument('-w', '--watch', 
156 |              dest='watch_dir', help='Watch given directory and run ocr automatically until terminated')
157 | 
158 |         #-----------
159 |         # Filing options
160 |         #----------
161 |         filing_group = p.add_argument_group(title="Filing optinos")
162 |         filing_group.add_argument('-f', '--file', action='store_true',
163 |             default=False, dest='enable_filing', help='Enable filing of converted PDFs')
164 |         #filing_group.add_argument('-c', '--config', type = argparse.FileType('r'),
165 |         filing_group.add_argument('-c', '--config', type = lambda x: open_file_with_timeout(p,x),
166 |              dest='configfile', help='Configuration file for defaults and PDF filing')
167 |         filing_group.add_argument('-e', '--evernote', action='store_true',
168 |             default=False, dest='enable_evernote', help='Enable filing to Evernote')
169 |         filing_group.add_argument('-n', action='store_true',
170 |             default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder')
171 | 
172 | 
173 |         # Add flow option to single mode extract_images,preprocess,ocr,write
174 | 
175 |         args = p.parse_args(argv)
176 | 
177 |         self.debug = args.debug
178 |         self.verbose = args.verbose
179 |         self.pdf_filename = args.pdf_filename
180 |         self.lang = args.lang
181 |         self.watch_dir = args.watch_dir
182 |         self.enable_email = args.mail
183 |         self.match_using_filename = args.match_using_filename
184 | 
185 | 
186 |         # Deprecating skip_preprocess to make skipping the default (always true). Tesseract 3.04 is so much better now
187 |         # at handling non-ideal inputs and lines
188 |         if args.skip_preprocess:
189 |             print("Warning: --skip_preprocess is not needed anymore (defaults to skipping preprocessing).  If you want to enable preprocessing, use the new --preprocess option")
190 |         self.skip_preprocess = True
191 | 
192 |         if args.preprocess:
193 |             self.skip_preprocess = False
194 | 
195 |         if self.debug:
196 |             logging.basicConfig(level=logging.DEBUG, format='%(message)s')
197 | 
198 |         if self.verbose:
199 |             logging.basicConfig(level=logging.INFO, format='%(message)s')
200 | 
201 |         # Parse configuration file (YAML) if specified
202 |         if args.configfile:
203 |             self.config = self._get_config_file(args.configfile)
204 |             logging.debug("Read in configuration file")
205 |             logging.debug(self.config)
206 | 
207 |         if args.enable_evernote:
208 |             self.enable_evernote = True
209 |         else:
210 |             self.enable_evernote = False
211 | 
212 |         if args.enable_filing or args.enable_evernote:
213 |             self.enable_filing = True
214 |             if not args.configfile:
215 |                 p.error("Please specify a configuration file(CONFIGFILE) to enable filing")
216 |         else:
217 |             self.enable_filing = False
218 | 
219 |         self.watch = False
220 | 
221 |         if args.watch_dir:
222 |             logging.debug("Starting to watch")
223 |             self.watch = True
224 | 
225 |         if self.enable_email:
226 |             if not args.configfile:
227 |                 p.error("Please specify a configuration file(CONFIGFILE) to enable email")
228 | 
229 |     def _clean_up_files(self, files):
230 |         """
231 |             Helper function to delete files
232 |             :param files: List of files to delete
233 |             :type files: list
234 |             :returns: None
235 |         """
236 |         for f in files:
237 |             try:
238 |                 os.remove(f)
239 |             except:
240 |                 logging.debug("Error removing file %s .... continuing" % f)
241 | 
242 |             
243 | 
244 |     def _setup_filing(self):
245 |         """
246 |             Instance the proper PyFiler object (either
247 |             :class:`pypdfocr.pypdfocr_filer_dirs.PyFilerDirs` or
248 |             :class:`pypdfocr.pypdfocr_filer_evernote.PyFilerEvernote`)
249 | 
250 |             TODO: Make this more generic to allow third-party plugin filing objects
251 | 
252 |             :ivar filer: :class:`pypdfocr.pypdfocr_filer.PyFiler` PyFiler subclass object that is instantiated
253 |             :ivar pdf_filer: :class:`pypdfocr.pypdfocr_pdffiler.PyPdfFiler` object to help with PDF reading
254 |             :returns: Nothing
255 | 
256 |         """
257 |         # Look at self.config and create a self.pdf_filer object
258 | 
259 |         # --------------------------------------------------
260 |         # Some sanity checks
261 |         # --------------------------------------------------
262 |         assert(self.config and self.enable_filing)
263 |         for required in ['target_folder', 'default_folder']:
264 |             if not required in self.config:
265 |                 error ("%s must be specified in config file" % required)
266 |             else:
267 |                 # Make sure these required folders are in abspath format
268 |                 self.config[required] = os.path.abspath(self.config[required])
269 |         if 'original_move_folder' in self.config:
270 |             # User wants to move the original after filing
271 |             orig = 'original_move_folder'
272 |             self.config[orig] = os.path.abspath(self.config[orig])
273 |             if not os.path.exists(self.config[orig]):
274 |                 os.makedirs(self.config[orig])
275 |             original_move_folder = self.config[orig]
276 |         else:
277 |             original_move_folder = None
278 |         # --------------------------------------------------
279 |         # Start the filing object
280 |         # --------------------------------------------------
281 |         if self.enable_evernote:
282 |             self.filer = PyFilerEvernote(self.config['evernote_developer_token'])
283 |         else:
284 |             self.filer = PyFilerDirs()
285 |             
286 |         self.filer.target_folder = self.config['target_folder']
287 |         self.filer.default_folder = self.config['default_folder']
288 |         self.filer.original_move_folder = original_move_folder
289 | 
290 |         self.pdf_filer = PyPdfFiler(self.filer)
291 |         if self.match_using_filename:
292 |             print("Matching using filename as a fallback to pdf contents")
293 |             self.pdf_filer.file_using_filename = True
294 | 
295 |         # ------------------------------
296 |         # Add all the folder names with associated keywords
297 |         # to the filer object
298 |         # ------------------------------
299 |         keyword_count = 0
300 |         folder_count = 0
301 |         if 'folders' in self.config:
302 |             for folder, keywords in self.config['folders'].items():
303 |                 folder_count +=1
304 |                 keyword_count += len(keywords)
305 |                 # Make sure keywords are lower-cased before adding
306 |                 keywords = [str(x).lower() for x in keywords]
307 |                 self.filer.add_folder_target(folder, keywords)
308 | 
309 |         print ("Filing of PDFs is enabled")
310 |         print (" - %d target filing folders" % (folder_count))
311 |         print (" - %d keywords" % (keyword_count))
312 | 
313 |     
314 |     def _setup_external_tools(self):
315 |         """
316 |             Instantiate the external tool wrappers with their config dicts
317 |         """
318 | 
319 |         self.gs = PyGs(self.config.get('ghostscript',{}))
320 |         self.ts = PyTesseract(self.config.get('tesseract',{}))
321 |         self.pdf = PyPdf(self.gs)
322 |         self.preprocess = PyPreprocess(self.config.get('preprocess', {}))
323 | 
324 |         return
325 | 
326 |     def run_conversion(self, pdf_filename):
327 |         """
328 |             Does the following:
329 |             
330 |             - Convert the PDF using GhostScript to TIFF and JPG
331 |             - Run Tesseract on the TIFF to extract the text into HOCR (html)
332 |             - Use PDF generator to overlay the text on the JPG and output a new PDF
333 |             - Clean up temporary image files
334 |             
335 |             :param pdf_filename: Scanned PDF
336 |             :type pdf_filename: string
337 |             :returns: OCR'ed PDF
338 |             :rtype: filename string
339 |         """
340 |         print ("Starting conversion of %s" % pdf_filename)
341 |         try:
342 |             # Make the images for Tesseract
343 |             img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename)
344 | 
345 |             fns = glob.glob(glob_img_filename)
346 |         
347 |         except Exception:
348 |             raise
349 | 
350 |         try:
351 |             # Preprocess
352 |             if not self.skip_preprocess:
353 |                 preprocess_imagefilenames = self.preprocess.preprocess(fns)
354 |             else:
355 |                 logging.info("Skipping preprocess step")
356 |                 preprocess_imagefilenames = fns
357 |             # Run teserract
358 |             self.ts.lang = self.lang
359 |             hocr_filenames = self.ts.make_hocr_from_pnms(preprocess_imagefilenames)
360 |             
361 |             # Generate new pdf with overlayed text
362 |             #ocr_pdf_filename = self.pdf.overlay_hocr(tiff_dpi, hocr_filename, pdf_filename)
363 |             ocr_pdf_filename = self.pdf.overlay_hocr_pages(img_dpi, hocr_filenames, pdf_filename)
364 | 
365 |         finally:
366 |             # Clean up the files
367 |             time.sleep(1)
368 |             if not self.debug:
369 |                 # Need to clean up the original image files before preprocessing
370 |                 if locals().has_key("fns"): # Have to check if this was set before exception raised
371 |                     logging.info("Cleaning up %s" % fns)
372 |                     self._clean_up_files(fns)
373 | 
374 |                 if locals().has_key("preprocess_imagefilenames"):  # Have to check if this was set before exception raised
375 |                     logging.info("Cleaning up %s" % preprocess_imagefilenames)
376 |                     self._clean_up_files(preprocess_imagefilenames) # splat the hocr_filenames as it is a list of pairs
377 |                     for ext in [".hocr", ".html", ".txt"]:
378 |                         fns_to_remove = [os.path.splitext(fn)[0]+ext for fn in preprocess_imagefilenames]
379 |                         logging.info("Cleaning up %s" % fns_to_remove)
380 |                         self._clean_up_files(fns_to_remove) # splat the hocr_filenames as it is a list of pairs
381 |                     # clean up the hocr input (jpg) and output (html) files
382 |                     #self._clean_up_files(itertools.chain(*hocr_filenames)) # splat the hocr_filenames as it is a list of pairs
383 |                     # Seems like newer tessearct > 3.03 is now creating .txt files with the OCR text?/?
384 |                     #self._clean_up_files([x[1].replace(".hocr", ".txt") for x in hocr_filenames])
385 | 
386 | 
387 |         print ("Completed conversion successfully to %s" % ocr_pdf_filename)
388 |         return ocr_pdf_filename
389 | 
390 |     def file_converted_file(self, ocr_pdffilename, original_pdffilename):
391 |         """ move the converted filename to its destiantion directory.  Optionally also
392 |             moves the original PDF.
393 | 
394 |             :param ocr_pdffilename: Converted PDF file
395 |             :type ocr_pdffilename: filename string
396 |             :param original_pdffilename: Original scanned PDF file
397 |             :type original_pdffilename: filename string
398 |             :returns: Target folder name
399 |             "rtype: string
400 |         """
401 |         filed_path = self.pdf_filer.move_to_matching_folder(ocr_pdffilename)  
402 |         print("Filed %s to %s as %s" % (ocr_pdffilename, os.path.dirname(filed_path), os.path.basename(filed_path)))
403 | 
404 |         tgt_path = self.pdf_filer.file_original(original_pdffilename)
405 |         if tgt_path != original_pdffilename:
406 |             print("Filed original file %s to %s as %s" % (original_pdffilename, os.path.dirname(tgt_path), os.path.basename(tgt_path)))
407 |         return os.path.dirname(filed_path)
408 | 
409 |   
410 |     def _send_email(self, infilename, outfilename, filing ):
411 |         """
412 |             Send email using smtp
413 |         """
414 |         print("Sending email status")
415 |         from_addr = self.config["mail_from_addr"]
416 |         to_addr_list = self.config["mail_to_list"]
417 |         smtpserver = self.config["mail_smtp_server"]
418 |         login = self.config["mail_smtp_login"]
419 |         password = self.config["mail_smtp_password"]
420 | 
421 |         subject = "PyPDFOCR converted: %s" % (os.path.basename(outfilename))
422 |         header  = 'From: %s\n' % login
423 |         header += 'To: %s\n' % ','.join(to_addr_list)
424 |         header += 'Subject: %s\n\n' % subject
425 |         message = """
426 |         PyPDFOCR Conversion:
427 |         --------------------
428 |         Original file: %s
429 |         Converted file: %s
430 |         Filing: %s
431 |         """ % (infilename, outfilename, filing)
432 |         message = header + message
433 |       
434 |         server = smtplib.SMTP(smtpserver)
435 |         server.starttls()
436 |         server.login(login,password)
437 |         problems = server.sendmail(from_addr, to_addr_list, message)
438 |         server.quit()
439 | 
440 |     def go(self, argv):
441 |         """ 
442 |             The main entry point into PyPDFOCR
443 | 
444 |             #. Parses options
445 |             #. If filing is enabled, call :func:`_setup_filing`
446 |             #. If watch is enabled, start the watcher
447 |             #. :func:`run_conversion`
448 |             #. if filing is enabled, call :func:`file_converted_file`
449 |         """
450 |         # Read the command line options
451 |         self.get_options(argv)
452 | 
453 |         # Setup tesseract and ghostscript
454 |         self._setup_external_tools()
455 | 
456 |         # Setup the pdf filing if enabled
457 |         if self.enable_filing:
458 |             self._setup_filing()
459 | 
460 |         # Do the actual conversion followed by optional filing and email
461 |         if self.watch:
462 |             while True:  # Make sure the watcher doesn't terminate
463 |                 try:
464 |                     py_watcher = PyPdfWatcher(self.watch_dir, self.config.get('watch'))
465 |                     for pdf_filename in py_watcher.start():
466 |                         self._convert_and_file_email(pdf_filename)
467 |                 except KeyboardInterrupt:
468 |                     break
469 |                 except Exception as e:
470 |                     print traceback.print_exc(e)
471 |                     py_watcher.stop()
472 |                     
473 |         else:
474 |             self._convert_and_file_email(self.pdf_filename)
475 | 
476 |     def _convert_and_file_email(self, pdf_filename):
477 |         """
478 |             Helper function to run the conversion, then do the optional filing, and optional emailing.
479 |         """
480 |         ocr_pdffilename = self.run_conversion(pdf_filename)
481 |         if self.enable_filing:
482 |             filing = self.file_converted_file(ocr_pdffilename, pdf_filename)
483 |         else:
484 |             filing = "None"
485 | 
486 |         if self.enable_email:
487 |             self._send_email(pdf_filename, ocr_pdffilename, filing)
488 | 
def main(): # pragma: no cover 
    """Script entry point: run :class:`PyPDFOCR` on the command-line arguments."""
    # Called first, before any other work, as in the original entry point
    multiprocessing.freeze_support()
    PyPDFOCR().go(sys.argv[1:])
493 | 
494 | if __name__ == '__main__':
495 |     main()
496 | 
497 | 
498 | 


--------------------------------------------------------------------------------
/pypdfocr/pypdfocr.spec:
--------------------------------------------------------------------------------
 1 | # -*- mode: python -*-
 2 | a = Analysis(['src\\pypdfocr.py'],
 3 |              pathex=['C:\\Users\\Virantha Ekanayake\\dev\\pypdfocr\\src'],
 4 | 		hiddenimports = [
 5 | 		    'reportlab.pdfbase._fontdata_enc_macexpert',
 6 | 		    'reportlab.pdfbase._fontdata_enc_macroman',
 7 | 		    'reportlab.pdfbase._fontdata_enc_pdfdoc',
 8 | 		    'reportlab.pdfbase._fontdata_enc_standard',
 9 | 		    'reportlab.pdfbase._fontdata_enc_symbol',
10 | 		    'reportlab.pdfbase._fontdata_enc_winansi',
11 | 		    'reportlab.pdfbase._fontdata_enc_zapfdingbats',
12 | 		    'reportlab.pdfbase._fontdata_widths_courier',
13 | 		    'reportlab.pdfbase._fontdata_widths_courierbold',
14 | 		    'reportlab.pdfbase._fontdata_widths_courierboldoblique',
15 | 		    'reportlab.pdfbase._fontdata_widths_courieroblique',
16 | 		    'reportlab.pdfbase._fontdata_widths_helvetica',
17 | 		    'reportlab.pdfbase._fontdata_widths_helveticabold',
18 | 		    'reportlab.pdfbase._fontdata_widths_helveticaboldoblique',
19 | 		    'reportlab.pdfbase._fontdata_widths_helveticaoblique',
20 | 		    'reportlab.pdfbase._fontdata_widths_symbol',
21 | 		    'reportlab.pdfbase._fontdata_widths_timesbold',
22 | 		    'reportlab.pdfbase._fontdata_widths_timesbolditalic',
23 | 		    'reportlab.pdfbase._fontdata_widths_timesitalic',
24 | 		    'reportlab.pdfbase._fontdata_widths_timesroman',
25 | 		    'reportlab.pdfbase._fontdata_widths_zapfdingbats'],
26 |              hookspath=None,
27 |              runtime_hooks=None)
28 | pyz = PYZ(a.pure)
29 | exe = EXE(pyz,
30 |           a.scripts,
31 |           a.binaries,
32 |           a.zipfiles,
33 |           a.datas,
34 |           name='pypdfocr.exe',
35 |           debug=False,
36 |           strip=None,
37 |           upx=True,
38 |           console=True )
39 | 


--------------------------------------------------------------------------------
/pypdfocr/pypdfocr_filer.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2013 Virantha Ekanayake All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #    http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | import abc
 15 | import os, logging
 16 | 
 17 | class PyFiler(object):
 18 |     """ Abstract base class for defining filing objects, whether you want to 
 19 |     save to a file-system/directory structure or to something like Evernote
 20 | 
 21 |     """
 22 |     __metaclass__ = abc.ABCMeta
 23 | 
 24 |     @abc.abstractmethod
    def move_to_matching_folder(self, filename):
        """ Move the file given by filename to the proper location.

            Abstract method: every concrete filer must implement it.
            You will need to use :py:attr:`target_folder` and :py:attr:`folder_targets`
            to figure out what the proper destination is.  If there is no matching location,
            then use :py:attr:`default_folder`

            :param filename: File to move
            :type filename: string
            :returns: Full path+filename of destination
            :rtype: string
        """
 36 | 
 37 |     @abc.abstractmethod
 38 |     def file_original(self, original_filename):
 39 |         """ Move the original file given by filename to the proper location.
 40 |             You will need to use :py:attr:`original_move_target`
 41 | 
 42 |             :param original_filename: File to move
 43 |             :type original_filename: string
 44 |             :returns: Full path+filename of destination(original_filename if not moved)
 45 |             :rtype: string
 46 |         """
 47 | 
 48 |     @abc.abstractmethod
 49 |     def add_folder_target(self, folder, keywords):
 50 |         """ Add a target folder for a list of keywords """
 51 | 
 52 |     def _get_unique_filename_by_appending_version_integer(self, tgtfilename):
 53 |         if os.path.exists(tgtfilename):
 54 |             logging.info("File %s already exists in target directory %s" % (os.path.basename(tgtfilename), os.path.dirname(tgtfilename)))
 55 |             # First, try appending a _v1 to it
 56 |             num = 1
 57 |             dr, fn, ext = self._split_filename_dir_filename_ext(tgtfilename)
 58 |             tgtfilename = os.path.join(dr, "%s_%d%s" % (fn, num, ext))
 59 |             while os.path.exists(tgtfilename):
 60 |                 # Add an incrementing integer to the end of the filename and Loop until we find a new filename
 61 |                 num += 1
 62 |                 tgtfilename = os.path.join(dr, "%s_%d%s" % (fn, num, ext))
 63 |                 logging.info("Trying %s" % tgtfilename)
 64 |             logging.info("Using name %s instead for copying to target directory %s" % (os.path.basename(tgtfilename),os.path.dirname(tgtfilename )))
 65 |         return tgtfilename
 66 | 
 67 |     def _split_filename_dir_filename_ext(self, filename):
 68 |         dr, fn = os.path.split(filename) # Get directory and filename
 69 | 
 70 |         # Silly me, forgot about the splitext function
 71 |         #fn_no_ext = fn.split('.')[0:-1] # Get the filename without ending extension
 72 |         #fn_no_ext = ''.join(fn_no_ext)
 73 |         #ext = fn.split('.')[-1]
 74 | 
 75 |         fn_no_ext, ext = os.path.splitext(fn)  # Get filename plus extension
 76 |         return dr, fn_no_ext, ext
 77 | 
 78 |     def get_target_folder(self):
 79 |         return self._target_folder
 80 |     def set_target_folder(self, target_folder):
 81 |         self._target_folder = target_folder
 82 | 
 83 |     def get_default_folder(self):
 84 |         return self._default_folder
 85 |     def set_default_folder(self, default_folder):
 86 |         self._default_folder = default_folder
 87 |     
 88 |     def get_original_move_folder(self):
 89 |         return self._original_move_folder
 90 |     def set_original_move_folder(self, original_move_folder):
 91 |         self._original_move_folder = original_move_folder
 92 | 
 93 |     def get_folder_targets(self):
 94 |         return self._folder_targets
 95 |     def set_folder_targets(self, folder_targets):
 96 |         self._folder_targets = folder_targets
 97 | 
 98 |     target_folder = property (get_target_folder, set_target_folder)
 99 |     default_folder = property (get_default_folder, set_default_folder)
100 |     original_move_folder = property(get_original_move_folder, set_original_move_folder)
101 |    
102 |     folder_targets = property(get_folder_targets, set_folder_targets)
103 |     """ Data structure for mapping a keyword to a folder target.  Usually just a dict, and new mappings
104 |         are added from :py:func:`add_folder_target` 
105 |     """
106 | 


--------------------------------------------------------------------------------
/pypdfocr/pypdfocr_filer_dirs.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright 2013 Virantha Ekanayake All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #    http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | import logging
16 | import os
17 | import shutil
18 | 
19 | from pypdfocr_filer import PyFiler
20 | 
21 | """
22 |     Implementation of a filer class 
23 |         -> Works on file system/directory structure
24 | """
class PyFilerDirs(PyFiler):
    """ Filer that moves files into directories on the local file system.

        ``target_folder`` is the root directory; matched files go to
        ``target_folder/<foldername>`` and unmatched ones to
        ``target_folder/default_folder``.  Originals are optionally moved to
        ``original_move_folder``.
    """

    def __init__(self):
        self.target_folder = None
        self.default_folder = None
        self.original_move_folder = None
        self.folder_targets = {}

    def add_folder_target(self, folder, keywords):
        """ Register ``keywords`` for ``folder``; a folder may only be registered once. """
        assert folder not in self.folder_targets, "Target folder already defined! (%s)" % (folder)
        self.folder_targets[folder] = keywords

    def _ensure_folder_exists(self, path):
        """ Create ``path`` (including parents) if it is not there yet. """
        if not os.path.exists(path):
            logging.debug("Making path %s" % path)
            os.makedirs(path)

    def file_original(self, original_filename):
        """ Move the original file into ``original_move_folder``.

            :param original_filename: File to move
            :type original_filename: string
            :returns: New path+filename, or ``original_filename`` unchanged when
                      no ``original_move_folder`` is configured
            :rtype: string
        """
        if not self.original_move_folder:
            logging.debug("Leaving original untouched")
            return original_filename

        tgt_path = self.original_move_folder
        # Create the destination on demand, just like move_to_matching_folder
        # does; otherwise shutil.move fails on a missing directory.
        self._ensure_folder_exists(tgt_path)

        logging.debug("Moving original %s to %s" % (original_filename, tgt_path))
        tgtfilename = os.path.join(tgt_path, os.path.basename(original_filename))
        tgtfilename = self._get_unique_filename_by_appending_version_integer(tgtfilename)

        shutil.move(original_filename, tgtfilename)
        return tgtfilename

    def move_to_matching_folder(self, filename, foldername):
        """ Move ``filename`` below ``target_folder``.

            :param filename: File to move
            :type filename: string
            :param foldername: Sub-folder to file into; a false value selects ``default_folder``
            :type foldername: string
            :returns: Full path+filename of destination
            :rtype: string
        """
        assert self.target_folder is not None
        assert self.default_folder is not None

        if not foldername:
            logging.info("[DEFAULT] %s --> %s" % (filename, self.default_folder))
            tgt_path = os.path.join(self.target_folder, self.default_folder)
        else:
            logging.info("[MATCH] %s --> %s" % (filename, foldername))
            tgt_path = os.path.join(self.target_folder, foldername)

        self._ensure_folder_exists(tgt_path)

        logging.debug("Moving %s to %s" % (filename, tgt_path))
        tgtfilename = os.path.join(tgt_path, os.path.basename(filename))
        # Never overwrite an existing file; pick a numbered name instead
        tgtfilename = self._get_unique_filename_by_appending_version_integer(tgtfilename)

        shutil.move(filename, tgtfilename)
        return tgtfilename
71 | 
72 | 
73 | 


--------------------------------------------------------------------------------
/pypdfocr/pypdfocr_filer_evernote.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Copyright 2013 Virantha Ekanayake All Rights Reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #    http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | import logging
 16 | import os
 17 | import shutil
 18 | import hashlib
 19 | import time
 20 | import sys
 21 | 
 22 | from pypdfocr_filer import PyFiler
 23 | 
 24 | import functools
 25 | 
 26 | from evernote.api.client import EvernoteClient
 27 | import evernote.edam.type.ttypes as Types
 28 | import evernote.edam.userstore.constants as UserStoreConstants
 29 | from evernote.edam.error.ttypes import EDAMUserException
 30 | from evernote.edam.error.ttypes import EDAMSystemException
 31 | from evernote.edam.error.ttypes import EDAMNotFoundException
 32 | from evernote.edam.error.ttypes import EDAMErrorCode
 33 | 
 34 | 
 35 | """
 36 |     Implementation of a filer class 
 37 |         -> Files documents to Evernote notebooks (each document becomes a new note)
 38 | """
 39 | class en_handle(object):
 40 |     """ Generic exception handler for Evernote actions
 41 |     """
 42 |     def __init__(self, f):
 43 |         # f is the method being decorated, so save it so we can call it later!
 44 |         self.f = f
 45 |         functools.update_wrapper(self, f)
 46 | 
 47 |     def __get__(self, instance, owner):
 48 |         # Save a ptr to the object being decorated
 49 |         self.cls = owner
 50 |         self.obj = instance
 51 |         return self.__call__
 52 | 
 53 |     def __call__(self, *args, **kwargs):
 54 |         # The actual meat of the decorator
 55 | 
 56 |         # Call the original method being decorated
 57 |         retryCount = 3
 58 |         retry_auth = False
 59 |         msg = "EVERNOTE ERROR: %s"
 60 |         r = None
 61 |         while retryCount > 0:
 62 |             try: 
 63 |                 retryCount -= 1
 64 |                 if retry_auth:
 65 |                     logging.debug("Retrying")
 66 |                     self.obj._connect_to_evernote(self.obj.dictUserInfo)
 67 |                 retry_auth = False
 68 |                 logging.debug("executing user function")
 69 |                 r = self.f.__call__(self.obj, *args, **kwargs)
 70 |                 break
 71 |             except EDAMUserException as e:
 72 |                 err = e.errorCode
 73 |                 c = EDAMErrorCode
 74 |                 if err == c.AUTH_EXPIRED or err == c.DATA_REQUIRED:
 75 |                     logging.debug(msg % "Authorization expired, retrying...")
 76 |                     retry_auth = True
 77 |                     time.sleep(3)
 78 |                 else:
 79 |                     logging.debug(msg % ("Unhandled error %s:%s" % (c._VALUES_TO_NAMES[err], e.parameter)))
 80 |         return r
 81 | 
 82 | 
 83 | 
 84 | class PyFilerEvernote(PyFiler):
 85 |     
 86 |     def get_target_folder(self):
 87 |         return self._target_folder
 88 |     def set_target_folder (self, target_folder):
 89 |         """ Override this to make sure we only have the basename"""
 90 |         print("Setting target_folder %s" % target_folder)
 91 |         if target_folder:
 92 |             self._target_folder = os.path.basename(target_folder)
 93 |         else:
 94 |             self._target_folder = target_folder
 95 | 
 96 |     target_folder = property(get_target_folder, set_target_folder)
 97 | 
 98 |     def get_default_folder (self):
 99 |         """ Override this to make sure we only have the basename"""
100 |         return self._default_folder
101 | 
102 |     def set_default_folder (self, default_folder):
103 |         """ Override this to make sure we only have the basename"""
104 |         if default_folder:
105 |             self._default_folder = os.path.basename(default_folder)
106 |         else:
107 |             self._default_folder = default_folder
108 | 
109 |     default_folder = property(get_default_folder, set_default_folder)
110 | 
111 |     def __init__(self, dev_token):
112 |         self.target_folder = None
113 |         self.default_folder = None
114 |         self.original_move_folder = None
115 |         self.folder_targets = {}
116 |         self.dictUserInfo = { 'dev_token': dev_token }
117 |         self._connect_to_evernote(self.dictUserInfo)
118 | 
119 |     def _connect_to_evernote(self, dictUserInfo):
120 |         """
121 |             Establish a connection to evernote and authenticate.
122 | 
123 |             :param dictUserInfo: Dict of user info like user/passwrod.  For now, just the dev token
124 |             :returns success: Return wheter connection succeeded
125 |             :rtype bool:
126 |         """
127 |         print("Authenticating to Evernote")
128 |         dev_token = dictUserInfo['dev_token']
129 |         logging.debug("Authenticating using token %s" % dev_token)
130 |         user = None
131 |         try:
132 |             self.client = EvernoteClient(token=dev_token, sandbox=False)
133 |             self.user_store = self.client.get_user_store()
134 |             user = self.user_store.getUser()
135 |         except EDAMUserException as e:
136 |             err = e.errorCode
137 |             print("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.parameter))
138 |         except EDAMSystemException as e:
139 |             err = e.errorCode
140 |             print("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.message))
141 |             sys.exit(-1)
142 | 
143 |         if user:
144 |             print("Authenticated to evernote as user %s" % user.username)
145 |         return True
146 | 
147 |     def add_folder_target(self, folder, keywords):
148 |         assert folder not in self.folder_targets, "Target folder already defined! (%s)" % (folder)
149 |         self.folder_targets[folder] = keywords
150 | 
151 |     def file_original(self, original_filename):
152 |         """ 
153 |             Just file it to the local file system (don't upload to evernote)
154 |         """
155 |         if not self.original_move_folder:
156 |             logging.debug("Leaving original untouched")
157 |             return original_filename
158 | 
159 |         tgt_path = self.original_move_folder
160 |         logging.debug("Moving original %s to %s" % (original_filename, tgt_path))
161 |         tgtfilename = os.path.join(tgt_path, os.path.basename(original_filename))
162 |         tgtfilename = self._get_unique_filename_by_appending_version_integer(tgtfilename)
163 | 
164 |         shutil.move(original_filename, tgtfilename)
165 |         return tgtfilename
166 | 
167 |     @en_handle
168 |     def _get_notebooks(self):
169 |         note_store = self.client.get_note_store()
170 |         notebooks = note_store.listNotebooks()
171 |         return {n.name:n for n in notebooks}
172 | 
173 |     @en_handle
174 |     def _create_notebook(self, notebook):
175 |         note_store = self.client.get_note_store()
176 |         return note_store.createNotebook(notebook)
177 | 
178 |     def _update_notebook(self, notebook):
179 |         note_store = self.client.get_note_store()
180 |         note_store.updateNotebook(notebook)
181 |         return
182 | 
183 |     @en_handle
184 |     def _check_and_make_notebook(self, notebook_name):
185 |         """
186 |             Weird.
187 |             :returns notebook: New or existing notebook object
188 |             :rtype Types.Notebook:
189 |         """
190 |         # Get the noteStore
191 |         #note_store = self.client.get_note_store()
192 |         #notebooks = note_store.listNotebooks()
193 |         #notebooks = {n.name:n for n in notebooks}
194 |         notebooks = self._get_notebooks()
195 |         if notebook_name in notebooks:
196 |             notebook = notebooks[notebook_name]
197 |             if notebook.stack != self.target_folder:
198 |                 notebook.stack = self.target_folder
199 |                 self._update_notebook(notebook)
200 |             return notebook
201 |         else:
202 |             # Need to create a new notebook
203 |             notebook = Types.Notebook()
204 |             notebook.name = notebook_name
205 |             notebook.stack = self.target_folder
206 |             notebook = self._create_notebook(notebook)
207 |             #notebook = note_store.createNotebook(notebook)
208 |             return notebook
209 | 
210 |     @en_handle
211 |     def _create_evernote_note(self, notebook, filename):
212 |         # Create the new note
213 |         note = Types.Note()
214 |         note.title = os.path.basename(filename)
215 |         note.notebookGuid = notebook.guid
216 |         note.content = '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">'
217 |         note.content += '<en-note>Uploaded by PyPDFOCR <br/>'
218 |        
219 | 
220 |         logging.debug("Loading PDF")
221 |         md5 = hashlib.md5()
222 |         with open(filename,'rb') as f: 
223 |             pdf_bytes = f.read()
224 | 
225 |         logging.debug("Calculating md5 checksum of pdf")
226 |         md5.update(pdf_bytes)
227 |         md5hash = md5.hexdigest()
228 | 
229 |         logging.debug("Uploading note")
230 |         
231 |         # Create the Data type for evernote that goes into a resource
232 |         pdf_data = Types.Data()
233 |         pdf_data.bodyHash = md5hash
234 |         pdf_data.size = len(pdf_bytes) 
235 |         pdf_data.body = pdf_bytes
236 | 
237 |         # Add a link in the evernote boy for this content
238 |         link = '<en-media type="application/pdf" hash="%s"/>' % md5hash
239 |         logging.debug(link)
240 |         note.content += link
241 |         note.content += '</en-note>'
242 |         
243 |         resource_list = []
244 |         pdf_resource = Types.Resource()
245 |         pdf_resource.data = pdf_data
246 |         pdf_resource.mime = "application/pdf"
247 |         # TODO: Enable filename
248 |         # Make a attributes for this resource
249 |         pdf_resource.attributes = Types.ResourceAttributes()
250 |         pdf_resource.attributes.fileName = os.path.basename(filename)
251 |         resource_list.append(pdf_resource)
252 | 
253 |         note.resources = resource_list
254 | 
255 |         return note
256 | 
257 |         
258 |     def move_to_matching_folder(self, filename, foldername):
259 |         """
260 |             Use the evernote API to create a new note:
261 | 
262 |             #. Make the notebook if it doesn't exist (:func:`_check_and_make_notebook`)
263 |             #. Create the note (:func:`_create_evernote_note`)
264 |             #. Upload note using API
265 | 
266 |         """
267 |         assert self.target_folder != None
268 |         assert self.default_folder != None
269 | 
270 |         if not foldername:
271 |             logging.info("[DEFAULT] %s --> %s" % (filename, self.default_folder))
272 |             foldername = self.default_folder
273 |         else:   
274 |             logging.info("[MATCH] %s --> %s" % (filename, foldername))
275 | 
276 |         # Check if the evernote notebook exists
277 |         print ("Checking for notebook named %s" % foldername)
278 |         notebook = self._check_and_make_notebook(foldername)
279 |         print("Uploading %s to %s" % (filename, foldername))
280 |         
281 |         note = self._create_evernote_note(notebook, filename)
282 | 
283 |         # Store the note in evernote
284 |         note_store = self.client.get_note_store()
285 |         note = note_store.createNote(note)
286 |         os.remove(filename)
287 | 
288 |         return "%s/%s" % (notebook.name, note.title)
289 | 
290 | 
if __name__ == '__main__': # pragma: no cover
    # Manual smoke test.  Usage: python pypdfocr_filer_evernote.py <evernote_dev_token>
    # PyFilerEvernote requires the developer token, so take it from the
    # command line instead of calling the constructor without arguments.
    if len(sys.argv) < 2:
        sys.exit("Usage: %s <evernote_dev_token>" % sys.argv[0])

    logging.basicConfig(level=logging.DEBUG, format='%(message)s')
    p = PyFilerEvernote(sys.argv[1])
    p.add_folder_target("auto", ['dmv'])
    p.target_folder = 'myuploads'
    p.default_folder = 'default'
    p.original_move_folder = None

    p.move_to_matching_folder('../dmv/dmv_ocr.pdf', 'auto')
301 | 


--------------------------------------------------------------------------------
/pypdfocr/pypdfocr_gs.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2.7
  2 | 
  3 | # Copyright 2013 Virantha Ekanayake All Rights Reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #    http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | 
 18 | 
 19 | """
 20 |     Wrap ghostscript calls.  Yes, this is ugly.
 21 | """
 22 | 
 23 | import subprocess
 24 | import sys, os
 25 | import logging
 26 | import glob
 27 | 
 28 | def error(text):
 29 |     print("ERROR: %s" % text)
 30 |     exit(-1)
 31 | 
 32 | class PyGs(object):
 33 |     """Class to wrap all the ghostscript calls"""
 34 | 
 35 |     def __init__(self, config):
 36 |         self.msgs = {
 37 |                 'GS_FAILED': 'Ghostscript execution failed',
 38 |                 'GS_MISSING_PDF': 'Cannot find specified pdf file',
 39 |                 'GS_OUTDATED': 'Your Ghostscript version is probably out of date.  Please upgrade to the latest version',
 40 |                 'GS_MISSING_BINARY': 'Could not find Ghostscript in the usual place; please specify it using your config file',
 41 |             }
 42 |         self.threads = config.get('threads',4)
 43 | 
 44 |         if "binary" in config:  # Override location of binary
 45 |             binary = config['binary']
 46 |             if os.name == 'nt':
 47 |                 binary = '"%s"' % binary
 48 |                 binary = binary.replace("\\", "\\\\")
 49 |             logging.info("Setting location for executable to %s" % (binary))
 50 |         else:
 51 |             if str(os.name) == 'nt':
 52 |                 win_binary = self._find_windows_gs()
 53 |                 binary = '"%s"' % win_binary
 54 |                 logging.info("Using Ghostscript: %s" % binary)
 55 |             else:
 56 |                 binary = "gs"
 57 |         self.binary = binary
 58 | 
 59 |         #self.tiff_dpi = 300
 60 |         self.output_dpi = 300
 61 |         self.greyscale = True
 62 |         # Tiff is used for the ocr, so just fix it at 300dpi
 63 |         #  The other formats will be used to create the final OCR'ed image, so determine
 64 |         #  the DPI by using pdfimages if available, o/w default to 200
 65 |         self.gs_options = {'tiff': ['tiff', ['-sDEVICE=tiff24nc','-r%(dpi)s' ]],
 66 |                             'jpg': ['jpg', ['-sDEVICE=jpeg','-dJPEGQ=75', '-r%(dpi)s']],
 67 |                             'jpggrey': ['jpg', ['-sDEVICE=jpeggray', '-dJPEGQ=75', '-r%(dpi)s']],
 68 |                             'png': ['png', ['-sDEVICE=png16m', '-r%(dpi)s']],
 69 |                             'pnggrey': ['png', ['-sDEVICE=pngmono', '-r%(dpi)s']],
 70 |                             'tifflzw': ['tiff', ['-sDEVICE=tifflzw', '-r%(dpi)s']],
 71 |                             'tiffg4': ['tiff', ['-sDEVICE=tiffg4', '-r%(dpi)s']],
 72 |                             'pnm': ['pnm', ['-sDEVICE=pnmraw', '-r%(dpi)s']],
 73 |                             'pgm': ['pgm', ['-sDEVICE=pgm', '-r%(dpi)s']],
 74 |                         }
 75 | 
 76 |     def _find_windows_gs(self):
 77 |         """
 78 |             Searches through the Windows program files directories to find Ghostscript.
 79 |             If it finds multiple versions, it does a naive sort for now to find the most
 80 |             recent.
 81 | 
 82 |             :rval: The ghostscript binary location
 83 | 
 84 |         """
 85 |         windirs = ["c:\\Program Files\\gs", "c:\\Program Files (x86)\\gs"]
 86 |         gs = None
 87 |         for d in windirs:
 88 |             if not os.path.exists(d):
 89 |                 continue
 90 |             cwd = os.getcwd()
 91 |             os.chdir(d)
 92 |             listing = os.listdir('.')
 93 | 
 94 |             # Find all possible gs* sub-directories
 95 | 	    listing = [x for x in listing if x.startswith('gs')]
 96 | 
 97 |             # TODO: Make this a natural sort
 98 |             listing.sort(reverse=True)
 99 | 	    for bindir in listing:
100 | 		binpath = os.path.join(bindir,'bin')
101 | 		if not os.path.exists(binpath): continue
102 | 		os.chdir(binpath)
103 |                 # Look for gswin64c.exe or gswin32c.exe (the c is for the command-line version)
104 | 		gswin = glob.glob('gswin*c.exe')
105 | 		if len(gswin) == 0:
106 | 		    continue
107 | 		gs = os.path.abspath(gswin[0]) # Just use the first found .exe (Do i need to do anything more complicated here?)
108 | 		os.chdir(cwd)
109 | 		return gs
110 | 
111 |         if not gs:
112 |             error(self.msgs['GS_MISSING_BINARY'])
113 | 
114 |     def _warn(self, msg):
115 |         print("WARNING: %s" % msg)
116 | 
117 |     def _get_dpi(self, pdf_filename):
118 |         if not os.path.exists(pdf_filename):
119 |             error(self.msgs['GS_MISSING_PDF'] + " %s" % pdf_filename)
120 | 
121 |         cmd = 'pdfimages -list "%s"' % pdf_filename
122 |         logging.info("Running pdfimages to figure out DPI...")
123 |         logging.debug(cmd)
124 |         try:
125 |             out = subprocess.check_output(cmd, shell=True)
126 |         except subprocess.CalledProcessError as e:
127 |             self._warn ("Could not execute pdfimages to calculate DPI (try installing xpdf or poppler?), so defaulting to %sdpi" % self.output_dpi) 
128 |             return
129 | 
130 |         # Need the second line of output
131 |         # Make sure it exists (in case this is an empty pdf)
132 |         results = out.splitlines()
133 |         if len(results)<3:
134 |             self._warn("Empty pdf, cannot determine dpi using pdfimages")
135 |             return
136 |         results = results[2]
137 |         logging.debug(results)
138 |         results = results.split()
139 |         if(results[2] != 'image'):
140 |             self._warn("Could not understand output of pdfimages, please rerun with -d option and file an issue at http://github.com/virantha/pypdfocr/issues") 
141 |             return
142 |         x_pt, y_pt, greyscale = int(results[3]), int(results[4]), results[5]=='gray'
143 |         self.greyscale = greyscale
144 | 
145 |         # Now, run imagemagick identify to get pdf width/height/density
146 |         cmd = 'identify -format "%%w %%x %%h %%y\n" "%s"' % pdf_filename
147 |         try:
148 |             out = subprocess.check_output(cmd, shell=True)
149 |             results = out.splitlines()[0]
150 |             results = results.replace("Undefined", "")
151 |             width, xdensity, height, ydensity = [float(x) for x in results.split()]
152 |             xdpi = round(x_pt/width*xdensity)
153 |             ydpi = round(y_pt/height*ydensity)
154 |             self.output_dpi = xdpi
155 |             if ydpi>xdpi: self.output_dpi = ydpi
156 |             if self.output_dpi < 300: self.output_dpi = 300
157 |             if abs(xdpi-ydpi) > xdpi*.05:  # Make sure the two dpi's are within 5%
158 |                 self._warn("X-dpi is %d, Y-dpi is %d, defaulting to %d" % (xdpi, ydpi, self.output_dpi))
159 |             else:
160 |                 print("Using %d DPI" % self.output_dpi)
161 | 
162 | 
163 |         except Exception as e:
164 |             logging.debug(str(e))
165 |             self._warn ("Could not execute identify to calculate DPI (try installing imagemagick?), so defaulting to %sdpi" % self.output_dpi) 
166 |         return
167 | 
168 | 
169 | 
170 |     def _run_gs(self, options, output_filename, pdf_filename):
171 |         try:
172 |             cmd = '%s -q -dNOPAUSE %s -sOutputFile="%s" "%s" -c quit' % (self.binary, options, output_filename, pdf_filename)
173 |             logging.info(cmd)        
174 |             out = subprocess.check_output(cmd, shell=True)
175 | 
176 |         except subprocess.CalledProcessError as e:
177 |             print e.output
178 |             if "undefined in .getdeviceparams" in e.output:
179 |                 error(self.msgs['GS_OUTDATED'])
180 |             else:
181 |                 error (self.msgs['GS_FAILED'])
182 | 
183 | 
184 |     def make_img_from_pdf(self, pdf_filename):
185 |         self._get_dpi(pdf_filename) # No need to bother anymore
186 | 
187 |         if not os.path.exists(pdf_filename):
188 |             error(self.msgs['GS_MISSING_PDF'] + " %s" % pdf_filename)
189 | 
190 |         filename, filext = os.path.splitext(pdf_filename)
191 | 
192 | 
193 |         # Create ancillary jpeg files to use later to calculate image dpi etc
194 |         #   We no longer use these for the final image. Instead the text is merged
195 |         #   directly with the original PDF.  Yay!
196 |         if self.greyscale:
197 |             self.img_format = 'jpggrey'
198 |             #self.img_format = 'pnggrey'
199 |             logging.info("Detected greyscale")
200 |         else:
201 |             self.img_format = 'jpg'
202 |             #self.img_format = 'png'
203 |             logging.info("Detected color")
204 | 
205 |         self.img_file_ext = self.gs_options[self.img_format][0]
206 | 
207 |         # The possible output files glob
208 |         globable_filename = '%s_*.%s' % (filename, self.img_file_ext)
209 |         # Delete any img files already existing
210 |         for fn in glob.glob(globable_filename):
211 |             os.remove(fn)
212 | 
213 |         options = ' '.join(self.gs_options[self.img_format][1]) % {'dpi':self.output_dpi}
214 |         output_filename = '%s_%%d.%s' % (filename, self.img_file_ext)
215 |         self._run_gs(options, output_filename, pdf_filename)
216 |         for fn in glob.glob(globable_filename):
217 |             logging.info("Created image %s" % fn)
218 |         return (self.output_dpi, globable_filename)
219 | 
220 | 


--------------------------------------------------------------------------------
/pypdfocr/pypdfocr_interrupts.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright 2015 Virantha Ekanayake All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #    http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import signal, logging
17 | 
18 | """
19 |     Used for handling keyboard interrupts in Pools.
 20 |         Basically, throw an Exception when we see the ctrl-c, so that it actually is propagated to the parent class
21 | """
22 | 
class KeyboardInterruptError(Exception):
    """Raised by :func:`signal_handle` when the interrupt signal arrives."""
24 | 
def signal_handle(_signal, frame):
    """ Signal handler: log that the job is stopping and raise KeyboardInterruptError.

        :param _signal: Signal number passed in by the signal module (unused)
        :param frame: Current stack frame passed in by the signal module (unused)
        :raises KeyboardInterruptError: always
    """
    logging.debug("Stopping job")
    raise KeyboardInterruptError()
28 | 
29 | 
def init_worker():
    """ Install :func:`signal_handle` as the SIGINT handler, so that a
        ctrl-c is turned into a KeyboardInterruptError.
    """
    signal.signal(signal.SIGINT, signal_handle)
34 | 
35 | 


--------------------------------------------------------------------------------
/pypdfocr/pypdfocr_multiprocessing.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python2.7
 2 | # Copyright 2013 Virantha Ekanayake All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #    http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import sys, os, multiprocessing.forking
17 | import logging
18 | 
""" Special work-around to support multiprocessing and pyinstaller --onefile on Windows systems
20 | 
21 |     https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Multiprocessing
22 | """
23 | 
24 | import multiprocessing.forking as forking
25 | import os
26 | import sys
27 | 
class _Popen(multiprocessing.forking.Popen):
    """Popen subclass that sets ``_MEIPASS2`` around process creation when frozen."""

    def __init__(self, *args, **kw):
        is_frozen = hasattr(sys, 'frozen')
        if is_frozen:
            # We have to set original _MEIPASS2 value from sys._MEIPASS
            # to get --onefile mode working.
            os.putenv('_MEIPASS2', sys._MEIPASS)
        try:
            super(_Popen, self).__init__(*args, **kw)
        finally:
            if is_frozen:
                # On some platforms (e.g. AIX) 'os.unsetenv()' is not
                # available. In those cases we cannot delete the variable
                # but only set it to the empty string. The bootloader
                # can handle this case.
                if hasattr(os, 'unsetenv'):
                    os.unsetenv('_MEIPASS2')
                else:
                    os.putenv('_MEIPASS2', '')
46 | 
# Replace multiprocessing.forking's Popen with the patched subclass defined above
# (module-level side effect: happens as soon as this module is imported).
forking.Popen = _Popen
48 | 
49 | #class Process(multiprocessing.Process):
50 |     #_Popen = _Popen
51 | 
52 | # ...
53 | 
if __name__ == '__main__':
    # Only runs when this file is executed as a script.
    # On Windows calling this function is necessary.
    multiprocessing.freeze_support()
57 | 


--------------------------------------------------------------------------------
/pypdfocr/pypdfocr_pdf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2.7
  2 | # Copyright 2013 Virantha Ekanayake All Rights Reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #    http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | 
 17 | # Following code is adapted and modified from hocr-pdf.py released under
 18 | # Apache License, Version 2.0 available at 
 19 | # https://code.google.com/p/hocr-tools/source/browse/hocr-pdf
 20 | #   - Code was improved to allow multi-page hocr files
 21 | """
 22 |     Wrap pdf generation and text addition code
 23 | """
 24 | 
 25 | from optparse import OptionParser
 26 | import sys, os
 27 | import re
 28 | import logging
 29 | import shutil
 30 | import time
 31 | import tempfile
 32 | import glob
 33 | 
 34 | import cStringIO
 35 | import base64
 36 | import zlib
 37 | import math
 38 | 
 39 | from cgi import escape
 40 | # Pkg to read multiple image tiffs
 41 | from PIL import Image
 42 | from reportlab.pdfgen.canvas import Canvas
 43 | from reportlab.pdfbase import pdfmetrics
 44 | from reportlab.pdfbase.ttfonts import TTFont
 45 | from xml.etree.ElementTree import ElementTree, ParseError
 46 | import xml.etree
 47 | 
 48 | # Import Pypdf2
 49 | from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter, utils
 50 | 
 51 | from reportlab.lib.styles import getSampleStyleSheet
 52 | from reportlab.lib.enums import TA_LEFT
 53 | from reportlab.platypus.paragraph import Paragraph
 54 | 
 55 | from pypdfocr_util import Retry
 56 | from functools import partial
 57 | 
 58 | class RotatedPara(Paragraph):
 59 |     """
 60 |         Used for rotating text, since the low-level rotate method in textobject's don't seem to 
 61 |         do anything
 62 |     """
 63 | 
 64 |     def __init__ (self, text, style, angle):
 65 |         Paragraph.__init__(self, text, style)
 66 |         self.angle = angle
 67 | 
 68 |     def draw(self):
 69 |         self.canv.saveState()
 70 |         self.canv.translate(0,0)
 71 |         self.canv.rotate(self.angle)
 72 |         Paragraph.draw(self)
 73 |         self.canv.restoreState()
 74 |     def beginText(self, x, y):
 75 |         t = self.canv.beginText(x,y)
 76 |         t.setTextRenderMode(3)  # Set to zero if you want the text to appear
 77 |         #t.setTextRenderMode(0)  # Set to zero if you want the text to appear
 78 |         return t
 79 | 
 80 | class PyPdf(object):
 81 |     """Class to create pdfs from images"""
 82 |     # Some regexes to compile once
 83 |     regex_bbox = re.compile('bbox((\s+\d+){4})')
 84 |     regex_baseline = re.compile('baseline((\s+[\d\.\-]+){2})')
 85 |     regex_fontspec = re.compile('x_font\s+(.+);\s+x_fsize\s+(\d+)')
 86 |     regex_textangle = re.compile('textangle\s+(\d+)')
 87 | 
 88 |     def __init__(self, gs):
 89 |         self.gs = gs # Pointer to ghostscript object
 90 | 
 91 | 
 92 |     def get_transform(self, rotation, tx, ty):
 93 |         # Code taken from here:
 94 |         # http://stackoverflow.com/questions/6041244/how-to-merge-two-landscape-pdf-pages-using-pypdf/17392824#17392824
 95 |         # Unclear why PyPDF2 builtin page rotation functions don't work
 96 |         translation = [[1, 0, 0],
 97 |                        [0, 1, 0],
 98 |                        [-tx,-ty,1]]
 99 |         rotation = math.radians(rotation)
100 |         rotating = [[math.cos(rotation), math.sin(rotation),0],
101 |                     [-math.sin(rotation),math.cos(rotation), 0],
102 |                     [0,                  0,                  1]]
103 |         rtranslation = [[1, 0, 0],
104 |                        [0, 1, 0],
105 |                        [tx,ty,1]]
106 |         ctm = utils.matrixMultiply(translation, rotating)
107 |         ctm = utils.matrixMultiply(ctm, rtranslation)
108 | 
109 |         return ctm[0][0], ctm[0][1], ctm[1][0], ctm[1][1], ctm[2][0], ctm[2][1]
110 | 
111 |     def mergeRotateAroundPointPage(self,page, page2, rotation, tx, ty):
112 |         # Code taken from here:
113 |         # http://stackoverflow.com/questions/6041244/how-to-merge-two-landscape-pdf-pages-using-pypdf/17392824#17392824
114 |         # Unclear why PyPDF2 builtin page rotation functions don't work
115 |         translation = [[1, 0, 0],
116 |                        [0, 1, 0],
117 |                        [-tx,-ty,1]]
118 |         rotation = math.radians(rotation)
119 |         rotating = [[math.cos(rotation), math.sin(rotation),0],
120 |                     [-math.sin(rotation),math.cos(rotation), 0],
121 |                     [0,                  0,                  1]]
122 |         rtranslation = [[1, 0, 0],
123 |                        [0, 1, 0],
124 |                        [tx,ty,1]]
125 |         ctm = utils.matrixMultiply(translation, rotating)
126 |         ctm = utils.matrixMultiply(ctm, rtranslation)
127 | 
128 |         return page.mergeTransformedPage(page2, [ctm[0][0], ctm[0][1],
129 |                                                  ctm[1][0], ctm[1][1],
130 |                                                  ctm[2][0], ctm[2][1]])
131 | 
132 |     def overlay_hocr_pages(self, dpi, hocr_filenames, orig_pdf_filename):
133 |         
134 |         logging.debug("Going to overlay following files onto %s" % orig_pdf_filename)
135 |         # Sort the hocr_filenames into natural keys!
136 |         hocr_filenames.sort(key=lambda x: self.natural_keys(x[0] ))
137 |         logging.debug(hocr_filenames)
138 | 
139 |         pdf_dir, pdf_basename = os.path.split(orig_pdf_filename)
140 |         basename = os.path.splitext(pdf_basename)[0]
141 |         pdf_filename = os.path.join(pdf_dir, "%s_ocr.pdf" % (basename))
142 | 
143 |         text_pdf_filenames = []
144 |         for img_filename, hocr_filename in hocr_filenames:
145 |             text_pdf_filename = self.overlay_hocr_page(dpi, hocr_filename, img_filename)
146 |             logging.info("Created temp OCR'ed pdf containing only the text as %s" % (text_pdf_filename))
147 |             text_pdf_filenames.append(text_pdf_filename)
148 | 
149 |         # Now, concatenate this text_pdfs into one single file.
150 |         # This is a hack to save memory/running time when we have to do the actual merge with a writer
151 | 
152 |         all_text_filename = os.path.join(pdf_dir, "%s_text.pdf" % (basename))
153 |         merger = PdfFileMerger()
154 |         for text_pdf_filename in text_pdf_filenames:
155 |             merger.append(PdfFileReader(file(text_pdf_filename, 'rb')))
156 |         merger.write(all_text_filename)
157 |         merger.close()
158 | 	del merger
159 | 
160 | 
161 |         writer = PdfFileWriter()
162 |         orig = open(orig_pdf_filename, 'rb')
163 |         text_file = open(all_text_filename, 'rb')
164 | 
165 |         for orig_pg, text_pg in zip(self.iter_pdf_page(orig), self.iter_pdf_page(text_file)):
166 |             orig_pg = self._get_merged_single_page(orig_pg, text_pg)
167 |             writer.addPage(orig_pg)
168 | 
169 |         with open(pdf_filename, 'wb') as f:
170 |             # Flush out this page merge so we can close the text_file
171 |             writer.write(f)
172 | 
173 |         orig.close()
174 |         text_file.close()
175 | 
176 |         # Windows sometimes locks the temp text file for no reason, so we need to retry a few times to delete
177 |         for fn in text_pdf_filenames:
178 |             #os.remove(fn)
179 |             Retry(partial(os.remove, fn), tries=10, pause=3).call_with_retry() 
180 | 
181 |         os.remove(all_text_filename)
182 |         logging.info("Created OCR'ed pdf as %s" % (pdf_filename))
183 | 
184 |         return pdf_filename
185 | 
186 |     def _get_merged_single_page(self, original_page, ocr_text_page):
187 |         """
188 |             Take two page objects, rotate the text page if necessary, and return the merged page
189 |         """
190 |         orig_rotation_angle = int(original_page.get('/Rotate', 0))
191 | 
192 |         if orig_rotation_angle != 0:
193 |             logging.info("Original Rotation: %s" % orig_rotation_angle)
194 |             self.mergeRotateAroundPointPage(original_page, ocr_text_page, orig_rotation_angle, ocr_text_page.mediaBox.getWidth()/2, ocr_text_page.mediaBox.getWidth()/2)
195 |             # None of these commands worked for me:
196 |             #orig_pg.rotateCounterClockwise(orig_rotation_angle)
197 |             #orig_pg.mergeRotatedPage(text_pg,orig_rotation_angle)
198 |         else:
199 |             original_page.mergePage(ocr_text_page)
200 |         original_page.compressContentStreams()
201 |         return original_page
202 | 
203 | 
204 |     def _get_img_dims(self, img_filename):
205 |         """
206 |             :rval: (width, height, dpi)
207 |         """
208 |         img = Image.open(img_filename)
209 |         w,h = img.size
210 |         dpi = img.info['dpi']
211 |         width = w*72.0/dpi[0]
212 |         height = h*72.0/dpi[1]
213 |         del img
214 |         return (width, height, dpi)
215 | 
216 |     def overlay_hocr_page(self, dpi, hocr_filename, img_filename):
217 |         hocr_dir, hocr_basename = os.path.split(hocr_filename)
218 |         img_dir, img_basename = os.path.split(img_filename)
219 |         logging.debug("hocr_filename:%s, hocr_dir:%s, hocr_basename:%s" % (hocr_filename, hocr_dir, hocr_basename))
220 |         assert(img_dir == hocr_dir)
221 | 
222 |         #basename = hocr_basename.split('.')[0]
223 |         basename = os.path.splitext(hocr_basename)[0]
224 |         pdf_filename = os.path.join("text_%s_ocr.pdf" % (basename))
225 | 
226 |         # Switch to the hocr directory to make this easier
227 |         cwd = os.getcwd()
228 |         if hocr_dir != "":
229 |             os.chdir(hocr_dir)
230 | 
231 |         with open(pdf_filename, "wb") as f:
232 |             logging.info("Overlaying hocr and creating text pdf %s" % pdf_filename)
233 |             pdf = Canvas(f, pageCompression=1)
234 |             pdf.setCreator('pypdfocr')
235 |             pdf.setTitle(os.path.basename(hocr_filename))
236 |             pdf.setPageCompression(1)
237 | 
238 |             width, height, dpi_jpg = self._get_img_dims(img_basename)
239 |             pdf.setPageSize((width,height))
240 |             logging.info("Page width=%f, height=%f" % (width, height))
241 | 
242 |             pg_num = 1
243 | 
244 |             logging.info("Adding text to page %s" % pdf_filename)
245 |             self.add_text_layer(pdf,hocr_basename,pg_num,height,dpi)
246 |             pdf.showPage()
247 |             pdf.save()
248 | 
249 |         os.chdir(cwd)
250 |         return os.path.join(hocr_dir, pdf_filename)
251 | 
252 |     def iter_pdf_page(self, f):
253 |         reader = PdfFileReader(f)
254 |         for pgnum in range(reader.getNumPages()):
255 |             pg = reader.getPage(pgnum)
256 |             yield pg
257 | 
258 |     def _atoi(self,text):
259 |         return int(text) if text.isdigit() else text
260 | 
261 |     def natural_keys(self, text):
262 |         '''
263 |         alist.sort(key=natural_keys) sorts in human order
264 |         http://nedbatchelder.com/blog/200712/human_sorting.html
265 |         (See Toothy's implementation in the comments)
266 |         '''
267 |         return [ self._atoi(c) for c in re.split('(\d+)', text) ]
268 | 
    def add_text_layer(self,pdf, hocrfile, page_num,height, dpi):
      """Draw an invisible text layer for OCR data.

        Parses ``hocrfile`` and draws every ``ocrx_word`` found inside an
        ``ocr_line`` span onto ``pdf`` as a ``RotatedPara``.  If the hocr file
        cannot be parsed, nothing is drawn and the method returns early.

        :param pdf: canvas the words are drawn on (via ``wrapOn``/``drawOn``)
        :param hocrfile: hocr file to parse with ElementTree
        :param page_num: page to use; matched against the id ``page_<page_num>``
        :param height: page height; y positions are computed as ``height - y*72/dpi``
        :param dpi: resolution used to scale the bbox coordinates by ``72/dpi``
        :returns: None

        This function really needs to get cleaned up
      """
      hocr = ElementTree()
      try: 
        # It's possible tesseract has failed and written garbage to this hocr file, so we need to catch any exceptions
          hocr.parse(hocrfile)
      except Exception:
          logging.info("Error loading hocr, not adding any text")
          return 

      logging.debug(xml.etree.ElementTree.tostring(hocr.getroot()))
      # NOTE(review): this loop never breaks, so after it ``c`` is simply the LAST
      # child of the root element, whatever its tag -- presumably that is the body; confirm.
      for c in hocr.getroot():  # Find the <body> tag
          if c.tag != 'body':
              continue
      # Pick the page whose id matches page_num; if none matches, ``page`` is left
      # as the last child iterated.
      for page in c: # Each child in the body is a page tag
          if (page.attrib['class'] != "ocr_page"):
              # NOTE(review): asserting a non-empty string always passes, so this never fires
              assert ("Why is this hocr not paging properly??")
          if page.attrib['id'] == 'page_%d' %(page_num):
              break

      for line in page.findall(".//{http://www.w3.org/1999/xhtml}span"):
      #for line in page.findall(".//span"):
        # Only ocr_line spans are processed; everything else is skipped
        if line.attrib['class'] != 'ocr_line':
          continue
        linebox = self.regex_bbox.search(line.attrib['title']).group(1).split()
        textangle = self.regex_textangle.search(line.attrib['title'])
        if textangle:
            textangle = self._atoi(textangle.group(1))
        else:
            textangle = 0

        try:
          baseline = self.regex_baseline.search(line.attrib['title']).group(1).split()
        except AttributeError:
          # No baseline in the title (search returned None)
          baseline = [ 0, 0 ]

        # linebox and baseline are parsed here but not used further down
        linebox = [float(i) for i in linebox]
        baseline = [float(i) for i in baseline]

        for word in line:
          if word.attrib['class'] != 'ocrx_word':
            continue
          # Collect the text of the word element and all of its descendants
          word_text = []
          for child in word.iter():
              if child.text:
                  word_text.append(child.text)
          word.text = ' '.join(word_text)
          # NOTE(review): join() always returns a string, so this check cannot be true
          if word.text is None:
            continue
          logging.debug("word: %s, angle: %d" % ( word.text.strip(), textangle))


          box = self.regex_bbox.search(word.attrib['title']).group(1).split()
          #b = self.polyval(baseline, (box[0] + box[2]) / 2 - linebox[0]) + linebox[3]
          box = [float(i) for i in box]

          # Transform angle to x,y co-ords needed for proper text placement
          # We only support 0, 90, 180, 270!.  Anything else, we'll just use the normal orientation for now

          coords = { 0: (box[0], box[1]),
                    90: (box[0], box[3]),  # facing right
                    180: (box[2], box[3]), # upside down
                    270: (box[2], box[1]), # facing left
                    }
          x,y = coords.get(textangle, (box[0], box[1]))

          style = getSampleStyleSheet()
          normal = style["BodyText"]
          normal.alignment = TA_LEFT
          normal.leading = 0
          # Only the parsed font size is used; the font name is always Helvetica
          font_name, font_size = self._get_font_spec(word.attrib['title'])
          normal.fontName = "Helvetica"
          normal.fontSize = font_size

          para = RotatedPara(escape(word.text.strip()), normal, textangle)
          para.wrapOn(pdf, para.minWidth(), 100)  # Not sure what to use as the height  here
          # Scale bbox coordinates by 72/dpi and measure y down from ``height``
          para.drawOn(pdf, x*72/dpi, height - y*72/dpi)
350 | 
351 | 
352 | 
353 |     def polyval(self,poly, x):
354 |       return x * poly[0] + poly[1]
355 | 
356 | 
357 |     def _get_font_spec(self, tag):
358 |         try:
359 |             fontspec = self.regex_fontspec.search(tag).groups()
360 |             fontname, fontsize = fontspec
361 |         except Exception:
362 |             fontname = ""
363 |             fontsize = "8"
364 |         return (fontname, self._atoi(fontsize))
365 | 


--------------------------------------------------------------------------------
/pypdfocr/pypdfocr_pdffiler.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright 2013 Virantha Ekanayake All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #    http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """
17 |     Provides capability to search PDFs and file to a specific folder based
18 |     on keywords
19 | """
20 | 
21 | from sets import Set    
22 | import sys, os
23 | import re
24 | import logging
25 | import shutil
26 | 
27 | from PyPDF2 import PdfFileReader
28 | from pypdfocr_filer import PyFiler
29 | from pypdfocr_filer_dirs import PyFilerDirs
30 | 
class PyPdfFiler(object):
    """Searches the text of a PDF for keywords and hands the file to a filer
    to be moved into the matching folder."""

    def __init__(self, filer):
        """
            :param filer: object that does the actual filing; must be a PyFiler instance
        """
        assert isinstance(filer, PyFiler)
        self.filer = filer  # Must be a subclass of PyFiler

        # Whether to fall back on filename for matching keywords against
        # if there is no match in the text
        self.file_using_filename = False 

    def iter_pdf_page_text(self, filename):
        """Yield the extracted text of each page of ``filename``, encoded to
        ascii (unencodable characters dropped) and with newlines replaced by spaces."""
        self.filename = filename
        reader = PdfFileReader(filename)
        logging.info("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename))
        for pgnum in range(reader.getNumPages()):
            text = reader.getPage(pgnum).extractText()
            text = text.encode('ascii', 'ignore')
            text = text.replace('\n', ' ')
            yield text

    def _get_matching_folder(self, pdfText):
        """Return the first folder in ``self.filer.folder_targets`` that has a
        keyword contained in the lower-cased ``pdfText``, or None if nothing matches.

        Only the text is lower-cased; the keywords are compared as stored.
        """
        searchText = pdfText.lower()
        for folder,strings in self.filer.folder_targets.items():
            for s in strings:
                logging.debug("Checking string %s" % s)
                if s in searchText:
                    logging.info("Matched keyword '%s'" % s)
                    return folder
        # No match found, so return 
        return None

    def file_original (self, original_filename):
        """Delegate filing of the original file to the filer."""
        return self.filer.file_original(original_filename)

    def move_to_matching_folder(self, filename):
        """
            Find the target folder for ``filename`` and ask the filer to move it there.

            Pages are searched in order and the search stops at the first match.
            If no page matches and ``file_using_filename`` is set, the filename
            itself is matched against the keywords.

            :returns: whatever ``self.filer.move_to_matching_folder`` returns
        """
        # Must be initialised: a pdf without pages never enters the loop below
        tgt_folder = None
        for page_text in self.iter_pdf_page_text(filename):
            tgt_folder = self._get_matching_folder(page_text)
            if tgt_folder: break  # Stop searching through pdf pages as soon as we find a match

        if not tgt_folder and self.file_using_filename:
            tgt_folder = self._get_matching_folder(filename)

        tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder)
        return tgt_file
75 |         
if __name__ == '__main__':
    # Manual check: print the text extracted from each page of scan_ocr.pdf
    p = PyPdfFiler(PyFilerDirs())
    for page_text in p.iter_pdf_page_text("scan_ocr.pdf"):
        print (page_text)
80 | 
81 | 


--------------------------------------------------------------------------------
/pypdfocr/pypdfocr_preprocess.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2.7
  2 | 
  3 | # Copyright 2013 Virantha Ekanayake All Rights Reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #    http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | 
 18 | 
 19 | """
 20 |     Wrap ImageMagick calls.  Yes, this is ugly.
 21 | """
 22 | 
 23 | import subprocess
 24 | import sys, os
 25 | import logging
 26 | import glob
 27 | import functools
 28 | import signal
 29 | 
 30 | from multiprocessing import Pool
 31 | from pypdfocr_interrupts import init_worker
 32 | 
 33 | # Ugly hack to pass in object method to the multiprocessing library
 34 | # From http://www.rueckstiess.net/research/snippets/show/ca1d7d90
 35 | # Basically gets passed in a pair of (self, arg), and calls the method
 36 | def unwrap_self(arg, **kwarg):
 37 |     return PyPreprocess._run_preprocess(*arg, **kwarg)
 38 | 
 39 | 
 40 | 
 41 | class PyPreprocess(object):
 42 |     """Class to wrap all the ImageMagick convert calls"""
 43 |     def __init__(self, config):
 44 |         self.msgs = {
 45 |                 'CV_FAILED': 'convert execution failed',
 46 |             }
 47 |         self.threads = config.get('threads', 4)
 48 | 
 49 |     def _warn(self, msg): # pragma: no cover
 50 |         print("WARNING: %s" % msg)
 51 | 
 52 |     def cmd(self, cmd_list):
 53 |         if isinstance(cmd_list, list):
 54 |             cmd_list = ' '.join(cmd_list)
 55 |         logging.debug("Running cmd: %s" % cmd_list)
 56 |         try:
 57 |             out = subprocess.check_output(cmd_list, stderr=subprocess.STDOUT, shell=True)
 58 |             logging.debug(out)
 59 |             return out
 60 |         except subprocess.CalledProcessError as e:
 61 |             print e.output
 62 |             self._warn("Could not run command %s" % cmd_list)
 63 |             
 64 | 
 65 |     def _run_preprocess(self,  in_filename):
 66 |         basename, filext = os.path.splitext(in_filename)
 67 |         out_filename = '%s_preprocess%s' % (basename, filext)
 68 |         #-respect-parenthesis \( -clone 0 -colorspace gray -negate -lat 15x5+5% -contrast-stretch 0 \) -compose copy_opacity -composite -opaque none +matte -modulate 100,50 -adaptive-blur 2.0 -sharpen 0x1 
 69 |         # When using Windows, can't use backslash parenthesis in the shell, so omit the backslash
 70 |         if str(os.name) == 'nt':
 71 |             backslash = ''
 72 |         else:
 73 |             backslash = '\\'
 74 | 
 75 |         c = ['convert',
 76 |                 '"%s"' % in_filename,
 77 |                 '-respect-parenthesis',
 78 |                 #'\\( $setcspace -colorspace gray -type grayscale \\)',
 79 |                 backslash+'(',
 80 |                 '-clone 0',
 81 |                 '-colorspace gray -negate -lat 15x15+5% -contrast-stretch 0',
 82 |                 backslash+') -compose copy_opacity -composite -opaque none +matte -modulate 100,100',
 83 |                 #'-adaptive-blur 1.0',
 84 |                 '-blur 1x1',
 85 |                 #'-selective-blur 4x4+5%',
 86 |                 '-adaptive-sharpen 0x2',
 87 |                 '-negate -define morphology:compose=darken -morphology Thinning Rectangle:1x30+0+0 -negate ',  # Removes vertical lines >=60 pixes, reduces widht of >30 (oherwise tesseract < 3.03 completely ignores text close to vertical lines in a table)
 88 |                 '"%s"' % (out_filename)
 89 |                 ]
 90 |         logging.info("Preprocessing image %s for better OCR" % in_filename)
 91 |         res = self.cmd(c)
 92 |         if res is None:
 93 |             return in_filename
 94 |         else:
 95 |             return out_filename
 96 | 
 97 |     def preprocess(self, in_filenames):
 98 |         fns = in_filenames
 99 | 
100 |         pool = Pool(processes=self.threads, initializer=init_worker)
101 |         try:
102 |             logging.info("Starting preprocessing parallel execution")
103 |             preprocessed_filenames = pool.map(unwrap_self,zip([self]*len(fns),fns))
104 |             pool.close()
105 |         except KeyboardInterrupt or Exception:
106 |             print("Caught keyboard interrupt... terminating")
107 |             pool.terminate()
108 |             #sys,exit(-1)
109 |             raise
110 |         finally:
111 |             pool.join()
112 |             logging.info ("Completed preprocessing")
113 | 
114 |         return preprocessed_filenames
115 | 
116 | 
117 | 
118 | 
119 | 


--------------------------------------------------------------------------------
/pypdfocr/pypdfocr_tesseract.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2.7
  2 | 
  3 | # Copyright 2013 Virantha Ekanayake All Rights Reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #    http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | 
 18 | """
 19 |    Run Tesseract to generate hocr file 
 20 | """
 21 | 
 22 | import os, sys
 23 | import logging
 24 | import subprocess
 25 | import glob
 26 | from subprocess import CalledProcessError
 27 | 
 28 | from multiprocessing import Pool
 29 | from pypdfocr_interrupts import init_worker
 30 | 
 31 | def error(text):
 32 |     print("ERROR: %s" % text)
 33 |     sys.exit(-1)
 34 | 
 35 | # Ugly hack to pass in object method to the multiprocessing library
 36 | # From http://www.rueckstiess.net/research/snippets/show/ca1d7d90
 37 | # Basically gets passed in a pair of (self, arg), and calls the method
 38 | def unwrap_self(arg, **kwarg):
 39 |     return PyTesseract.make_hocr_from_pnm(*arg, **kwarg)
 40 | 
 41 | class PyTesseract(object):
 42 |     """Class to wrap all the tesseract calls"""
 43 |     def __init__(self, config):
 44 |         """
 45 |            Detect windows tesseract location.  
 46 |         """
 47 |         self.lang = 'eng'
 48 |         self.required = "3.02.02"
 49 |         self.threads = config.get('threads',4)
 50 | 
 51 |         if "binary" in config:  # Override location of binary
 52 |             binary = config['binary']
 53 |             if os.name == 'nt':
 54 |                 binary = '"%s"' % binary
 55 |                 binary = binary.replace("\\", "\\\\")
 56 |             logging.info("Setting location for tesseracdt executable to %s" % (binary))
 57 |         else:
 58 |             if str(os.name) == 'nt':
 59 |                 # Explicit str here to get around some MagicMock stuff for testing that I don't quite understand
 60 |                 binary = '"c:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe"'
 61 |             else:
 62 |                 binary = "tesseract"
 63 | 
 64 |         self.binary = binary
 65 | 
 66 |         self.msgs = {
 67 |             'TS_MISSING': """ 
 68 |                 Could not execute %s
 69 |                 Please make sure you have Tesseract installed correctly
 70 |                 """ % self.binary,
 71 |             'TS_VERSION':'Tesseract version is too old',
 72 |             'TS_img_MISSING':'Cannot find specified tiff file',
 73 |             'TS_FAILED': 'Tesseract-OCR execution failed!',
 74 |         }
 75 | 
 76 | 
 77 |     def _is_version_uptodate(self):
 78 |         """
 79 |             Make sure the version is current 
 80 |         """
 81 |         logging.info("Checking tesseract version")
 82 |         cmd = '%s -v' % (self.binary)
 83 |         logging.info(cmd)        
 84 |         try:
 85 |             ret_output = subprocess.check_output(cmd, shell=True,  stderr=subprocess.STDOUT)
 86 |         except CalledProcessError:
 87 |             # Could not run tesseract
 88 |             error(self.msgs['TS_MISSING'])
 89 | 
 90 |         ver_str = '0.0.0'
 91 |         for line in ret_output.splitlines():
 92 |             if 'tesseract' in line:
 93 |                 ver_str = line.split(' ')[1]
 94 |                 if ver_str.endswith('dev'): # Fix for version strings that end in 'dev'
 95 |                     ver_str = ver_str[:-3]
 96 | 
 97 |         # Iterate through the version dots
 98 |         ver = [int(x) for x in ver_str.split('.')]
 99 |         req = [int(x) for x in self.required.split('.')]
100 | 
101 |         # Aargh, in windows 3.02.02 is reported as version 3.02  
102 |         # SFKM
103 |         if str(os.name) == 'nt':
104 |             req = req[:2]
105 | 
106 |         version_good = False
107 |         for i,num in enumerate(req):
108 |             if len(ver) < i+1:
109 |                 # This minor version number is not present in tesseract, so it must be
110 |                 # lower than required.  (3.02 < 3.02.01)
111 |                 break
112 |             if ver[i]==num and len(ver) == i+1 and len(ver)==len(req):
113 |                 # 3.02.02 == 3.02.02
114 |                 version_good = True
115 |                 continue
116 |             if ver[i]>num:
117 |                 # 4.0 > 3.02.02
118 |                 # 3.03.02 > 3.02.02
119 |                 version_good = True
120 |                 break
121 |             if ver[i]<num:
122 |                 # 3.01.02 < 3.02.02
123 |                 break
124 |             
125 |         return version_good, ver_str
126 | 
127 |     def _warn(self, msg): # pragma: no cover
128 |         print("WARNING: %s" % msg)
129 | 
130 | 
131 |     def make_hocr_from_pnms(self, fns):
132 |         uptodate,ver =  self._is_version_uptodate()
133 |         if not uptodate:
134 |             error(self.msgs['TS_VERSION']+ " (found %s, required %s)" % (ver, self.required))
135 | 
136 |         # Glob it
137 |         #fns = glob.glob(img_filename)
138 |         logging.debug("Making pool for tesseract")
139 |         pool = Pool(processes=self.threads, initializer=init_worker)
140 | 
141 |         try:
142 |             hocr_filenames = pool.map(unwrap_self, zip([self]*len(fns), fns))
143 |             pool.close()
144 |         except KeyboardInterrupt or Exception:
145 |             print("Caught keyboard interrupt... terminating")
146 |             pool.terminate()
147 |             raise
148 |         finally:
149 |             pool.join()
150 | 
151 |         return zip(fns,hocr_filenames)
152 | 
153 | 
154 |     def make_hocr_from_pnm(self, img_filename):
155 | 
156 |         basename,filext = os.path.splitext(img_filename)
157 |         hocr_filename = "%s.html" % basename
158 | 
159 |         if not os.path.exists(img_filename):
160 |             error(self.msgs['TS_img_MISSING'] + " %s" % (img_filename))
161 | 
162 |         logging.info("Running OCR on %s to create %s.html" % (img_filename, basename))
163 |         cmd = '%s "%s" "%s" -psm 1 -c hocr_font_info=1 -l %s hocr' % (self.binary, img_filename, basename, self.lang)
164 |         logging.info(cmd)
165 |         try:
166 |             ret_output = subprocess.check_output(cmd, shell=True,  stderr=subprocess.STDOUT)
167 |         except subprocess.CalledProcessError as e:
168 |             # Could not run tesseract
169 |             print e.output
170 |             self._warn (self.msgs['TS_FAILED'])
171 |                 
172 |         if os.path.isfile(hocr_filename):
173 |             # Output format is html for old versions of tesseract
174 |             logging.info("Created %s.html" % basename)
175 |             return hocr_filename
176 |         else:
177 |             # Try changing extension to .hocr for tesseract 3.03 and higher
178 |             hocr_filename = "%s.hocr" % basename
179 |             if os.path.isfile(hocr_filename):
180 |                 logging.info("Created %s.hocr" % basename)
181 |                 return hocr_filename
182 |             else:
183 |                 error(self.msgs['TS_FAILED'])
184 |             
185 | 


--------------------------------------------------------------------------------
/pypdfocr/pypdfocr_util.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Copyright 2015 Virantha Ekanayake All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #    http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | import logging
16 | import time
17 | """
18 |     Various utility classes
19 | """
20 | 
class Retry(object):
    """
        Call a function repeatedly until it succeeds or the tries run out.

        :param func: Callable taking no arguments
        :param tries: Maximum number of calls to make
        :param pause: Seconds to sleep between a failed call and the next one
    """

    def __init__ (self, func, tries=3, pause=1):
        self.func = func
        self.tries = tries
        self.pause = pause

    def call_with_retry(self):
        """
            Call ``self.func`` until it returns without raising.

            :returns: The return value of the first successful call, or None
                      if ``tries`` is zero or negative (func is never called).
            :raises: The exception from the last attempt, once all tries
                     have failed.
        """
        tries_left = self.tries

        while tries_left > 0:
            try:
                return self.func()
            except Exception:
                tries_left -= 1
                logging.exception("intermediate failure")
                if tries_left == 0:
                    # Out of attempts: a bare raise keeps the original
                    # traceback, and there is no point sleeping first.
                    raise
                logging.info("Retrying (tries left %d)" % (tries_left))
                time.sleep(self.pause)

        return None
45 |                 
46 | 
47 | 
class ExecutableSearcher(object):
    """
        Base class for executable searchers.
    """
    pass


class WindowsExecutableSearcher(ExecutableSearcher):
    """
        Find a Windows executable somewhere below a root directory.

        :param possible_dir_names: Directory name fragments (a string or a
               list of strings).  When any are given, only directories whose
               path below the search root contains one of them
               (case-insensitive) are examined.
        :param possible_exe_names: Executable names (a string or a list of
               strings).  '.exe' is appended to names that lack it.
    """

    def __init__(self, possible_dir_names, possible_exe_names):
        self.dir_names = [name.lower() for name in self._as_list(possible_dir_names)]
        self.exe_names = [self._with_exe_suffix(name)
                          for name in self._as_list(possible_exe_names)]
        # First candidate name; returned by find() when nothing is found
        self.exe_name = self.exe_names[0] if self.exe_names else None

    @staticmethod
    def _as_list(names):
        """ Wrap a single string in a list; pass other iterables through as a list """
        if names is None:
            return []
        if isinstance(names, (type(''), type(u''))):
            return [names]
        return list(names)

    @staticmethod
    def _with_exe_suffix(exe_name):
        """ Return exe_name with a '.exe' extension appended if it has none """
        if exe_name.lower().endswith('.exe'):
            return exe_name
        return exe_name + '.exe'

    def find(self, root):
        """
            Search below root for the given executable

            :param root: Directory to start the search in
            :type root: string
            :returns: Full path of the first matching executable, or the bare
                      executable name if root does not exist or nothing matches
            :rtype: string
        """
        import os  # imported here so this class does not rely on a file-level import

        found_exe = self.exe_name
        if not os.path.exists(root):
            return found_exe

        wanted = set(name.lower() for name in self.exe_names)
        # Walk with explicit paths instead of changing the process-wide cwd
        for dirpath, dirs, files in os.walk(root, topdown=True):
            dirs.sort()  # visit sub-directories in a stable order
            rel_dir = os.path.relpath(dirpath, root).lower()
            if self.dir_names and not any(name in rel_dir for name in self.dir_names):
                continue
            for filename in sorted(files):
                if filename.lower() in wanted:
                    return os.path.join(dirpath, filename)

        return found_exe
77 | 
78 | 
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------
/pypdfocr/pypdfocr_watcher.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Something
  3 | """
  4 | 
  5 | import sys, os
  6 | import re
  7 | import logging
  8 | import shutil
  9 | import time
 10 | import glob
 11 | 
 12 | from threading import Lock
 13 | 
 14 | from watchdog.observers import Observer
 15 | from watchdog.events import LoggingEventHandler
 16 | from watchdog.events import FileSystemEventHandler
 17 | 
 18 |         
 19 | class PyPdfWatcher(FileSystemEventHandler):
 20 |     """
 21 |         Watch a folder for new pdf files.
 22 | 
 23 |         If new file event, then add it to queue with timestamp.
 24 |         If file mofified event, then change timestamp in queue.
 25 |         Every few seconds pop-off queue and if timestamp older than 3 seconds,
 26 |         process the file else, push it back onto queue.
 27 |     """
 28 |     events = {}
 29 |     events_lock = Lock()
 30 | 
 31 |     def __init__(self, monitor_dir, config):
 32 |         FileSystemEventHandler.__init__(self)
 33 | 
 34 |         self.monitor_dir = monitor_dir
 35 |         if not config: config = {}
 36 | 
 37 |         self.scan_interval = config.get('scan_interval', 3) # If no updates in 3 seconds (or user specified option in config file) process file
 38 | 
 39 |     def start(self):
 40 |         self.observer = Observer()
 41 |         self.observer.schedule(self, self.monitor_dir)
 42 |         self.observer.start()
 43 |         print("Starting to watch for new pdfs in %s" % (self.monitor_dir))
 44 |         while True:
 45 |             logging.info("Sleeping for %d seconds" % self.scan_interval)
 46 |             time.sleep(self.scan_interval)
 47 |             newFile = self.check_queue()
 48 |             if newFile:
 49 |                 yield newFile
 50 |         self.observer.join()
 51 |             
 52 | 
 53 |     def stop(self):
 54 |         self.observer.stop()
 55 |         
 56 |     def rename_file_with_spaces(self, pdf_filename):
 57 |         """
 58 |             Rename any portion of a filename that has spaces in the basename with underscores.
 59 |             Does not affect spaces in the directory path.
 60 | 
 61 |             :param pdf_filename: Filename to remove spaces
 62 |             :type pdf_filename: string
 63 |             :returns: Modified filename
 64 |             :rtype: string
 65 |         """
 66 |         filepath, filename = os.path.split(pdf_filename)
 67 |         if ' ' in filename:
 68 |             newFilename = os.path.join(filepath, filename.replace(' ','_'))
 69 |             logging.debug("Renaming spaces")
 70 |             logging.debug("---> %s \n ------> %s" % (pdf_filename, newFilename))
 71 |             shutil.move(pdf_filename, newFilename) 
 72 |             return newFilename
 73 |         else:
 74 |             return pdf_filename
 75 | 
 76 |     def check_for_new_pdf(self,ev_path):
 77 |         """
 78 |             Called by the file watching api on any file creations/modifications.
 79 |             For any file ending with ".pdf", but not "_ocr.pdf", it adds new files
 80 |             to the event queue with the current time stamp, or it updates existing files in
 81 |             the queue with the current timestamp.  This queue is used to track files and
 82 |             keep track of their last "touched" time, so we can start processing a file if
 83 |             :func:`check_queue` finds a file that hasn't been touched in a while.
 84 | 
 85 |             If the file does note exist in the events dict:
 86 | 
 87 |                 - Add it with the current time
 88 | 
 89 |             Otherwise:
 90 |                 
 91 |                 - If the file time is marked as -1, delete it from the dict
 92 |                 - Else, update the time in the dict to the current time
 93 | 
 94 |         """
 95 |         if ev_path.endswith(".pdf"):
 96 |             if not ev_path.endswith(("_ocr.pdf", "_test.pdf")):
 97 |                 PyPdfWatcher.events_lock.acquire()
 98 |                 if not ev_path in PyPdfWatcher.events:
 99 |                     PyPdfWatcher.events[ev_path] = time.time()
100 |                     logging.info ("Adding %s to event queue" % ev_path)
101 |                 else:
102 |                     if PyPdfWatcher.events[ev_path] == -1:
103 |                         logging.info ( "%s removing from event queue" % (ev_path))
104 |                         del PyPdfWatcher.events[ev_path]
105 |                     else: 
106 |                         newTime = time.time()
107 |                         logging.debug ( "%s already in event queue, updating timestamp to %d" % (ev_path, newTime))
108 |                         PyPdfWatcher.events[ev_path]  = newTime
109 |                 PyPdfWatcher.events_lock.release()
110 | 
111 |                       
112 |               
113 |     def on_created(self, event):
114 |         logging.debug ("on_created: %s at time %d" % (event.src_path, time.time()))
115 |         self.check_for_new_pdf(event.src_path)
116 | 
117 |     def on_moved(self, event):
118 |         logging.debug ("on_moved: %s" % event.src_path)
119 |         self.check_for_new_pdf(event.dest_path)
120 | 
121 |     def on_modified(self, event):
122 |         logging.debug ("on_modified: %s" % event.src_path)
123 |         self.check_for_new_pdf(event.src_path)
124 | 
125 |     def check_queue(self):
126 |         """
127 |             This function is called at regular intervals by :func:`start`.
128 |             
129 |             Iterate through the events, and if there is any with a timestamp
130 |             greater than the scan_interval, return it and set its timestamp to -1
131 |             for purging later.
132 | 
133 |             :returns: Filename if available to process, otherwise None.
134 |         """
135 |         now = time.time()
136 |         PyPdfWatcher.events_lock.acquire()
137 |         for monitored_file, timestamp in PyPdfWatcher.events.items():
138 |             if timestamp == -1:
139 |                 del PyPdfWatcher.events[monitored_file]
140 |             elif now - timestamp > self.scan_interval:
141 |                 logging.info("Processing new file %s" % (monitored_file))
142 |                 # Remove this file from the dict
143 |                 del PyPdfWatcher.events[monitored_file]
144 |                 monitored_file = self.rename_file_with_spaces(monitored_file)
145 |                 PyPdfWatcher.events[monitored_file] = -1 # Add back into queue and mark as not needing further action in the event handler
146 |                 PyPdfWatcher.events_lock.release()
147 |                 return monitored_file
148 |         PyPdfWatcher.events_lock.release()
149 |         return None
150 | 
151 | 
152 | 
153 | 


--------------------------------------------------------------------------------
/pypdfocr/version.py:
--------------------------------------------------------------------------------
# Package version string
__version__ = "0.9.1"
2 | 


--------------------------------------------------------------------------------
/pypdfocr_windows.spec:
--------------------------------------------------------------------------------
# -*- mode: python -*-
# PyInstaller spec file for the Windows build of pypdfocr.
# Analysis, PYZ and EXE are not imported here; presumably PyInstaller
# provides them when it executes the spec -- confirm.

# Analyse the entry script.
# NOTE(review): pathex is an absolute path on one developer's machine.
# hiddenimports names reportlab font-data modules and rl_settings explicitly,
# presumably because they are not found by import analysis -- confirm.
a = Analysis(['pypdfocr\\pypdfocr.py'],
             pathex=['C:\\Users\\Virantha Ekanayake\\dev\\pypdfocr'],
	     hiddenimports = [
		    'reportlab.pdfbase._fontdata_enc_macexpert',
		    'reportlab.pdfbase._fontdata_enc_macroman',
		    'reportlab.pdfbase._fontdata_enc_pdfdoc',
		    'reportlab.pdfbase._fontdata_enc_standard',
		    'reportlab.pdfbase._fontdata_enc_symbol',
		    'reportlab.pdfbase._fontdata_enc_winansi',
		    'reportlab.pdfbase._fontdata_enc_zapfdingbats',
		    'reportlab.pdfbase._fontdata_widths_courier',
		    'reportlab.pdfbase._fontdata_widths_courierbold',
		    'reportlab.pdfbase._fontdata_widths_courierboldoblique',
		    'reportlab.pdfbase._fontdata_widths_courieroblique',
		    'reportlab.pdfbase._fontdata_widths_helvetica',
		    'reportlab.pdfbase._fontdata_widths_helveticabold',
		    'reportlab.pdfbase._fontdata_widths_helveticaboldoblique',
		    'reportlab.pdfbase._fontdata_widths_helveticaoblique',
		    'reportlab.pdfbase._fontdata_widths_symbol',
		    'reportlab.pdfbase._fontdata_widths_timesbold',
		    'reportlab.pdfbase._fontdata_widths_timesbolditalic',
		    'reportlab.pdfbase._fontdata_widths_timesitalic',
		    'reportlab.pdfbase._fontdata_widths_timesroman',
		    'reportlab.pdfbase._fontdata_widths_zapfdingbats',
		    'reportlab.rl_settings'],
             hookspath=None,
             runtime_hooks=None)
# Archive built from the pure-Python modules collected by the analysis
pyz = PYZ(a.pure)
# Console executable named pypdfocr.exe; the scripts, binaries, zipfiles and
# datas from the analysis are all passed to EXE, with UPX enabled and debug off
exe = EXE(pyz,
          a.scripts,
          a.binaries,
          a.zipfiles,
          a.datas,
          name='pypdfocr.exe',
          debug=False,
          strip=None,
          upx=True,
          console=True )
40 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pillow>=2.2
2 | reportlab>=2.7
3 | watchdog>=0.6.0
4 | pypdf2>=1.23
5 | evernote
6 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from setuptools import setup, find_packages
 3 | 
 4 | import pypdfocr
 5 | import io
 6 | from pypdfocr.version import __version__
 7 | from setuptools import Command
 8 | import os
 9 | 
class PyTest(Command):
    """
        ``setup.py test`` command: run test/runtests.py with the current
        interpreter and exit with its return code.
    """
    user_options = []
    def initialize_options(self):
        pass
    def finalize_options(self):
        pass
    def run(self):
        import sys,subprocess
        # Run the child inside the test directory via cwd= instead of
        # os.chdir(), so this process never has a working directory to restore
        # (the old chdir was not undone if the call raised).
        status = subprocess.call([sys.executable, 'runtests.py'], cwd='test')
        raise SystemExit(status)
23 | 
def read(*filenames, **kwargs):
    """
    Return the contents of all *filenames* joined into one string.

    Keyword options: ``encoding`` used to decode each file (default
    'utf-8') and ``sep`` placed between the files (default newline).
    """
    sep = kwargs.get('sep', '\n')
    encoding = kwargs.get('encoding', 'utf-8')

    def slurp(path):
        with io.open(path, encoding=encoding) as handle:
            return handle.read()

    return sep.join([slurp(path) for path in filenames])
32 | 
# exclude takes a sequence of package-name patterns.  A bare string would be
# treated as one pattern per character, so pass a list instead.
packages = find_packages(exclude=["test", "test.*", "tests", "tests.*"])

long_description = read('README.rst', 'CHANGES.rst', 'TODO.rst')

# One requirement per line; blank lines and '#' comment lines are skipped
with open("requirements.txt") as f:
    required = [line.strip() for line in f.read().splitlines()
                if line.strip() and not line.strip().startswith('#')]
39 | 
# Package metadata and build configuration
setup(
    name="pypdfocr",
    version=__version__,
    description="Converts a scanned PDF into an OCR'ed pdf using Tesseract-OCR and Ghostscript",
    long_description=long_description,
    license="ASL 2.0",
    author="Virantha N. Ekanayake",
    author_email="virantha@gmail.com",
    packages=packages,
    package_data={'': ['*.xml']},
    include_package_data=True,
    zip_safe=True,
    install_requires=required,
    # Installs the "pypdfocr" command line script
    entry_points={
        'console_scripts': [
            'pypdfocr = pypdfocr.pypdfocr:main',
        ],
    },
    options={
        "pyinstaller": {"packages": packages},
    },
    # "python setup.py test" runs the PyTest command class
    cmdclass={'test': PyTest},
)
64 | 


--------------------------------------------------------------------------------
/test/pdfs/1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/pdfs/1.pdf


--------------------------------------------------------------------------------
/test/pdfs/test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/pdfs/test.pdf


--------------------------------------------------------------------------------
/test/pdfs/test_cinderella.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/pdfs/test_cinderella.pdf


--------------------------------------------------------------------------------
/test/pdfs/test_patent.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/pdfs/test_patent.pdf


--------------------------------------------------------------------------------
/test/pdfs/test_recipe.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/pdfs/test_recipe.pdf


--------------------------------------------------------------------------------
/test/pdfs/test_recipe_sideways.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/pdfs/test_recipe_sideways.pdf


--------------------------------------------------------------------------------
/test/pdfs/test_sherlock.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/pdfs/test_sherlock.pdf


--------------------------------------------------------------------------------
/test/pdfs/test_super_long_keyword.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/pdfs/test_super_long_keyword.pdf


--------------------------------------------------------------------------------
/test/temp/original/test_patent.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/temp/original/test_patent.pdf


--------------------------------------------------------------------------------
/test/temp/original/test_patent_1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/temp/original/test_patent_1.pdf


--------------------------------------------------------------------------------
/test/temp/original/test_recipe.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/temp/original/test_recipe.pdf


--------------------------------------------------------------------------------
/test/temp/original/test_recipe_1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/temp/original/test_recipe_1.pdf


--------------------------------------------------------------------------------
/test/temp/original/test_sherlock.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/temp/original/test_sherlock.pdf


--------------------------------------------------------------------------------
/test/temp/original/test_sherlock_1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/virantha/pypdfocr/acc5e13763224267e897865fccafbf51e13725e9/test/temp/original/test_sherlock_1.pdf


--------------------------------------------------------------------------------
/test/test_evernote.py:
--------------------------------------------------------------------------------
  1 | #from pypdfocr import PyPDFOCR as P
  2 | import pypdfocr.pypdfocr_filer_evernote as P
  3 | import pytest
  4 | import os
  5 | 
  6 | import evernote.api.client
  7 | import evernote.edam.type.ttypes as Types
  8 | import hashlib
  9 | 
 10 | from mock import patch, call
 11 | 
 12 | class TestEvernote:
 13 | 
 14 |     def test_connecct(self):
 15 |         # Tricky mocking.  Need to mock the EvernoteClient import in pypdfocr_filer_evernote.py file
 16 |         with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client:
 17 |             p = P.PyFilerEvernote("TOKEN")
 18 |             inst = mock_evernote_client.return_value
 19 |             assert(inst.get_user_store.called)
 20 | 
 21 |     @patch('shutil.move')
 22 |     def test_file_original(self, mock_move):
 23 |         with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client:
 24 |             p = P.PyFilerEvernote("TOKEN")
 25 |             filename = os.path.join("pdfs","test_recipe.pdf")
 26 | 
 27 |             # First, test code that does not move original
 28 |             p.file_original(filename)
 29 |             assert (not mock_move.called)
 30 | 
 31 |             # Now test moving
 32 |             p.set_original_move_folder(os.path.join("temp", "original"))
 33 |             p.file_original(filename)
 34 |             mock_move.assert_called_with(filename, os.path.join("temp","original", "test_recipe_2.pdf"))
 35 | 
 36 |     @patch('os.remove')
 37 |     def test_move_to_folder(self, mock_remove):
 38 |         with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client:
 39 |             p = P.PyFilerEvernote("TOKEN")
 40 | 	    filename = os.path.join("pdfs", "test_recipe.pdf")
 41 |             foldername = 'recipe'
 42 |             with pytest.raises(AssertionError):
 43 |                 p.move_to_matching_folder(filename, foldername)
 44 |             p.set_target_folder('target')
 45 |             with pytest.raises(AssertionError):
 46 |                 p.move_to_matching_folder(filename, foldername)
 47 |             p.set_default_folder('default')
 48 |             p.move_to_matching_folder(filename, None)
 49 |             p.move_to_matching_folder(filename, foldername)
 50 |             
 51 |             mock_client = mock_evernote_client.return_value
 52 |             assert(mock_client.get_note_store.called)
 53 |             assert(mock_client.get_note_store.return_value.createNote.called)
 54 |             mock_remove.assert_called_with(filename)
 55 | 
 56 |             
 57 | 
 58 | 
 59 |     def test_create_note(self):
 60 |         with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client:
 61 |             p = P.PyFilerEvernote("TOKEN")
 62 |             notebook = Types.Notebook()
 63 |             notebook.name = "recipe"
 64 |             filename = "pdfs/test_recipe.pdf"
 65 |             note = p._create_evernote_note(notebook, filename)
 66 |             xml = '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">'
 67 |             assert(note.content.startswith(xml))
 68 | 
 69 |             md5 = hashlib.md5()
 70 |             with open(filename,'rb') as f: 
 71 |                 pdf_bytes = f.read()
 72 |                 md5.update(pdf_bytes)
 73 | 
 74 |             md5hash = md5.hexdigest()
 75 |             
 76 |             assert(md5hash in note.content)
 77 |             assert(note.resources[0].data.bodyHash == md5hash)
 78 | 
 79 | 
 80 |     def test_check_notebook(self):
 81 |         with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client:
 82 |             p = P.PyFilerEvernote("TOKEN")
 83 |             p._check_and_make_notebook("new_notebook")
 84 |             # Let's assert that we tried to create a new notebook
 85 |             mock_client = mock_evernote_client.return_value
 86 |             assert(mock_client.get_note_store.called)
 87 |             create_func = mock_client.get_note_store.return_value.createNotebook
 88 |             update_func = mock_client.get_note_store.return_value.updateNotebook
 89 |             assert(create_func.called)
 90 |             assert(not update_func.called)
 91 |             notebook = create_func.call_args[0][0]
 92 |             assert(notebook.name == 'new_notebook')
 93 | 
 94 |             # Now, let's setup a value for the notebooks, so we test the code for
 95 |             # a "pre-exisiting" notebook
 96 |             test_notebook = Types.Notebook()
 97 |             test_notebook.name = "new_notebook"
 98 |             mock_client.get_note_store.return_value.listNotebooks.return_value = [test_notebook]
 99 |             p._check_and_make_notebook("new_notebook")
100 | 
101 |             # Now check that the code to update a notebook stack is correct
102 |             test_notebook.stack = "new_stack"
103 |             update_func = mock_client.get_note_store.return_value.updateNotebook
104 |             p.set_target_folder("Boogie")
105 |             p._check_and_make_notebook("new_notebook")
106 |             # Check that the update call was called with correct arguments
107 |             assert(update_func.called)
108 |             notebook = update_func.call_args[0][0]
109 |             assert(notebook.stack == 'Boogie')
110 |             
111 | 
112 |     def test_add_folder_target(self):
113 |         with patch("pypdfocr.pypdfocr_filer_evernote.EvernoteClient") as mock_evernote_client:
114 |             p = P.PyFilerEvernote("TOKEN")
115 |             p.add_folder_target("folder1", ["target1", "target2"])
116 |             with pytest.raises(AssertionError):
117 |                 p.add_folder_target("folder1", ["target1", "target2"])
118 |             p.add_folder_target("folder2", ["target1", "target2"])
119 |             assert("folder1" in p.folder_targets.keys())
120 |             assert("folder2" in p.folder_targets.keys())
121 | 
122 |                     
123 |     
124 |         
125 | 


--------------------------------------------------------------------------------
/test/test_gs.py:
--------------------------------------------------------------------------------
 1 | #from pypdfocr import PyPDFOCR as P
 2 | import pypdfocr.pypdfocr_gs as P
 3 | import pytest
 4 | import os
 5 | 
 6 | import hashlib
 7 | 
 8 | from mock import patch, call
 9 | from pytest import skip
10 | 
class TestGS:
    """
        Tests for the ghostscript wrapper PyGs
    """

    @pytest.mark.skipif(os.name!='nt', reason="Not on NT")
    @patch('os.name')
    @patch('subprocess.check_output')
    def test_gs_set_nt(self, mock_subprocess, mock_os_name):
        """
            On windows the chosen binary should be a gswin executable
        """
        mock_os_name.__str__.return_value = 'nt'
        gs = P.PyGs({})

        assert 'gswin' in gs.binary

    @pytest.mark.skipif(os.name!='nt', reason="Not on NT")
    @patch('os.name')
    @patch('subprocess.call')
    def test_gs_run_nt(self, mock_subprocess, mock_os_name, capsys):
        """
            A failing ghostscript call should exit and print the GS_FAILED message
        """
        mock_os_name.__str__.return_value = 'nt'
        gs = P.PyGs({})

        gs.binary = 'gsblah.exe'
        mock_subprocess.return_value = -1
        with pytest.raises(SystemExit):
            gs._run_gs("","","")

        captured = capsys.readouterr()
        stdout_text = captured[0]
        assert gs.msgs['GS_FAILED'] in stdout_text

    def test_gs_pdf_missing(self, capsys):
        """
            A missing input pdf should exit and print the GS_MISSING_PDF message
        """
        gs = P.PyGs({})
        with pytest.raises(SystemExit):
            gs.make_img_from_pdf("missing123.pdf")
        captured = capsys.readouterr()
        stdout_text = captured[0]
        assert gs.msgs['GS_MISSING_PDF'] in stdout_text
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/test/test_option_config.yaml:
--------------------------------------------------------------------------------
1 | target_folder: "blah"
2 | 


--------------------------------------------------------------------------------
/test/test_option_parsing.py:
--------------------------------------------------------------------------------
  1 | #from pypdfocr import PyPDFOCR as P
  2 | import pypdfocr.pypdfocr as P
  3 | import pytest
  4 | 
  5 | 
  6 | class TestOptions:
  7 | 
  8 |     def setup(self):
  9 |         self.p = P.PyPDFOCR()
 10 | 
 11 | 
 12 |     def test_standalone(self):
 13 |         opts = ["blah.pdf"]
 14 |         self.p.get_options(opts)
 15 | 
 16 |         opts.append('-d')
 17 |         self.p.get_options(opts)
 18 |         assert(self.p.debug)
 19 | 
 20 |         opts.append('-v')
 21 |         self.p.get_options(opts)
 22 |         assert(self.p.verbose)
 23 | 
 24 |         opts.append('--preprocess')
 25 |         self.p.get_options(opts)
 26 |         assert(not self.p.skip_preprocess)
 27 | 
 28 |         assert(not self.p.enable_filing)
 29 |         assert(self.p.config == {})
 30 | 
 31 |     def test_standalone_filing(self):
 32 |         opts = ["blah.pdf"]
 33 |         opts.append('-f')
 34 | 
 35 |         # Assert that filing option requires a config file
 36 |         with pytest.raises(SystemExit):
 37 |             self.p.get_options(opts)
 38 | 
 39 |         # Assert that it checks that the config file is present
 40 |         opts.append('--config=test_option_config.yaml')
 41 |         self.p.get_options(opts)
 42 |         assert(self.p.enable_filing)
 43 |         assert(self.p.config)
 44 | 
 45 |     def test_standalone_filing_evernote(self):
 46 |         # Check when evernote is enabled
 47 |         opts = ["blah.pdf"]
 48 |         opts.append('-e')
 49 |         # Assert that it checks that the config file is present
 50 |         with pytest.raises(SystemExit):
 51 |             self.p.get_options(opts)
 52 | 
 53 |         opts.append('--config=test_option_config.yaml')
 54 |         self.p.get_options(opts)
 55 |         # Enabling -e should turn on filing too
 56 |         assert(self.p.enable_filing)
 57 |         assert(self.p.enable_evernote)
 58 |         assert(self.p.config)
 59 |         assert(not self.p.watch)
 60 | 
 61 |         opts.append('-f')
 62 |         self.p.get_options(opts)
 63 |         assert(self.p.enable_filing)
 64 |         assert(self.p.enable_evernote)
 65 |         assert(self.p.config)
 66 |         assert(not self.p.watch)
 67 | 
 68 |     def test_standalone_watch_conflict(self):
 69 |         # When pdf file is specified, we don't want to allow watch option
 70 |         opts = ["blah.pdf", '-w']
 71 |         with pytest.raises(SystemExit):
 72 |             self.p.get_options(opts)
 73 |                 
 74 |     def test_watch_filing(self):
 75 |         opts = ['-w']
 76 |         # Catch watch without a dir
 77 |         with pytest.raises(SystemExit):
 78 |             self.p.get_options(opts)
 79 | 
 80 |         opts = ['-w temp']
 81 |         self.p.get_options(opts)
 82 |         assert(self.p.watch_dir)
 83 | 
 84 |         opts.append('--config=test_option_config.yaml')
 85 |         self.p.get_options(opts)
 86 |         assert(self.p.watch)
 87 |         assert(self.p.config)
 88 |         assert(not self.p.enable_filing)
 89 |         assert(not self.p.enable_evernote)
 90 | 
 91 |     def test_watch_filing_evernote(self):
 92 |         opts = ['-w temp', '-e', '--config=test_option_config.yaml']
 93 |         self.p.get_options(opts)
 94 |         assert(self.p.watch)
 95 |         assert(self.p.config)
 96 |         assert(self.p.enable_filing)
 97 |         assert(self.p.enable_evernote)
 98 | 
 99 |         opts = ['-w temp', '-f', '-e',  '--config=test_option_config.yaml']
100 |         self.p.get_options(opts)
101 |         assert(self.p.watch)
102 |         assert(self.p.config)
103 |         assert(self.p.enable_filing)
104 |         assert(self.p.enable_evernote)
105 | 
106 | 


--------------------------------------------------------------------------------
/test/test_pdf_filer.py:
--------------------------------------------------------------------------------
 1 | #from pypdfocr import PyPDFOCR as P
 2 | import pypdfocr.pypdfocr as P
 3 | import pytest
 4 | import os
 5 | 
 6 | import hashlib
 7 | 
 8 | from mock import patch, call
 9 | from pytest import skip
10 | 
class TestPDFFiler:
    """Filing test where the target folder is chosen from the pdf's filename."""

    @patch('shutil.move')
    def test_file_by_filename(self, mock_move):
        """
            File one pdf by a keyword in its filename.

            ``shutil.move`` is patched, so nothing is actually moved; the test
            only checks which move was requested.
        """
        ocr = P.PyPDFOCR()
        start_dir = os.getcwd()
        src_pdf = os.path.join("pdfs", "test_super_long_keyword.pdf")
        ocr_pdf = src_pdf.replace(".pdf", "_ocr.pdf")

        # Get rid of a stale result so the existence check below is meaningful
        if os.path.exists(ocr_pdf):
            os.remove(ocr_pdf)

        print("Current directory: %s" % start_dir)
        ocr.go([src_pdf, "--config=test_pypdfocr_config_filename.yaml", "-f", "-n"])

        assert os.path.exists(ocr_pdf)
        os.remove(ocr_pdf)

        # The OCR'd file must have been "moved" into the recipe folder
        expected_dest = os.path.abspath(
            os.path.join('temp', 'target', 'recipe', os.path.basename(ocr_pdf)))
        mock_move.assert_has_calls([call(ocr_pdf, expected_dest)])
38 | 
39 | 
40 | 
41 |         
42 | 


--------------------------------------------------------------------------------
/test/test_pypdfocr.py:
--------------------------------------------------------------------------------
  1 | #from pypdfocr import PyPDFOCR as P
  2 | import pypdfocr.pypdfocr as P
  3 | import pytest
  4 | import os
  5 | import logging
  6 | 
  7 | from PyPDF2 import PdfFileReader
  8 | import smtplib
  9 | from mock import Mock
 10 | from mock import patch, call
 11 | from mock import MagicMock
 12 | from mock import PropertyMock
 13 | 
 14 | 
 15 | class TestPydfocr:
 16 | 
 17 |     def setup(self):
 18 |         self.p = P.PyPDFOCR()
 19 | 
 20 |     def _iter_pdf(self, filename):
 21 |         with open(filename, 'rb') as f:
 22 |             reader = PdfFileReader(f)
 23 |             logging.debug("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename))
 24 |             for pgnum in range(reader.getNumPages()):
 25 |                 text = reader.getPage(pgnum).extractText()
 26 |                 text = text.encode('ascii', 'ignore')
 27 |                 text = text.replace('\n', ' ')
 28 |                 yield text
 29 |     
 30 |     pdf_tests = [
 31 |             (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "test_recipe.pdf"), [ ["Simply Recipes"],
 32 |                                  ]),
 33 |         (".", os.path.join("temp","target","patents"), os.path.join("pdfs","test_patent.pdf"), [ 
 34 |                            ["asynchronous", "subject to", "20 Claims"], # Page 1
 35 |                            ["FOREIGN PATENT" ], # Page 2
 36 |                             ]),
 37 |         (".", os.path.join("temp","target", "default"), os.path.join("pdfs","test_sherlock.pdf"), [ ["Bohemia", "Trincomalee"], # Page 1
 38 |                            ["hundreds of times" ], # Page 2
 39 |                            ]),
 40 |         ("pdfs", os.path.join("temp","target","default"), "test_sherlock.pdf", [ ["Bohemia", "Trincomalee"], # Page 1
 41 |                            ["hundreds of times" ], # Page 2
 42 |                            ]),
 43 |             (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "1.pdf"), [ ["Simply","Recipes"],
 44 |                                  ]),
 45 |             (".", os.path.join("temp","target","recipe"), os.path.join("..","test", "pdfs", "test_recipe_sideways.pdf"), [ ["Simply","Recipes", 'spinach'],
 46 |                                  ]),
 47 |         ]
 48 | 
 49 |     #@pytest.mark.skipif(True, reason="Just testing")
 50 |     @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", pdf_tests)
 51 |     def test_standalone(self, dirname, tgt_folder, filename, expected):
 52 |         """
 53 |             Test the single file conversion with no filing.  
 54 |             Tests relative paths (".."), files in subirs, and files in current dir
 55 |             Checks for that _ocr file is created and keywords found in pdf.
 56 |             Modify :attribute:`pdf_tests` for changing keywords, etc
 57 | 
 58 |             :param expected: List of keywords lists per page.  expected[0][1] is the second keyword to assert on page 1
 59 |         """
 60 |         # Run a single file conversion
 61 | 
 62 |         # First redo the unix-style paths, in case we're running on windows
 63 |         # Assume paths in unix-style
 64 |         dirname = os.path.join(*(dirname.split("/")))
 65 |         tgt_folder = os.path.join(*(tgt_folder.split("/")))
 66 |         filename = os.path.join(*(filename.split("/")))
 67 | 
 68 | 
 69 |         cwd = os.getcwd()
 70 |         os.chdir(dirname)
 71 |         opts = [filename, '--skip-preprocess']
 72 |         self.p.go(opts)
 73 | 
 74 |         out_filename = filename.replace(".pdf", "_ocr.pdf")
 75 |         assert(os.path.exists(out_filename))
 76 |         for i,t in enumerate(self._iter_pdf(out_filename)):
 77 |             if len(expected) > i:
 78 |                 for keyword in expected[i]:
 79 |                     assert(keyword in t)
 80 |             print ("\n----------------------\nPage %d\n" % i)
 81 |             print t
 82 |         os.remove(out_filename)
 83 |         os.chdir(cwd)
 84 | 
 85 |     #@pytest.mark.skipif(True, reason="just testing")
 86 |     @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", [pdf_tests[0]])
 87 |     def test_standalone_email(self, dirname, tgt_folder, filename, expected):
 88 |         """
 89 |             Get coverage on the email after conversion of a single file.
 90 |             Use mock to stub out the smtpllib
 91 |         """
 92 |         # Run a single file conversion
 93 | 
 94 |         # Mock the smtplib to test the email functions
 95 |         with patch("smtplib.SMTP") as mock_smtp:
 96 |             cwd = os.getcwd()
 97 |             os.chdir(dirname)
 98 |             opts = [filename, "--preprocess", "--config=test_pypdfocr_config.yaml", "-m"]
 99 |             self.p.go(opts)
100 | 
101 |             out_filename = filename.replace(".pdf", "_ocr.pdf")
102 |             assert(os.path.exists(out_filename))
103 |             for i,t in enumerate(self._iter_pdf(out_filename)):
104 |                 if len(expected) > i:
105 |                     for keyword in expected[i]:
106 |                         assert(keyword in t)
107 |                 print ("\n----------------------\nPage %d\n" % i)
108 |                 print t
109 |             os.remove(out_filename)
110 |             os.chdir(cwd)
111 |             
112 |             # Assert the smtp calls
113 |             instance = mock_smtp.return_value
114 |             assert(instance.starttls.called)
115 |             instance.login.assert_called_once_with("someone@gmail.com", "blah")
116 |             assert(instance.sendmail.called)
117 | 
118 |     @patch('shutil.move')
119 |     @pytest.mark.parametrize("config", [("test_pypdfocr_config.yaml"), ("test_pypdfocr_config_no_move_original.yaml")])
120 |     @pytest.mark.parametrize("dirname, tgt_folder, filename, expected", pdf_tests[0:3])
121 |     def test_standalone_filing(self, mock_move, config, dirname, tgt_folder, filename, expected):
122 |         """
123 |             Test filing of single pdf.  Also test moving of original file.
124 | 
125 |             Kind of hacked up right now, but it tries to test a lot of things (maybe too many)
126 |         """
127 | 
128 |         # Mock the move function so we don't actually end up filing
129 |         cwd = os.getcwd()
130 |         if os.path.exists("temp"):
131 |             os.chdir("temp")
132 |             for d in [os.path.join('target', 'patents'), os.path.join('target','recipe')]:
133 |                 if os.path.exists(d):
134 |                     os.removedirs(d)
135 |             os.chdir(cwd)
136 | 
137 |         os.chdir(dirname)
138 |         print("Current direcxtory: %s" % os.getcwd())
139 |         #opts = [filename, "--config=test_pypdfocr_config.yaml", "-f"]
140 |         opts = [filename, '--skip-preprocess', "--config=%s" % config, "-f"]
141 |         self.p.go(opts)
142 | 
143 |         out_filename = filename.replace(".pdf", "_ocr.pdf")
144 |         assert(os.path.exists(out_filename))
145 |         for i,t in enumerate(self._iter_pdf(out_filename)):
146 |             if len(expected) > i:
147 |                 for keyword in expected[i]:
148 |                     assert(keyword in t)
149 |             print ("\n----------------------\nPage %d\n" % i)
150 |             print t
151 |         os.remove(out_filename)
152 |         os.chdir(cwd)
153 |         
154 |         # Assert the smtp calls
155 |         calls = [call(out_filename,
156 |                         os.path.abspath(os.path.join(tgt_folder,os.path.basename(out_filename))))]
157 |         if not "no_move_original" in config:
158 |             new_file_name = os.path.basename(filename).replace(".pdf", "_2.pdf")
159 |             calls.append(call(filename,
160 |                                 os.path.abspath(os.path.join("temp","original", new_file_name))))
161 |         mock_move.assert_has_calls(calls)
162 | 
163 |     def test_set_binaries(self):
164 |         """ Test the setup_exteral_tools
165 |         """
166 |         self.p.config = {}
167 |         self.p.config["tesseract"] = {"binary":"/usr/bin/tesseract"}
168 |         self.p.config["ghostscript"] = {"binary":"/usr/bin/ghostscript"}
169 |         self.p._setup_external_tools()
170 |         if not os.name == 'nt':
171 |             assert(self.p.ts.binary == "/usr/bin/tesseract")
172 |             assert(self.p.gs.binary == "/usr/bin/ghostscript")
173 |         else:
174 |             assert(self.p.ts.binary == '"/usr/bin/tesseract"')
175 |             assert(self.p.gs.binary == '"/usr/bin/ghostscript"')
176 | 
177 | 
178 | 


--------------------------------------------------------------------------------
/test/test_pypdfocr_config.yaml:
--------------------------------------------------------------------------------
# Test fixture: configuration used by the filing and e-mail tests in
# test_pypdfocr.py.  All folders are relative paths.

# Root of the filing tree; the tests expect the OCR'd pdf to be moved to
# <target_folder>/<folder name>/.
target_folder: "temp/target"
# Where the tests expect a pdf that matches none of the folders below.
default_folder: "temp/target/default"
# With this key present the tests expect the original pdf to be moved here
# after conversion.
original_move_folder: "temp/original"

# E-mail settings; the e-mail test mocks smtplib and asserts a login with
# exactly this login/password pair.
mail_smtp_server: "smtp.gmail.com:587"
mail_smtp_login: "someone@gmail.com"
mail_smtp_password: "blah"
# NOTE(review): '#' where an '@' would be expected -- possibly a typo; the
# tests never assert on this value.
mail_from_addr: "someone#gmail.com"
mail_to_list: 
    - "someone@gmail.com"

# Folder name -> list of keywords.  Presumably a pdf containing one of the
# keywords is filed into that folder (the tests expect the recipe pdf in
# "recipe" and the patent pdf in "patents") -- confirm against the filer.
folders:
    recipe:
        - recipes
    patents:
        - patent
        # NOTE(review): unquoted, so YAML loads this as an integer, not a string.
        - 2003
18 | 
19 | 


--------------------------------------------------------------------------------
/test/test_pypdfocr_config_filename.yaml:
--------------------------------------------------------------------------------
# Test fixture: configuration used by test_pdf_filer.py, which files
# pdfs/test_super_long_keyword.pdf and expects it under temp/target/recipe.

# Root of the filing tree.
target_folder: "temp/target"
# Folder for pdfs that match none of the folders below (presumably -- confirm
# against the filer).
default_folder: "temp/target/default"

# E-mail settings (not exercised by test_pdf_filer.py).
mail_smtp_server: "smtp.gmail.com:587"
mail_smtp_login: "someone@gmail.com"
mail_smtp_password: "blah"
# NOTE(review): '#' where an '@' would be expected -- possibly a typo.
mail_from_addr: "someone#gmail.com"
mail_to_list: 
    - "someone@gmail.com"

# Folder name -> list of keywords.  The single keyword is part of the test
# pdf's filename, which is what the filename-based filing test relies on.
folders:
    recipe:
        - super_long_keyword
14 | 
15 | 


--------------------------------------------------------------------------------
/test/test_pypdfocr_config_no_move_original.yaml:
--------------------------------------------------------------------------------
# Test fixture: same filing setup as test_pypdfocr_config.yaml but WITHOUT an
# original_move_folder key; for this config the filing test does not expect
# the original pdf to be moved.

# Root of the filing tree; the tests expect the OCR'd pdf to be moved to
# <target_folder>/<folder name>/.
target_folder: "temp/target"
# Where the tests expect a pdf that matches none of the folders below.
default_folder: "temp/target/default"

# E-mail settings (the filing test does not send mail).
mail_smtp_server: "smtp.gmail.com:587"
mail_smtp_login: "someone@gmail.com"
mail_smtp_password: "blah"
# NOTE(review): '#' where an '@' would be expected -- possibly a typo.
mail_from_addr: "someone#gmail.com"
mail_to_list: 
    - "someone@gmail.com"

# Folder name -> list of keywords (presumably matched against the pdf's
# content -- confirm against the filer).
folders:
    recipe:
        - recipes
    patents:
        - patent
16 | 
17 | 


--------------------------------------------------------------------------------
/test/test_tesseract.py:
--------------------------------------------------------------------------------
  1 | #from pypdfocr import PyPDFOCR as P
  2 | import pypdfocr.pypdfocr_tesseract as P
  3 | import pytest
  4 | import os
  5 | 
  6 | import hashlib
  7 | 
  8 | from mock import patch, call
  9 | 
 10 | class TestTesseract:
 11 | 
 12 |     @pytest.mark.skipif(os.name=='nt', reason='Does not work on Windows')
 13 |     def test_version_shorter_older(self):
 14 |         with patch("subprocess.check_output") as mock_subprocess:
 15 |             p = P.PyTesseract({})
 16 |             p.required = "3.02.02"
 17 |             mock_subprocess.return_value = """tesseract 3.02"""
 18 |             uptodate,ver = p._is_version_uptodate()
 19 |             assert (not uptodate)
 20 | 
 21 |     def test_version_minor_older(self):
 22 |         with patch("subprocess.check_output") as mock_subprocess:
 23 |             p = P.PyTesseract({})
 24 |             p.required = "3.02.02"
 25 |             mock_subprocess.return_value = """tesseract 3.02.01"""
 26 |             uptodate,ver = p._is_version_uptodate()
 27 |             assert (not uptodate)
 28 | 
 29 |     def test_version_major_older(self):
 30 |         with patch("subprocess.check_output") as mock_subprocess:
 31 |             p = P.PyTesseract({})
 32 |             p.required = "3.02.02"
 33 |             mock_subprocess.return_value = """tesseract 2.03.03"""
 34 |             uptodate,ver = p._is_version_uptodate()
 35 |             assert (not uptodate)
 36 | 
 37 |     @pytest.mark.skipif(os.name=='nt', reason='Does not work on Windows')
 38 |     def test_version_major_equal(self):
 39 |         with patch("subprocess.check_output") as mock_subprocess:
 40 |             p = P.PyTesseract({})
 41 |             p.required = "3.02.02"
 42 |             mock_subprocess.return_value = """tesseract 3.02.02"""
 43 |             uptodate,ver = p._is_version_uptodate()
 44 |             assert (uptodate)
 45 | 
 46 |     def test_version_major_newer(self):
 47 |         with patch("subprocess.check_output") as mock_subprocess:
 48 |             p = P.PyTesseract({})
 49 |             p.required = "3.02.02"
 50 | 
 51 |             mock_subprocess.return_value = """tesseract 4.01"""
 52 |             uptodate,ver = p._is_version_uptodate()
 53 |             assert (uptodate)
 54 | 
 55 |     def test_version_minor_newer(self):
 56 |         with patch("subprocess.check_output") as mock_subprocess:
 57 |             p = P.PyTesseract({})
 58 |             p.required = "3.01.02"
 59 | 
 60 |             mock_subprocess.return_value = """tesseract 3.02"""
 61 |             uptodate,ver = p._is_version_uptodate()
 62 |             assert (uptodate)
 63 | 
 64 | 
 65 |     def test_tesseract_presence(self, capsys):
 66 |         p = P.PyTesseract({})
 67 |         p.binary = "tesserac" # Misspell it and make sure we get an error
 68 |         with pytest.raises(SystemExit):
 69 |             p._is_version_uptodate()
 70 |         out, err = capsys.readouterr()
 71 |         assert p.msgs['TS_MISSING'] in out
 72 | 
 73 |     def test_tesseract_version(self, capsys):
 74 |         p = P.PyTesseract({})
 75 |         p.required = "100"
 76 |         with pytest.raises(SystemExit):
 77 |             p.make_hocr_from_pnms("")
 78 |         out, err = capsys.readouterr()
 79 |         assert p.msgs['TS_VERSION'] in out
 80 | 
 81 |     def test_tiff_file_check(self, capsys):
 82 |         p = P.PyTesseract({})
 83 |         with pytest.raises(SystemExit):
 84 |             p.make_hocr_from_pnm("DUMMY_NOTPRESENT.tiff")
 85 |         out, err = capsys.readouterr()
 86 |         assert p.msgs['TS_img_MISSING'] in out
 87 | 
 88 |     @patch('os.name')
 89 |     @patch('subprocess.check_output')
 90 |     def test_tesseract_version_nt(self, mock_subprocess, mock_os_name):
 91 |         """
 92 |             Stupid test because Windows Tesseract only returns 3.02 instead of 3.02.02
 93 |         """
 94 |         mock_os_name.__str__.return_value = 'nt'
 95 |         p = P.PyTesseract({})
 96 |         p.required = "3.02.02"
 97 | 
 98 |         mock_subprocess.return_value = """tesseract 3.02"""
 99 |         uptodate,ver = p._is_version_uptodate()
100 |         assert (uptodate)
101 | 
102 |     @patch('pypdfocr.pypdfocr_tesseract.PyTesseract._is_version_uptodate')
103 |     @patch('pypdfocr.pypdfocr_tesseract.os.name')
104 |     @patch('pypdfocr.pypdfocr_tesseract.os.path.exists')
105 |     def test_force_Nt(self, mock_os_path_exists, mock_os_name, mock_uptodate, capsys):
106 |         mock_os_name.__str__.return_value = 'nt'
107 |         p = P.PyTesseract({})
108 |         assert ('tesseract.exe' in p.binary)
109 | 
110 |         mock_os_path_exists.return_value = True 
111 |         mock_uptodate.return_value = (True,"")
112 |         # force a bad tesseract on windows
113 |         p.binary = "blah"
114 |         print("here")
115 |         with pytest.raises(SystemExit):
116 |             p.make_hocr_from_pnm('blah.tiff')
117 | 
118 |     @patch('pypdfocr.pypdfocr_tesseract.subprocess.call')
119 |     @patch('pypdfocr.pypdfocr_tesseract.PyTesseract._is_version_uptodate')
120 |     @patch('pypdfocr.pypdfocr_tesseract.os.name')
121 |     @patch('pypdfocr.pypdfocr_tesseract.os.path.exists')
122 |     def test_tesseract_fail(self, mock_os_path_exists, mock_os_name, mock_uptodate, mock_subprocess_call,capsys):
123 |         """
124 |             Get all the checks past and make sure we report the case where tesseract returns a non-zero status
125 |         """
126 |         mock_os_name.__str__.return_value = 'nt'
127 |         p = P.PyTesseract({})
128 |         assert ('tesseract.exe' in p.binary)
129 | 
130 |         mock_os_path_exists.return_value = True 
131 |         mock_uptodate.return_value = (True,"")
132 |         mock_subprocess_call.return_value = -1
133 |         with pytest.raises(SystemExit):
134 |             p.make_hocr_from_pnm('blah.tiff')
135 | 
136 |         out, err = capsys.readouterr()
137 |         assert p.msgs['TS_FAILED'] in out
138 | 
139 | 


--------------------------------------------------------------------------------
/test/test_watcher.py:
--------------------------------------------------------------------------------
 1 | #from pypdfocr import PyPDFOCR as P
 2 | import pypdfocr.pypdfocr_watcher as P
 3 | import pytest
 4 | 
 5 | import evernote.api.client
 6 | import evernote.edam.type.ttypes as Types
 7 | import hashlib
 8 | import time
 9 | import os
10 | from collections import namedtuple
11 | 
12 | from mock import patch, call
13 | 
class TestWatching:
    """Tests for the directory watcher's filename handling and event queue."""

    # (filename, expected) pairs: spaces in the file's basename are expected
    # to become underscores, while spaces in directory names are left alone.
    filenames = [
        ("test_recipe.pdf", "test_recipe.pdf"),
        (os.path.join("..", "test_recipe.pdf"), os.path.join("..", "test_recipe.pdf")),
        (os.path.join("/", "Volumes", "Media", "test_recipe.pdf"), os.path.join("/", "Volumes", "Media", "test_recipe.pdf")),
        (os.path.join("/", "Volumes", "Media", "test recipe.pdf"), os.path.join("/", "Volumes", "Media", "test_recipe.pdf")),
        (os.path.join("..", "V olumes", "Media", "test recipe.pdf"), os.path.join("..", "V olumes", "Media", "test_recipe.pdf")),
    ]

    @patch('shutil.move')
    @pytest.mark.parametrize("filename, expected", filenames)
    def test_rename(self, mock_move, filename, expected):
        """``rename_file_with_spaces`` returns the space-free name.

        ``shutil.move`` is patched so that no file is actually renamed.
        """
        # An expected value of None means "name comes back unchanged"
        if expected is None:
            expected = filename

        p = P.PyPdfWatcher('temp', {})

        ret = p.rename_file_with_spaces(filename)
        assert ret == expected

    def test_check_for_new_pdf(self):
        """Walk one pdf through the queue: ignore, add, drop, re-add, refresh."""
        p = P.PyPdfWatcher('temp', {})

        # Already converted files are ignored
        p.check_for_new_pdf("blah_ocr.pdf")
        assert "blah_ocr.pdf" not in p.events

        # A new pdf is queued
        p.check_for_new_pdf("blah.pdf")
        assert "blah.pdf" in p.events

        # An entry marked -1 is dropped on the next event
        p.events['blah.pdf'] = -1
        p.check_for_new_pdf("blah.pdf")
        assert "blah.pdf" not in p.events

        # Queue it again, wait, and trigger another event for the same file
        p.check_for_new_pdf("blah.pdf")
        time.sleep(p.scan_interval+1)
        p.check_for_new_pdf("blah.pdf")
        # Check that time stamp was updated: it must be (almost) "now".
        # now - stamp is the meaningful direction; stamp - now is never
        # positive and would pass even for a stale time stamp.
        assert time.time() - p.events['blah.pdf'] <= 1

    def test_events(self):
        """created/moved/modified events all queue the relevant path."""
        p = P.PyPdfWatcher('temp', {})

        # Minimal stand-in for the watcher's event objects
        event = namedtuple('event', 'src_path, dest_path')

        p.on_created(event(src_path='temp_recipe.pdf', dest_path=None))
        assert 'temp_recipe.pdf' in p.events

        # For a move it is the destination that gets queued
        p.on_moved(event(src_path=None, dest_path='temp_recipe2.pdf'))
        assert 'temp_recipe2.pdf' in p.events

        p.on_modified(event(src_path='temp_recipe3.pdf', dest_path=None))
        assert 'temp_recipe3.pdf' in p.events

    def test_check_queue(self):
        """``check_queue`` releases a file only after ``scan_interval`` has passed."""
        p = P.PyPdfWatcher('temp', {})
        p.events['blah.pdf'] = time.time()

        # Too fresh: nothing is returned and the entry stays queued
        f = p.check_queue()
        assert not f
        assert 'blah.pdf' in p.events

        # Old enough: the file is returned and its entry marked with -1
        time.sleep(p.scan_interval+1)
        f = p.check_queue()
        assert f == 'blah.pdf'
        assert 'blah.pdf' in p.events
        assert p.events['blah.pdf'] == -1

        # The next pass removes the marked entry
        f = p.check_queue()
        assert 'blah.pdf' not in p.events
80 | 
81 | 


--------------------------------------------------------------------------------