├── .appveyor.yml
├── MIT-LICENSE
├── README.md
├── codecov.yml
├── composer.json
└── src
├── Command.php
├── FeatureNotAvailableException.php
├── FriendlyErrors.php
├── ImageNotFoundException.php
├── NoWritePermissionsForOutputFile.php
├── Option.php
├── Process.php
├── TesseractNotFoundException.php
├── TesseractOCR.php
├── TesseractOcrException.php
└── UnsuccessfulCommandException.php
/.appveyor.yml:
--------------------------------------------------------------------------------
1 | ---
2 | build: false
3 |
4 | install:
5 | - ps: Set-Service wuauserv -StartupType Manual
6 | - choco install php
7 | - choco install capture2text --version 3.9
8 | - choco install composer
9 | - refreshenv
10 | - cd %APPVEYOR_BUILD_FOLDER%
11 | - composer install
12 |
13 | test_script:
14 | - php tests\run.php unit e2e
15 |
--------------------------------------------------------------------------------
/MIT-LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2012-2021 Thiago Alessio Pereira
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tesseract OCR for PHP
2 |
3 | A wrapper to work with Tesseract OCR inside PHP.
4 |
5 | [![CI][ci_badge]][ci]
6 | [![AppVeyor][appveyor_badge]][appveyor]
7 | [![Codacy][codacy_badge]][codacy]
8 | [![Test Coverage][test_coverage_badge]][test_coverage]
9 |
10 | [![Latest Stable Version][stable_version_badge]][packagist]
11 | [![Total Downloads][total_downloads_badge]][packagist]
12 | [![Monthly Downloads][monthly_downloads_badge]][packagist]
13 |
14 | ## Installation
15 |
16 | Via [Composer][]:
17 |
18 | $ composer require thiagoalessio/tesseract_ocr
19 |
20 | :bangbang: **This library depends on [Tesseract OCR][], version _3.02_ or later.**
21 |
22 |
23 |
24 | ### ![][windows_icon] Note for Windows users
25 |
26 | There are [many ways][tesseract_installation_on_windows] to install
27 | [Tesseract OCR][] on your system, but if you just want something quick to
28 | get up and running, I recommend installing the [Capture2Text][] package with
29 | [Chocolatey][].
30 |
31 | choco install capture2text --version 3.9
32 |
33 | :warning: Recent versions of [Capture2Text][] stopped shipping the `tesseract` binary.
34 |
35 |
36 |
37 | ### ![][macos_icon] Note for macOS users
38 |
39 | With [MacPorts][] you can install support for individual languages, like so:
40 |
41 | $ sudo port install tesseract-<langcode>
42 |
43 | But that is not possible with [Homebrew][]. It comes only with **English** support
44 | by default, so if you intend to use it for other languages, the quickest solution
45 | is to install them all:
46 |
47 | $ brew install tesseract tesseract-lang
48 |
49 |
50 |
51 | ## Usage
52 |
53 | ### Basic usage
54 |
55 |
56 |
57 | ```php
58 | use thiagoalessio\TesseractOCR\TesseractOCR;
59 | echo (new TesseractOCR('text.png'))
60 | ->run();
61 | ```
62 |
63 | ```
64 | The quick brown fox
65 | jumps over
66 | the lazy dog.
67 | ```
68 |
69 |
70 |
71 | ### Other languages
72 |
73 |
74 |
75 | ```php
76 | use thiagoalessio\TesseractOCR\TesseractOCR;
77 | echo (new TesseractOCR('german.png'))
78 | ->lang('deu')
79 | ->run();
80 | ```
81 |
82 | ```
83 | Bülowstraße
84 | ```
85 |
86 |
87 |
88 | ### Multiple languages
89 |
90 |
91 |
92 | ```php
93 | use thiagoalessio\TesseractOCR\TesseractOCR;
94 | echo (new TesseractOCR('mixed-languages.png'))
95 | ->lang('eng', 'jpn', 'spa')
96 | ->run();
97 | ```
98 |
99 | ```
100 | I eat すし y Pollo
101 | ```
102 |
103 |
104 |
105 | ### Inducing recognition
106 |
107 |
108 |
109 | ```php
110 | use thiagoalessio\TesseractOCR\TesseractOCR;
111 | echo (new TesseractOCR('8055.png'))
112 | ->allowlist(range('A', 'Z'))
113 | ->run();
114 | ```
115 |
116 | ```
117 | BOSS
118 | ```
119 |
120 |
121 |
122 | ### Breaking CAPTCHAs
123 |
124 | Yes, I know some of you might want to use this library for the *noble* purpose
125 | of breaking CAPTCHAs, so please take a look at this comment:
126 |
127 |
128 |
129 | ## API
130 |
131 | ### run
132 |
133 | Executes a `tesseract` command, optionally receiving an integer as `timeout`,
134 | in case you experience stalled tesseract processes.
135 |
136 | ```php
137 | $ocr = new TesseractOCR();
138 | $ocr->run();
139 | ```
140 | ```php
141 | $ocr = new TesseractOCR();
142 | $timeout = 500;
143 | $ocr->run($timeout);
144 | ```
145 |
146 | ### image
147 |
148 | Define the path of an image to be recognized by `tesseract`.
149 |
150 | ```php
151 | $ocr = new TesseractOCR();
152 | $ocr->image('/path/to/image.png');
153 | $ocr->run();
154 | ```
155 |
156 | ### imageData
157 |
158 | Set the image to be recognized by `tesseract` from a string, with its size.
159 | This can be useful when dealing with files that are already loaded in memory.
160 | You can easily retrieve the image data and size of an image object:
161 | ```php
162 | //Using Imagick
163 | $data = $img->getImageBlob();
164 | $size = $img->getImageLength();
165 | //Using GD
166 | ob_start();
167 | // Note that you can use any format supported by tesseract
168 | imagepng($img, null, 0);
169 | $size = ob_get_length();
170 | $data = ob_get_clean();
171 |
172 | $ocr = new TesseractOCR();
173 | $ocr->imageData($data, $size);
174 | $ocr->run();
175 | ```
176 |
177 | ### executable
178 |
179 | Define a custom location of the `tesseract` executable,
180 | if by any reason it is not present in the `$PATH`.
181 |
182 | ```php
183 | echo (new TesseractOCR('img.png'))
184 | ->executable('/path/to/tesseract')
185 | ->run();
186 | ```
187 |
188 | ### version
189 |
190 | Returns the current version of `tesseract`.
191 |
192 | ```php
193 | echo (new TesseractOCR())->version();
194 | ```
195 |
196 | ### availableLanguages
197 |
198 | Returns a list of available languages/scripts.
199 |
200 | ```php
201 | foreach((new TesseractOCR())->availableLanguages() as $lang) echo $lang;
202 | ```
203 |
204 | __More info:__
205 |
206 | ### tessdataDir
207 |
208 | Specify a custom location for the tessdata directory.
209 |
210 | ```php
211 | echo (new TesseractOCR('img.png'))
212 | ->tessdataDir('/path')
213 | ->run();
214 | ```
215 |
216 | ### userWords
217 |
218 | Specify the location of user words file.
219 |
220 | This is a plain text file containing a list of words that you want to be
221 | considered as normal dictionary words by `tesseract`.
222 |
223 | Useful when dealing with contents that contain technical terminology, jargon,
224 | etc.
225 |
226 | ```
227 | $ cat /path/to/user-words.txt
228 | foo
229 | bar
230 | ```
231 |
232 | ```php
233 | echo (new TesseractOCR('img.png'))
234 | ->userWords('/path/to/user-words.txt')
235 | ->run();
236 | ```
237 |
238 | ### userPatterns
239 |
240 | Specify the location of user patterns file.
241 |
242 | If the contents you are dealing with have known patterns, this option can
243 | greatly improve tesseract's recognition accuracy.
244 |
245 | ```
246 | $ cat /path/to/user-patterns.txt
247 | 1-\d\d\d-GOOG-441
248 | www.\n\\\*.com
249 | ```
250 |
251 | ```php
252 | echo (new TesseractOCR('img.png'))
253 | ->userPatterns('/path/to/user-patterns.txt')
254 | ->run();
255 | ```
256 |
257 | ### lang
258 |
259 | Define one or more languages to be used during the recognition.
260 | A complete list of available languages can be found at:
261 |
262 |
263 | __Tip from [@daijiale][]:__ Use the combination `->lang('chi_sim', 'chi_tra')`
264 | for proper recognition of Chinese.
265 |
266 | ```php
267 | echo (new TesseractOCR('img.png'))
268 | ->lang('lang1', 'lang2', 'lang3')
269 | ->run();
270 | ```
271 |
272 | ### psm
273 |
274 | Specify the Page Segmentation Method, which instructs `tesseract` how to
275 | interpret the given image.
276 |
277 | __More info:__
278 |
279 | ```php
280 | echo (new TesseractOCR('img.png'))
281 | ->psm(6)
282 | ->run();
283 | ```
284 |
285 | ### oem
286 |
287 | Specify the OCR Engine Mode. (see `tesseract --help-oem`)
288 |
289 | ```php
290 | echo (new TesseractOCR('img.png'))
291 | ->oem(2)
292 | ->run();
293 | ```
294 |
295 | ### dpi
296 |
297 | Specify the image DPI. It is useful if your image does not contain this information in its metadata.
298 |
299 | ```php
300 | echo (new TesseractOCR('img.png'))
301 | ->dpi(300)
302 | ->run();
303 | ```
304 |
305 | ### allowlist
306 |
307 | This is a shortcut for `->config('tessedit_char_whitelist', 'abcdef....')`.
308 |
309 | ```php
310 | echo (new TesseractOCR('img.png'))
311 | ->allowlist(range('a', 'z'), range(0, 9), '-_@')
312 | ->run();
313 | ```
314 |
315 | ### configFile
316 |
317 | Specify a config file to be used. It can either be the path to your own
318 | config file or the name of one of the predefined config files:
319 |
320 |
321 | ```php
322 | echo (new TesseractOCR('img.png'))
323 | ->configFile('hocr')
324 | ->run();
325 | ```
326 |
327 | ### setOutputFile
328 |
329 | Specify an output file to be used. Be aware: if you set an output file, then
330 | the option `withoutTempFiles` is ignored.
331 | Temp files are written (and deleted) even if `withoutTempFiles = true`.
332 |
333 | In combination with `configFile` you are able to get the `hocr`, `tsv` or
334 | `pdf` files.
335 |
336 | ```php
337 | echo (new TesseractOCR('img.png'))
338 | ->configFile('pdf')
339 | ->setOutputFile('/PATH_TO_MY_OUTPUTFILE/searchable.pdf')
340 | ->run();
341 | ```
342 |
343 | ### digits
344 |
345 | Shortcut for `->configFile('digits')`.
346 |
347 | ```php
348 | echo (new TesseractOCR('img.png'))
349 | ->digits()
350 | ->run();
351 | ```
352 |
353 | ### hocr
354 |
355 | Shortcut for `->configFile('hocr')`.
356 |
357 | ```php
358 | echo (new TesseractOCR('img.png'))
359 | ->hocr()
360 | ->run();
361 | ```
362 |
363 | ### pdf
364 |
365 | Shortcut for `->configFile('pdf')`.
366 |
367 | ```php
368 | echo (new TesseractOCR('img.png'))
369 | ->pdf()
370 | ->run();
371 | ```
372 |
373 | ### quiet
374 |
375 | Shortcut for `->configFile('quiet')`.
376 |
377 | ```php
378 | echo (new TesseractOCR('img.png'))
379 | ->quiet()
380 | ->run();
381 | ```
382 |
383 | ### tsv
384 |
385 | Shortcut for `->configFile('tsv')`.
386 |
387 | ```php
388 | echo (new TesseractOCR('img.png'))
389 | ->tsv()
390 | ->run();
391 | ```
392 |
393 | ### txt
394 |
395 | Shortcut for `->configFile('txt')`.
396 |
397 | ```php
398 | echo (new TesseractOCR('img.png'))
399 | ->txt()
400 | ->run();
401 | ```
402 |
403 | ### tempDir
404 |
405 | Define a custom directory to store temporary files generated by tesseract.
406 | Make sure the directory actually exists and the user running `php` is allowed
407 | to write in there.
408 |
409 | ```php
410 | echo (new TesseractOCR('img.png'))
411 | ->tempDir('./my/custom/temp/dir')
412 | ->run();
413 | ```
414 |
415 | ### withoutTempFiles
416 |
417 | Specify that `tesseract` should output the recognized text without writing to temporary files.
418 | The data is gathered from the standard output of `tesseract` instead.
419 |
420 | ```php
421 | echo (new TesseractOCR('img.png'))
422 | ->withoutTempFiles()
423 | ->run();
424 | ```
425 |
426 | ### Other options
427 |
428 | Any configuration option offered by Tesseract can be used like that:
429 |
430 | ```php
431 | echo (new TesseractOCR('img.png'))
432 | ->config('config_var', 'value')
433 | ->config('other_config_var', 'other value')
434 | ->run();
435 | ```
436 |
437 | Or like that:
438 |
439 | ```php
440 | echo (new TesseractOCR('img.png'))
441 | ->configVar('value')
442 | ->otherConfigVar('other value')
443 | ->run();
444 | ```
445 |
446 | __More info:__
447 |
448 | ### Thread-limit
449 |
450 | Sometimes, it may be useful to limit the number of threads that tesseract is
451 | allowed to use (e.g. in [this case](https://github.com/tesseract-ocr/tesseract/issues/898)).
452 | Set the maximum number of threads with the `threadLimit` method:
453 |
454 | ```php
455 | echo (new TesseractOCR('img.png'))
456 | ->threadLimit(1)
457 | ->run();
458 | ```
459 |
460 | ## How to contribute
461 |
462 | You can contribute to this project by:
463 |
464 | * Opening an [Issue][] if you found a bug or wish to propose a new feature;
465 | * Placing a [Pull Request][] with code that fixes a bug, fixes missing/wrong
466 | documentation or implements a new feature;
467 |
468 | Just make sure you take a look at our [Code of Conduct][] and [Contributing][]
469 | instructions.
470 |
471 | ## License
472 |
473 | tesseract-ocr-for-php is released under the [MIT License][].
474 |
475 |
476 | Made with
in Berlin
477 |
478 | [ci_badge]: https://github.com/thiagoalessio/tesseract-ocr-for-php/workflows/CI/badge.svg?event=push&branch=main
479 | [ci]: https://github.com/thiagoalessio/tesseract-ocr-for-php/actions?query=workflow%3ACI
480 | [appveyor_badge]: https://ci.appveyor.com/api/projects/status/xwy5ls0798iwcim3/branch/main?svg=true
481 | [appveyor]: https://ci.appveyor.com/project/thiagoalessio/tesseract-ocr-for-php/branch/main
482 | [codacy_badge]: https://app.codacy.com/project/badge/Grade/a81aa10012874f23a57df5b492d835f2
483 | [codacy]: https://app.codacy.com/gh/thiagoalessio/tesseract-ocr-for-php/dashboard
484 | [test_coverage_badge]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php/branch/main/graph/badge.svg?token=Y0VnrqiSIf
485 | [test_coverage]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php
486 | [stable_version_badge]: https://img.shields.io/packagist/v/thiagoalessio/tesseract_ocr.svg
487 | [packagist]: https://packagist.org/packages/thiagoalessio/tesseract_ocr
488 | [total_downloads_badge]: https://img.shields.io/packagist/dt/thiagoalessio/tesseract_ocr.svg
489 | [monthly_downloads_badge]: https://img.shields.io/packagist/dm/thiagoalessio/tesseract_ocr.svg
490 | [Tesseract OCR]: https://github.com/tesseract-ocr/tesseract
491 | [Composer]: http://getcomposer.org/
492 | [windows_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/windows-18.svg
493 | [macos_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/apple-18.svg
494 | [tesseract_installation_on_windows]: https://github.com/tesseract-ocr/tesseract/wiki#windows
495 | [Capture2Text]: https://chocolatey.org/packages/capture2text
496 | [Chocolatey]: https://chocolatey.org
497 | [MacPorts]: https://www.macports.org
498 | [Homebrew]: https://brew.sh
499 | [@daijiale]: https://github.com/daijiale
500 | [HOCR]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#hocr-output
501 | [TSV]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#tsv-output-currently-available-in-305-dev-in-master-branch-on-github
502 | [Issue]: https://github.com/thiagoalessio/tesseract-ocr-for-php/issues
503 | [Pull Request]: https://github.com/thiagoalessio/tesseract-ocr-for-php/pulls
504 | [Code of Conduct]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CODE_OF_CONDUCT.md
505 | [Contributing]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CONTRIBUTING.md
506 | [MIT License]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/MIT-LICENSE
507 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | fixes:
2 | - "/home/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::"
3 | - "/Users/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::"
4 | - "C:\\projects\\tesseract-ocr-for-php\\::"
5 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "thiagoalessio/tesseract_ocr",
3 | "description": "A wrapper to work with Tesseract OCR inside PHP.",
4 | "version": "2.13.0",
5 | "type": "library",
6 | "keywords": ["Tesseract", "OCR", "text recognition"],
7 | "license": "MIT",
8 | "authors": [
9 | {
10 | "name": "thiagoalessio",
11 | "email": "thiagoalessio@me.com"
12 | }
13 | ],
14 | "support": {
15 | "issues": "https://github.com/thiagoalessio/tesseract-ocr-for-php/issues",
16 | "irc": "irc://irc.freenode.net/tesseract-ocr-for-php",
17 | "source": "https://github.com/thiagoalessio/tesseract-ocr-for-php"
18 | },
19 | "require": {
20 | "php": "^5.5 || ^7.0 || ^8.0"
21 | },
22 | "require-dev": {
23 | "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0"
24 | },
25 | "autoload": {
26 | "psr-4": {
27 | "thiagoalessio\\TesseractOCR\\": "src/"
28 | }
29 | },
30 | "autoload-dev": {
31 | "psr-4": {
32 | "thiagoalessio\\TesseractOCR\\Tests\\": "tests/"
33 | }
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/Command.php:
--------------------------------------------------------------------------------
1 | image = $image;
19 | $this->outputFile = $outputFile;
20 | }
21 |
/**
 * Builds the command line for the current configuration.
 *
 * @return string The same string obtained by casting this object to string.
 */
public function build()
{
    return $this->__toString();
}
23 |
/**
 * Assembles the command line from the configured state.
 *
 * Parts, in order: optional OMP_THREAD_LIMIT assignment, escaped executable,
 * input argument (escaped image path, or "-" when no file is used as input),
 * output argument (escaped output base name, or "-" when no file is used as
 * output), every registered option and finally the config file, if any.
 *
 * @return string Parts joined by a single space.
 */
public function __toString()
{
    $parts = array();

    if ($this->threadLimit) {
        $parts[] = "OMP_THREAD_LIMIT={$this->threadLimit}";
    }

    $parts[] = self::escape($this->executable);

    if ($this->useFileAsInput) {
        $parts[] = self::escape($this->image);
    } else {
        $parts[] = "-";
    }

    if ($this->useFileAsOutput) {
        $parts[] = self::escape($this->getOutputFile(false));
    } else {
        $parts[] = "-";
    }

    // Callable options receive the installed version so they can adapt
    // (or refuse) their flag; anything else is simply cast to string.
    $installedVersion = $this->getTesseractVersion();
    foreach ($this->options as $opt) {
        if (is_callable($opt)) {
            $parts[] = $opt($installedVersion);
        } else {
            $parts[] = "$opt";
        }
    }

    if ($this->configFile) {
        $parts[] = $this->configFile;
    }

    return implode(' ', $parts);
}
41 |
/**
 * Returns the path of the output file, generating a temporary one on first use.
 *
 * When no output file has been set yet, a name is created with tempnam()
 * inside the directory returned by getTempDir() and remembered for later calls.
 *
 * @param bool $withExt When true, append the extension: the config file
 *                      name if it is 'hocr', 'tsv' or 'pdf', otherwise 'txt'.
 * @return string
 */
public function getOutputFile($withExt=true)
{
    if (!$this->outputFile) {
        $dir = $this->getTempDir();
        $this->outputFile = $dir.DIRECTORY_SEPARATOR.basename(tempnam($dir, 'ocr'));
    }

    if (!$withExt) {
        return $this->outputFile;
    }

    $extension = 'txt';
    if (in_array($this->configFile, array('hocr', 'tsv', 'pdf'))) {
        $extension = $this->configFile;
    }

    return $this->outputFile.'.'.$extension;
}
54 |
/**
 * Returns the directory used for temporary files.
 *
 * @return string The custom temp dir when one is set (truthy),
 *                otherwise the system temp directory.
 */
public function getTempDir()
{
    if ($this->tempDir) {
        return $this->tempDir;
    }

    return sys_get_temp_dir();
}
59 |
/**
 * Asks the executable for its version.
 *
 * Runs "<executable> --version" (stderr merged into stdout) and returns the
 * second space-separated token of the first output line.
 *
 * @return string
 */
public function getTesseractVersion()
{
    $versionCmd = self::escape($this->executable).' --version 2>&1';
    exec($versionCmd, $lines);

    $tokens = explode(' ', $lines[0]);

    return $tokens[1];
}
66 |
/**
 * Lists what "<executable> --list-langs" reports.
 *
 * @return array Output lines, minus the first one, sorted.
 */
public function getAvailableLanguages()
{
    $listCmd = self::escape($this->executable) . ' --list-langs 2>&1';
    exec($listCmd, $languages);

    // The first line is discarded (presumably a header line -- confirm
    // against the output of the tesseract versions in use).
    array_shift($languages);
    sort($languages);

    return $languages;
}
74 |
/**
 * Wraps a string in double quotes, backslash-escaping special characters.
 *
 * Escaped characters are `$`, `"` and the backtick; on systems whose PHP_OS
 * does not start with "WIN" the backslash itself is escaped as well.
 *
 * @param string $str
 * @return string
 */
public static function escape($str)
{
    $osPrefix = strtoupper(substr(PHP_OS, 0, 3));

    if ($osPrefix == 'WIN') {
        $special = '$"`';
    } else {
        $special = '$"\\`';
    }

    return '"'.addcslashes($str, $special).'"';
}
80 | }
81 |
--------------------------------------------------------------------------------
/src/FeatureNotAvailableException.php:
--------------------------------------------------------------------------------
1 | NUL 2>&1'
25 | : 'type '.Command::escape($executable).' > /dev/null 2>&1';
26 | system($cmd, $exitCode);
27 |
28 | if ($exitCode == 0) return;
29 |
30 | $currentPath = getenv('PATH');
31 | $msg = array();
32 | $msg[] = "Error! The command \"$executable\" was not found.";
33 | $msg[] = '';
34 | $msg[] = 'Make sure you have Tesseract OCR installed on your system:';
35 | $msg[] = 'https://github.com/tesseract-ocr/tesseract';
36 | $msg[] = '';
37 | $msg[] = "The current \$PATH is $currentPath";
38 | $msg = join(PHP_EOL, $msg);
39 |
40 | throw new TesseractNotFoundException($msg);
41 | }
42 |
/**
 * Verifies that the executed command produced some output.
 *
 * Success means: a non-empty output file exists (file output mode) or
 * $stdout is non-empty (stdout mode). Otherwise an exception carrying the
 * generated command and the captured stderr is thrown.
 *
 * @param object $command Command that was executed (read for useFileAsOutput,
 *                        getOutputFile() and its string form).
 * @param string $stdout  Captured standard output.
 * @param string $stderr  Captured standard error.
 * @throws UnsuccessfulCommandException When no output was produced.
 */
public static function checkCommandExecution($command, $stdout, $stderr)
{
    if ($command->useFileAsOutput) {
        $file = $command->getOutputFile();
        if (file_exists($file) && filesize($file) > 0) return;
    }

    if (!$command->useFileAsOutput && $stdout) {
        return;
    }

    $msg = array();
    $msg[] = 'Error! The command did not produce any output.';
    $msg[] = '';
    $msg[] = 'Generated command:';
    $msg[] = "$command";
    $msg[] = '';
    $msg[] = 'Returned message:';

    // explode() yields an empty last element when the text ends with an EOL
    // (or is empty). Drop only that artifact: popping unconditionally would
    // discard the real last line of a message lacking a trailing EOL.
    $stderrLines = explode(PHP_EOL, (string) $stderr);
    if (end($stderrLines) === '') {
        array_pop($stderrLines);
    }

    $msg = array_merge($msg, $stderrLines);
    $msg = join(PHP_EOL, $msg);

    throw new UnsuccessfulCommandException($msg);
}
68 |
/**
 * Ensures the process was actually started.
 *
 * @param mixed  $processHandle Process handle; FALSE signals a failed launch.
 * @param mixed  $command       Command that was attempted (cast to string for the message).
 * @throws UnsuccessfulCommandException When $processHandle is FALSE.
 */
public static function checkProcessCreation($processHandle, $command)
{
    if ($processHandle === FALSE) {
        $lines = array(
            'Error! The command could not be launched.',
            '',
            'Generated command:',
            "$command",
        );

        throw new UnsuccessfulCommandException(join(PHP_EOL, $lines));
    }
}
82 |
/**
 * Ensures the installed tesseract is recent enough for a feature.
 *
 * @param string $expected Minimum required version.
 * @param string $action   Human readable feature name, used in the message.
 * @param object $command  Command queried through getTesseractVersion() and
 *                         cast to string for the message.
 * @throws FeatureNotAvailableException When the installed version is older
 *                                      than $expected.
 */
public static function checkTesseractVersion($expected, $action, $command)
{
    // Strip an optional leading "v" with a regex (as Option::checkMinVersion
    // does) instead of indexing the string: the version may be null/empty
    // when the executable output could not be parsed, and offset access on
    // such a value raises a warning/error.
    $actual = preg_replace('/^v/', '', (string) $command->getTesseractVersion());

    if (version_compare($actual, $expected, ">=")) return;

    $msg = array();
    $msg[] = "Error! $action is not available this tesseract version";
    $msg[] = "Required version is $expected, actual version is $actual";
    $msg[] = '';
    $msg[] = 'Generated command:';
    $msg[] = "$command";
    $msg = join(PHP_EOL, $msg);

    throw new FeatureNotAvailableException($msg);
}
102 |
/**
 * Ensures the given output path can be written.
 *
 * The parent directory is created when missing (including any missing
 * ancestors); then both the directory and, if it already exists, the file
 * itself must be writable.
 *
 * @param string $path Target output file.
 * @throws NoWritePermissionsForOutputFile When directory or file is not writable.
 */
public static function checkWritePermissions($path)
{
    $dir = dirname($path);

    // Recursive so that nested, not yet existing directories work too;
    // 0777 is mkdir()'s default mode.
    if (!is_dir($dir)) mkdir($dir, 0777, true);

    $writableDirectory = is_writable($dir);
    $writableFile = true;
    if (file_exists($path)) $writableFile = is_writable($path);
    if ($writableFile && $writableDirectory) return;

    $msg = array();
    $msg[] = "Error! No permission to write to $path";
    $msg[] = "Make sure you have the right outputFile and permissions "
        ."to write to the folder";
    $msg[] = '';
    $msg = join(PHP_EOL, $msg);

    throw new NoWritePermissionsForOutputFile($msg);
}
120 | }
121 |
--------------------------------------------------------------------------------
/src/ImageNotFoundException.php:
--------------------------------------------------------------------------------
1 | =') ? '-' : '')."-psm $psm";
10 | };
11 | }
12 |
/**
 * Creates the deferred "--oem" option.
 *
 * @param mixed $oem Engine mode value, interpolated as-is.
 * @return callable Closure taking the installed version; it requires
 *                  version 3.05 or later and returns "--oem <value>".
 */
public static function oem($oem)
{
    $build = function($version) use ($oem) {
        Option::checkMinVersion('3.05', $version, 'oem');

        return '--oem '.$oem;
    };

    return $build;
}
20 |
/**
 * Creates the deferred "--dpi" option.
 *
 * @param mixed $dpi DPI value, interpolated as-is.
 * @return callable Closure returning "--dpi <value>".
 */
public static function dpi($dpi)
{
    $build = function() use ($dpi) {
        return '--dpi '.$dpi;
    };

    return $build;
}
27 |
/**
 * Creates the deferred "--user-words" option.
 *
 * @param string $path Path of the user words file.
 * @return callable Closure taking the installed version; it requires
 *                  version 3.04 or later and returns the flag followed by
 *                  the double-quoted path (backslashes and quotes escaped).
 */
public static function userWords($path)
{
    $build = function($version) use ($path) {
        Option::checkMinVersion('3.04', $version, 'user-words');
        $quoted = '"'.addcslashes($path, '\\"').'"';

        return '--user-words '.$quoted;
    };

    return $build;
}
35 |
/**
 * Creates the deferred "--user-patterns" option.
 *
 * @param string $path Path of the user patterns file.
 * @return callable Closure taking the installed version; it requires
 *                  version 3.04 or later and returns the flag followed by
 *                  the double-quoted path (backslashes and quotes escaped).
 */
public static function userPatterns($path)
{
    $build = function($version) use ($path) {
        Option::checkMinVersion('3.04', $version, 'user-patterns');
        $quoted = '"'.addcslashes($path, '\\"').'"';

        return '--user-patterns '.$quoted;
    };

    return $build;
}
43 |
/**
 * Creates the deferred "--tessdata-dir" option.
 *
 * @param string $path Location of the tessdata directory.
 * @return callable Closure returning the flag followed by the double-quoted
 *                  path (backslashes and quotes escaped).
 */
public static function tessdataDir($path)
{
    $build = function() use ($path) {
        $quoted = '"'.addcslashes($path, '\\"').'"';

        return '--tessdata-dir '.$quoted;
    };

    return $build;
}
50 |
/**
 * Creates the deferred "-l" (languages) option.
 *
 * Accepts any number of language codes as separate arguments.
 *
 * @return callable Closure returning "-l " followed by the codes joined with "+".
 */
public static function lang()
{
    $codes = func_get_args();

    $build = function() use ($codes) {
        return '-l '.implode('+', $codes);
    };

    return $build;
}
58 |
/**
 * Creates a deferred "-c var=value" option.
 *
 * The variable name may be given in camelCase; it is converted to
 * snake_case (e.g. "tesseditCharWhitelist" => "tessedit_char_whitelist").
 * Names that are already lower snake_case pass through unchanged.
 *
 * @param string $var   Configuration variable name.
 * @param mixed  $value Value, concatenated as-is.
 * @return callable Closure returning '-c "<var>=<value>"' with backslashes
 *                  and double quotes of the pair escaped.
 */
public static function config($var, $value)
{
    return function() use($var, $value) {
        // Each run of capitals becomes one "_"-prefixed word. Matching the
        // whole run keeps every letter ("createPDF" => "create_pdf"); a
        // repeated single-letter group would retain only the last capital.
        $name = strtolower(preg_replace('/[A-Z]+/', '_$0', $var));
        $pair = $name.'='.$value;
        return '-c "'.addcslashes($pair, '\\"').'"';
    };
}
69 |
/**
 * Fails when the current version is older than the minimum required.
 *
 * A leading "v" is removed from both versions before comparing.
 *
 * @param string $minVersion  Minimum version supporting the option.
 * @param string $currVersion Installed version.
 * @param string $option      Option name, used in the message.
 * @throws \Exception When $currVersion is lower than $minVersion.
 */
public static function checkMinVersion($minVersion, $currVersion, $option)
{
    $required = preg_replace('/^v/', '', $minVersion);
    $installed = preg_replace('/^v/', '', $currVersion);

    if (version_compare($installed, $required, '<')) {
        $text = "$option option is only available on Tesseract $required or later."
            .PHP_EOL."Your version of Tesseract is $installed";

        throw new \Exception($text);
    }
}
79 | }
80 |
--------------------------------------------------------------------------------
/src/Process.php:
--------------------------------------------------------------------------------
1 | startTime = microtime(true);
14 | $streamDescriptors = [
15 | array("pipe", "r"),
16 | array("pipe", "w"),
17 | array("pipe", "w")
18 | ];
19 | $this->handle = proc_open($command, $streamDescriptors, $pipes, NULL, NULL, ["bypass_shell" => true]);
20 | list($this->stdin, $this->stdout, $this->stderr) = $pipes;
21 |
22 | FriendlyErrors::checkProcessCreation($this->handle, $command);
23 |
24 | //This is can avoid deadlock on some cases (when stderr buffer is filled up before writing to stdout and vice-versa)
25 | stream_set_blocking($this->stdout, 0);
26 | stream_set_blocking($this->stderr, 0);
27 | }
28 |
/**
 * Writes data to the process' stdin, retrying after partial writes.
 *
 * @param string $data Bytes to send.
 * @param int    $len  Number of bytes expected to be written.
 * @return bool True when exactly $len bytes were written.
 */
public function write($data, $len)
{
    $total = 0;
    do
    {
        $res = fwrite($this->stdin, substr($data, $total));
        // false (error) or 0 (nothing accepted / nothing left): stop retrying.
        if (!$res) break;
        // Accumulate the byte count in its own statement: written as
        // `$total += $res < $len` the comparison binds tighter than `+=`,
        // so a boolean would be added instead of the number of bytes.
        $total += $res;
    } while ($total < $len);
    return $total === $len;
}
38 |
39 |
/**
 * Collects stdout/stderr until the process ends or the timeout elapses.
 *
 * @param int|float $timeout Maximum time to wait, measured against
 *                           microtime(true) (seconds); 0 or less disables it.
 * @return array Captured output, keys "out" and "err".
 */
public function wait($timeout = 0)
{
    $running = true;
    $data = ["out" => "", "err" => ""];
    while (($running === true) && !$this->hasTimedOut($timeout))
    {
        $data["out"] .= fread($this->stdout, 8192);
        $data["err"] .= fread($this->stderr, 8192);
        $procInfo = proc_get_status($this->handle);
        $running = $procInfo["running"];
        if ($running) {
            usleep(1000); // Sleep 1ms to yield CPU time
        }
    }

    // Each iteration reads at most 8192 bytes per stream and the loop stops
    // as soon as the process is reported finished, so output may still be
    // sitting in the pipes. Drain it once the process has exited; skipped on
    // timeout, where the process is still running and may never close them.
    if (!$running) {
        $data["out"] .= stream_get_contents($this->stdout);
        $data["err"] .= stream_get_contents($this->stderr);
    }

    return $data;
}
56 |
/**
 * Closes the three pipes (stdin, stdout, stderr) and then the process.
 *
 * @return int Value returned by proc_close().
 */
public function close()
{
    foreach (array('stdin', 'stdout', 'stderr') as $pipe) {
        $this->closeStream($this->$pipe);
    }

    return proc_close($this->handle);
}
64 |
/**
 * Closes only the stdin pipe of the process (no-op when already closed).
 */
public function closeStdin()
{
    $stdin =& $this->stdin;
    $this->closeStream($stdin);
}
69 |
/**
 * Tells whether the allowed running time has been exceeded.
 *
 * @param int|float $timeout Allowed time, added to the recorded start time
 *                           and compared with microtime(true).
 * @return bool Always false when $timeout is not greater than 0.
 */
private function hasTimedOut($timeout)
{
    if (!($timeout > 0)) {
        return false;
    }

    $deadline = $this->startTime + $timeout;

    return $deadline < microtime(true);
}
74 |
/**
 * Closes a stream and resets the passed variable to NULL.
 *
 * @param resource|null $stream Taken by reference; nothing happens when it
 *                              is already NULL.
 */
private function closeStream(&$stream)
{
    if ($stream === NULL) {
        return;
    }

    fclose($stream);
    $stream = NULL;
}
83 | }
84 |
--------------------------------------------------------------------------------
/src/TesseractNotFoundException.php:
--------------------------------------------------------------------------------
1 | command = $command ?: new Command;
15 | $this->image("$image");
16 | }
17 |
/**
 * Executes the configured command and returns the recognized text.
 *
 * Steps: validate the custom output file (if any), the executable and the
 * input image; launch the process; feed image data through stdin when no
 * input file is used; wait for completion and verify output was produced.
 * In file output mode the result is read from the generated file, which is
 * moved to the custom output path when one was set, and temporary files
 * are removed.
 *
 * @param int|float $timeout Passed to Process::wait(); 0 means no timeout.
 * @return string Output with surrounding whitespace/control characters trimmed.
 * @throws TesseractOcrException Rethrown after cleaning temp files
 *                               (when file output mode is active).
 */
public function run($timeout = 0)
{
    $cmd = $this->command;

    try {
        if ($this->outputFile !== null) {
            FriendlyErrors::checkWritePermissions($this->outputFile);
            $cmd->useFileAsOutput = true;
        }

        FriendlyErrors::checkTesseractPresence($cmd->executable);
        if ($cmd->useFileAsInput) {
            FriendlyErrors::checkImagePath($cmd->image);
        }

        $process = new Process("{$cmd}");

        // Without an input file the image bytes travel through stdin, which
        // is closed right after writing.
        if (!$cmd->useFileAsInput) {
            $process->write($cmd->image, $cmd->imageSize);
            $process->closeStdin();
        }
        $result = $process->wait($timeout);

        FriendlyErrors::checkCommandExecution($cmd, $result["out"], $result["err"]);
    }
    catch (TesseractOcrException $e) {
        if ($cmd->useFileAsOutput) $this->cleanTempFiles();
        throw $e;
    }

    if (!$cmd->useFileAsOutput) {
        $text = $result["out"];
    }
    else {
        $generated = $cmd->getOutputFile();
        $text = file_get_contents($generated);

        if ($this->outputFile !== null) {
            rename($generated, $this->outputFile);
        }

        $this->cleanTempFiles();
    }

    return trim($text, " \t\n\r\0\x0A\x0B\x0C");
}
60 |
/**
 * Uses in-memory image data instead of an image file.
 *
 * @param string $image Raw image bytes.
 * @param int    $size  Size of the image data.
 * @return $this
 * @throws FeatureNotAvailableException When the installed tesseract is
 *                                      older than 3.03-rc1.
 */
public function imageData($image, $size)
{
    $cmd = $this->command;
    FriendlyErrors::checkTesseractVersion("3.03-rc1", "Reading image data from stdin", $cmd);

    $cmd->image = $image;
    $cmd->imageSize = $size;
    $cmd->useFileAsInput = false;

    return $this;
}
69 |
/**
 * Makes the command write its result to stdout instead of a file.
 *
 * @return $this
 * @throws FeatureNotAvailableException When the installed tesseract is
 *                                      older than 3.03-rc1.
 */
public function withoutTempFiles()
{
    $cmd = $this->command;
    FriendlyErrors::checkTesseractVersion("3.03-rc1", "Writing to stdout (without using temp files)", $cmd);

    $cmd->useFileAsOutput = false;

    return $this;
}
76 |
/**
 * Sets the image to be recognized.
 *
 * @param string $image Path of the image file.
 * @return $this
 */
public function image($image)
{
    $cmd = $this->command;
    $cmd->image = $image;

    return $this;
}
82 |
/**
 * Sets a custom tesseract executable, after checking that it can be found.
 *
 * @param string $executable Path or name of the executable.
 * @return $this
 */
public function executable($executable)
{
    FriendlyErrors::checkTesseractPresence($executable);

    $cmd = $this->command;
    $cmd->executable = $executable;

    return $this;
}
89 |
/**
 * Sets the config file appended to the command.
 *
 * @param string $configFile Path or name of the config file.
 * @return $this
 */
public function configFile($configFile)
{
    $cmd = $this->command;
    $cmd->configFile = $configFile;

    return $this;
}
95 |
/**
 * Sets the directory where temporary files are created.
 *
 * @param string $tempDir
 * @return $this
 */
public function tempDir($tempDir)
{
    $cmd = $this->command;
    $cmd->tempDir = $tempDir;

    return $this;
}
101 |
/**
 * Sets the thread limit; the command emits it as OMP_THREAD_LIMIT.
 *
 * @param int $limit
 * @return $this
 */
public function threadLimit($limit)
{
    $cmd = $this->command;
    $cmd->threadLimit = $limit;

    return $this;
}
107 |
/**
 * Alias of configFile().
 *
 * @deprecated
 * @param string $fmt
 * @return $this
 */
public function format($fmt)
{
    return $this->configFile($fmt);
}
110 |
/**
 * Sets the path where run() moves the generated output file.
 *
 * @param string $path
 * @return $this
 */
public function setOutputFile($path)
{
    $this->outputFile = $path;

    return $this;
}
115 |
/**
 * Restricts recognition to the given characters.
 *
 * Accepts any number of arguments, each either a string or an array of
 * characters; everything is concatenated and registered as the
 * "tessedit_char_whitelist" config option.
 *
 * @return $this
 */
public function allowlist()
{
    $chars = '';
    foreach (func_get_args() as $arg) {
        $chars .= is_array($arg) ? implode('', $arg) : $arg;
    }

    $this->command->options[] = Option::config('tessedit_char_whitelist', $chars);

    return $this;
}
123 |
/**
 * Deprecated alias of allowlist(); triggers an E_USER_NOTICE.
 *
 * Arguments are concatenated the same way allowlist() does and handed
 * over as a single string.
 *
 * @return $this
 */
public function whitelist()
{
    trigger_error('Notice: whitelist is deprecated, use allowlist instead.', E_USER_NOTICE);

    $chars = '';
    foreach (func_get_args() as $arg) {
        $chars .= is_array($arg) ? implode('', $arg) : $arg;
    }

    return $this->allowlist($chars);
}
133 |
/**
 * Returns the version reported by the configured executable.
 *
 * @return string
 */
public function version()
{
    $cmd = $this->command;

    return $cmd->getTesseractVersion();
}
138 |
/**
 * Returns the languages reported by the configured executable.
 *
 * @return array
 */
public function availableLanguages()
{
    $cmd = $this->command;

    return $cmd->getAvailableLanguages();
}
143 |
/**
 * Handles calls to methods not declared on this class.
 *
 * Resolution order:
 *  1. known config file names behave like configFile($method);
 *  2. names matching a method of the Option class register that option,
 *     forwarding all arguments;
 *  3. anything else is registered as a generic config variable using the
 *     first argument (or null) as value.
 *
 * @param string $method Called method name.
 * @param array  $args   Call arguments.
 * @return $this
 */
public function __call($method, $args)
{
    if ($this->isConfigFile($method)) {
        return $this->configFile($method);
    }

    if ($this->isOption($method)) {
        $factory = $this->getOptionClassName().'::'.$method;
        $this->command->options[] = call_user_func_array($factory, $args);

        return $this;
    }

    if (empty($args)) {
        $value = null;
    } else {
        $value = $args[0];
    }
    $this->command->options[] = Option::config($method, $value);

    return $this;
}
156 |
/**
 * Tells whether a name is one of the config file shortcuts.
 *
 * @param string $name
 * @return bool
 */
private function isConfigFile($name)
{
    $shortcuts = array('digits', 'hocr', 'pdf', 'quiet', 'tsv', 'txt');

    return in_array($name, $shortcuts);
}
161 |
/**
 * Tells whether a name matches a method of the Option class.
 *
 * @param string $name
 * @return bool
 */
private function isOption($name)
{
    $optionMethods = get_class_methods($this->getOptionClassName());

    return in_array($name, $optionMethods);
}
166 |
/**
 * Returns the fully qualified name of the Option class of this namespace.
 *
 * @return string
 */
private function getOptionClassName()
{
    $namespace = __NAMESPACE__;

    return $namespace.'\\Option';
}
171 |
/**
 * Removes the command's output file, both without and with extension,
 * whenever those files exist.
 */
private function cleanTempFiles()
{
    foreach (array(false, true) as $withExt) {
        $file = $this->command->getOutputFile($withExt);
        if (file_exists($file)) {
            unlink($file);
        }
    }
}
181 | }
182 |
--------------------------------------------------------------------------------
/src/TesseractOcrException.php:
--------------------------------------------------------------------------------
1 |