├── .appveyor.yml ├── MIT-LICENSE ├── README.md ├── codecov.yml ├── composer.json └── src ├── Command.php ├── FeatureNotAvailableException.php ├── FriendlyErrors.php ├── ImageNotFoundException.php ├── NoWritePermissionsForOutputFile.php ├── Option.php ├── Process.php ├── TesseractNotFoundException.php ├── TesseractOCR.php ├── TesseractOcrException.php └── UnsuccessfulCommandException.php /.appveyor.yml: -------------------------------------------------------------------------------- 1 | --- 2 | build: false 3 | 4 | install: 5 | - ps: Set-Service wuauserv -StartupType Manual 6 | - choco install php 7 | - choco install capture2text --version 3.9 8 | - choco install composer 9 | - refreshenv 10 | - cd %APPVEYOR_BUILD_FOLDER% 11 | - composer install 12 | 13 | test_script: 14 | - php tests\run.php unit e2e 15 | -------------------------------------------------------------------------------- /MIT-LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2021 Thiago Alessio Pereira 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tesseract OCR for PHP 2 | 3 | A wrapper to work with Tesseract OCR inside PHP. 4 | 5 | [![CI][ci_badge]][ci] 6 | [![AppVeyor][appveyor_badge]][appveyor] 7 | [![Codacy][codacy_badge]][codacy] 8 | [![Test Coverage][test_coverage_badge]][test_coverage] 9 |
10 | [![Latest Stable Version][stable_version_badge]][packagist] 11 | [![Total Downloads][total_downloads_badge]][packagist] 12 | [![Monthly Downloads][monthly_downloads_badge]][packagist] 13 | 14 | ## Installation 15 | 16 | Via [Composer][]: 17 | 18 | $ composer require thiagoalessio/tesseract_ocr 19 | 20 | :bangbang: **This library depends on [Tesseract OCR][], version _3.02_ or later.** 21 | 22 |
23 | 24 | ### ![][windows_icon] Note for Windows users 25 | 26 | There are [many ways][tesseract_installation_on_windows] to install 27 | [Tesseract OCR][] on your system, but if you just want something quick to 28 | get up and running, I recommend installing the [Capture2Text][] package with 29 | [Chocolatey][]. 30 | 31 | choco install capture2text --version 3.9 32 | 33 | :warning: Recent versions of [Capture2Text][] stopped shipping the `tesseract` binary. 34 | 35 |
36 | 37 | ### ![][macos_icon] Note for macOS users 38 | 39 | With [MacPorts][] you can install support for individual languages, like so: 40 | 41 | $ sudo port install tesseract-<langcode> 42 | 43 | But that is not possible with [Homebrew][]. It comes only with **English** support 44 | by default, so if you intend to use it for other languages, the quickest solution 45 | is to install them all: 46 | 47 | $ brew install tesseract tesseract-lang 48 | 49 |
50 | 51 | ## Usage 52 | 53 | ### Basic usage 54 | 55 | 56 | 57 | ```php 58 | use thiagoalessio\TesseractOCR\TesseractOCR; 59 | echo (new TesseractOCR('text.png')) 60 | ->run(); 61 | ``` 62 | 63 | ``` 64 | The quick brown fox 65 | jumps over 66 | the lazy dog. 67 | ``` 68 | 69 |
70 | 71 | ### Other languages 72 | 73 | 74 | 75 | ```php 76 | use thiagoalessio\TesseractOCR\TesseractOCR; 77 | echo (new TesseractOCR('german.png')) 78 | ->lang('deu') 79 | ->run(); 80 | ``` 81 | 82 | ``` 83 | Bülowstraße 84 | ``` 85 | 86 |
87 | 88 | ### Multiple languages 89 | 90 | 91 | 92 | ```php 93 | use thiagoalessio\TesseractOCR\TesseractOCR; 94 | echo (new TesseractOCR('mixed-languages.png')) 95 | ->lang('eng', 'jpn', 'spa') 96 | ->run(); 97 | ``` 98 | 99 | ``` 100 | I eat すし y Pollo 101 | ``` 102 | 103 |
104 | 105 | ### Inducing recognition 106 | 107 | 108 | 109 | ```php 110 | use thiagoalessio\TesseractOCR\TesseractOCR; 111 | echo (new TesseractOCR('8055.png')) 112 | ->allowlist(range('A', 'Z')) 113 | ->run(); 114 | ``` 115 | 116 | ``` 117 | BOSS 118 | ``` 119 | 120 |
121 | 122 | ### Breaking CAPTCHAs 123 | 124 | Yes, I know some of you might want to use this library for the *noble* purpose 125 | of breaking CAPTCHAs, so please take a look at this comment: 126 | 127 | 128 | 129 | ## API 130 | 131 | ### run 132 | 133 | Executes a `tesseract` command, optionally receiving an integer as `timeout` (in seconds), 134 | in case you experience stalled tesseract processes. 135 | 136 | ```php 137 | $ocr = new TesseractOCR(); 138 | $ocr->run(); 139 | ``` 140 | ```php 141 | $ocr = new TesseractOCR(); 142 | $timeout = 500; 143 | $ocr->run($timeout); 144 | ``` 145 | 146 | ### image 147 | 148 | Define the path of an image to be recognized by `tesseract`. 149 | 150 | ```php 151 | $ocr = new TesseractOCR(); 152 | $ocr->image('/path/to/image.png'); 153 | $ocr->run(); 154 | ``` 155 | 156 | ### imageData 157 | 158 | Set the image to be recognized by `tesseract` from a string, with its size. 159 | This can be useful when dealing with files that are already loaded in memory. 160 | You can easily retrieve the image data and size of an image object: 161 | ```php 162 | //Using Imagick 163 | $data = $img->getImageBlob(); 164 | $size = $img->getImageLength(); 165 | //Using GD 166 | ob_start(); 167 | // Note that you can use any format supported by tesseract 168 | imagepng($img, null, 0); 169 | $size = ob_get_length(); 170 | $data = ob_get_clean(); 171 | 172 | $ocr = new TesseractOCR(); 173 | $ocr->imageData($data, $size); 174 | $ocr->run(); 175 | ``` 176 | 177 | ### executable 178 | 179 | Define a custom location of the `tesseract` executable, 180 | if for any reason it is not present in the `$PATH`. 181 | 182 | ```php 183 | echo (new TesseractOCR('img.png')) 184 | ->executable('/path/to/tesseract') 185 | ->run(); 186 | ``` 187 | 188 | ### version 189 | 190 | Returns the current version of `tesseract`. 191 | 192 | ```php 193 | echo (new TesseractOCR())->version(); 194 | ``` 195 | 196 | ### availableLanguages 197 | 198 | Returns a list of available languages/scripts. 
199 | 200 | ```php 201 | foreach((new TesseractOCR())->availableLanguages() as $lang) echo $lang; 202 | ``` 203 | 204 | __More info:__ 205 | 206 | ### tessdataDir 207 | 208 | Specify a custom location for the tessdata directory. 209 | 210 | ```php 211 | echo (new TesseractOCR('img.png')) 212 | ->tessdataDir('/path') 213 | ->run(); 214 | ``` 215 | 216 | ### userWords 217 | 218 | Specify the location of the user words file. 219 | 220 | This is a plain text file containing a list of words that you want to be 221 | considered as normal dictionary words by `tesseract`. 222 | 223 | Useful when dealing with contents that contain technical terminology, jargon, 224 | etc. 225 | 226 | ``` 227 | $ cat /path/to/user-words.txt 228 | foo 229 | bar 230 | ``` 231 | 232 | ```php 233 | echo (new TesseractOCR('img.png')) 234 | ->userWords('/path/to/user-words.txt') 235 | ->run(); 236 | ``` 237 | 238 | ### userPatterns 239 | 240 | Specify the location of the user patterns file. 241 | 242 | If the contents you are dealing with have known patterns, this option can greatly 243 | improve tesseract's recognition accuracy. 244 | 245 | ``` 246 | $ cat /path/to/user-patterns.txt 247 | 1-\d\d\d-GOOG-441 248 | www.\n\\\*.com 249 | ``` 250 | 251 | ```php 252 | echo (new TesseractOCR('img.png')) 253 | ->userPatterns('/path/to/user-patterns.txt') 254 | ->run(); 255 | ``` 256 | 257 | ### lang 258 | 259 | Define one or more languages to be used during the recognition. 260 | A complete list of available languages can be found at: 261 | 262 | 263 | __Tip from [@daijiale][]:__ Use the combination `->lang('chi_sim', 'chi_tra')` 264 | for proper recognition of Chinese. 265 | 266 | ```php 267 | echo (new TesseractOCR('img.png')) 268 | ->lang('lang1', 'lang2', 'lang3') 269 | ->run(); 270 | ``` 271 | 272 | ### psm 273 | 274 | Specify the Page Segmentation Method, which instructs `tesseract` how to 275 | interpret the given image. 
276 | 277 | __More info:__ 278 | 279 | ```php 280 | echo (new TesseractOCR('img.png')) 281 | ->psm(6) 282 | ->run(); 283 | ``` 284 | 285 | ### oem 286 | 287 | Specify the OCR Engine Mode. (see `tesseract --help-oem`) 288 | 289 | ```php 290 | echo (new TesseractOCR('img.png')) 291 | ->oem(2) 292 | ->run(); 293 | ``` 294 | 295 | ### dpi 296 | 297 | Specify the image DPI. It is useful if your image does not contain this information in its metadata. 298 | 299 | ```php 300 | echo (new TesseractOCR('img.png')) 301 | ->dpi(300) 302 | ->run(); 303 | ``` 304 | 305 | ### allowlist 306 | 307 | This is a shortcut for `->config('tessedit_char_whitelist', 'abcdef....')`. 308 | 309 | ```php 310 | echo (new TesseractOCR('img.png')) 311 | ->allowlist(range('a', 'z'), range(0, 9), '-_@') 312 | ->run(); 313 | ``` 314 | 315 | ### configFile 316 | 317 | Specify a config file to be used. It can either be the path to your own 318 | config file or the name of one of the predefined config files: 319 | 320 | 321 | ```php 322 | echo (new TesseractOCR('img.png')) 323 | ->configFile('hocr') 324 | ->run(); 325 | ``` 326 | 327 | ### setOutputFile 328 | 329 | Specify an Outputfile to be used. Be aware: If you set an outputfile then 330 | the option `withoutTempFiles` is ignored. 331 | Tempfiles are written (and deleted) even if `withoutTempFiles = true`. 332 | 333 | In combination with `configFile` you are able to get the `hocr`, `tsv` or 334 | `pdf` files. 335 | 336 | ```php 337 | echo (new TesseractOCR('img.png')) 338 | ->configFile('pdf') 339 | ->setOutputFile('/PATH_TO_MY_OUTPUTFILE/searchable.pdf') 340 | ->run(); 341 | ``` 342 | 343 | ### digits 344 | 345 | Shortcut for `->configFile('digits')`. 346 | 347 | ```php 348 | echo (new TesseractOCR('img.png')) 349 | ->digits() 350 | ->run(); 351 | ``` 352 | 353 | ### hocr 354 | 355 | Shortcut for `->configFile('hocr')`. 
356 | 357 | ```php 358 | echo (new TesseractOCR('img.png')) 359 | ->hocr() 360 | ->run(); 361 | ``` 362 | 363 | ### pdf 364 | 365 | Shortcut for `->configFile('pdf')`. 366 | 367 | ```php 368 | echo (new TesseractOCR('img.png')) 369 | ->pdf() 370 | ->run(); 371 | ``` 372 | 373 | ### quiet 374 | 375 | Shortcut for `->configFile('quiet')`. 376 | 377 | ```php 378 | echo (new TesseractOCR('img.png')) 379 | ->quiet() 380 | ->run(); 381 | ``` 382 | 383 | ### tsv 384 | 385 | Shortcut for `->configFile('tsv')`. 386 | 387 | ```php 388 | echo (new TesseractOCR('img.png')) 389 | ->tsv() 390 | ->run(); 391 | ``` 392 | 393 | ### txt 394 | 395 | Shortcut for `->configFile('txt')`. 396 | 397 | ```php 398 | echo (new TesseractOCR('img.png')) 399 | ->txt() 400 | ->run(); 401 | ``` 402 | 403 | ### tempDir 404 | 405 | Define a custom directory to store temporary files generated by tesseract. 406 | Make sure the directory actually exists and the user running `php` is allowed 407 | to write in there. 408 | 409 | ```php 410 | echo (new TesseractOCR('img.png')) 411 | ->tempDir('./my/custom/temp/dir') 412 | ->run(); 413 | ``` 414 | 415 | ### withoutTempFiles 416 | 417 | Specify that `tesseract` should output the recognized text without writing to temporary files. 418 | The data is gathered from the standard output of `tesseract` instead. 
419 | 420 | ```php 421 | echo (new TesseractOCR('img.png')) 422 | ->withoutTempFiles() 423 | ->run(); 424 | ``` 425 | 426 | ### Other options 427 | 428 | Any configuration option offered by Tesseract can be used like that: 429 | 430 | ```php 431 | echo (new TesseractOCR('img.png')) 432 | ->config('config_var', 'value') 433 | ->config('other_config_var', 'other value') 434 | ->run(); 435 | ``` 436 | 437 | Or like that: 438 | 439 | ```php 440 | echo (new TesseractOCR('img.png')) 441 | ->configVar('value') 442 | ->otherConfigVar('other value') 443 | ->run(); 444 | ``` 445 | 446 | __More info:__ 447 | 448 | ### Thread-limit 449 | 450 | Sometimes, it may be useful to limit the number of threads that tesseract is 451 | allowed to use (e.g. in [this case](https://github.com/tesseract-ocr/tesseract/issues/898)). 452 | Set the maximum number of threads with the `threadLimit` method before calling `run`: 453 | 454 | ```php 455 | echo (new TesseractOCR('img.png')) 456 | ->threadLimit(1) 457 | ->run(); 458 | ``` 459 | 460 | ## How to contribute 461 | 462 | You can contribute to this project by: 463 | 464 | * Opening an [Issue][] if you found a bug or wish to propose a new feature; 465 | * Placing a [Pull Request][] with code that fixes a bug, missing/wrong documentation 466 | or implements a new feature; 467 | 468 | Just make sure you take a look at our [Code of Conduct][] and [Contributing][] 469 | instructions. 470 | 471 | ## License 472 | 473 | tesseract-ocr-for-php is released under the [MIT License][]. 474 | 475 | 476 |

Made with love in Berlin

477 | 478 | [ci_badge]: https://github.com/thiagoalessio/tesseract-ocr-for-php/workflows/CI/badge.svg?event=push&branch=main 479 | [ci]: https://github.com/thiagoalessio/tesseract-ocr-for-php/actions?query=workflow%3ACI 480 | [appveyor_badge]: https://ci.appveyor.com/api/projects/status/xwy5ls0798iwcim3/branch/main?svg=true 481 | [appveyor]: https://ci.appveyor.com/project/thiagoalessio/tesseract-ocr-for-php/branch/main 482 | [codacy_badge]: https://app.codacy.com/project/badge/Grade/a81aa10012874f23a57df5b492d835f2 483 | [codacy]: https://app.codacy.com/gh/thiagoalessio/tesseract-ocr-for-php/dashboard 484 | [test_coverage_badge]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php/branch/main/graph/badge.svg?token=Y0VnrqiSIf 485 | [test_coverage]: https://codecov.io/gh/thiagoalessio/tesseract-ocr-for-php 486 | [stable_version_badge]: https://img.shields.io/packagist/v/thiagoalessio/tesseract_ocr.svg 487 | [packagist]: https://packagist.org/packages/thiagoalessio/tesseract_ocr 488 | [total_downloads_badge]: https://img.shields.io/packagist/dt/thiagoalessio/tesseract_ocr.svg 489 | [monthly_downloads_badge]: https://img.shields.io/packagist/dm/thiagoalessio/tesseract_ocr.svg 490 | [Tesseract OCR]: https://github.com/tesseract-ocr/tesseract 491 | [Composer]: http://getcomposer.org/ 492 | [windows_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/windows-18.svg 493 | [macos_icon]: https://thiagoalessio.github.io/tesseract-ocr-for-php/images/apple-18.svg 494 | [tesseract_installation_on_windows]: https://github.com/tesseract-ocr/tesseract/wiki#windows 495 | [Capture2Text]: https://chocolatey.org/packages/capture2text 496 | [Chocolatey]: https://chocolatey.org 497 | [MacPorts]: https://www.macports.org 498 | [Homebrew]: https://brew.sh 499 | [@daijiale]: https://github.com/daijiale 500 | [HOCR]: https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#hocr-output 501 | [TSV]: 
https://github.com/tesseract-ocr/tesseract/wiki/Command-Line-Usage#tsv-output-currently-available-in-305-dev-in-master-branch-on-github 502 | [Issue]: https://github.com/thiagoalessio/tesseract-ocr-for-php/issues 503 | [Pull Request]: https://github.com/thiagoalessio/tesseract-ocr-for-php/pulls 504 | [Code of Conduct]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CODE_OF_CONDUCT.md 505 | [Contributing]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/.github/CONTRIBUTING.md 506 | [MIT License]: https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/MIT-LICENSE 507 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | fixes: 2 | - "/home/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::" 3 | - "/Users/runner/work/tesseract-ocr-for-php/tesseract-ocr-for-php/::" 4 | - "C:\\projects\\tesseract-ocr-for-php\\::" 5 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "thiagoalessio/tesseract_ocr", 3 | "description": "A wrapper to work with Tesseract OCR inside PHP.", 4 | "version": "2.13.0", 5 | "type": "library", 6 | "keywords": ["Tesseract", "OCR", "text recognition"], 7 | "license": "MIT", 8 | "authors": [ 9 | { 10 | "name": "thiagoalessio", 11 | "email": "thiagoalessio@me.com" 12 | } 13 | ], 14 | "support": { 15 | "issues": "https://github.com/thiagoalessio/tesseract-ocr-for-php/issues", 16 | "irc": "irc://irc.freenode.net/tesseract-ocr-for-php", 17 | "source": "https://github.com/thiagoalessio/tesseract-ocr-for-php" 18 | }, 19 | "require": { 20 | "php": "^5.5 || ^7.0 || ^8.0" 21 | }, 22 | "require-dev": { 23 | "phpunit/php-code-coverage": "^2.2.4 || ^9.0.0" 24 | }, 25 | "autoload": { 26 | "psr-4": { 27 | "thiagoalessio\\TesseractOCR\\": 
"src/" 28 | } 29 | }, 30 | "autoload-dev": { 31 | "psr-4": { 32 | "thiagoalessio\\TesseractOCR\\Tests\\": "tests/" 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/Command.php: -------------------------------------------------------------------------------- 1 | image = $image; 19 | $this->outputFile = $outputFile; 20 | } 21 | 22 | public function build() { return "$this"; } 23 | 24 | public function __toString() 25 | { 26 | $cmd = array(); 27 | if ($this->threadLimit) $cmd[] = "OMP_THREAD_LIMIT={$this->threadLimit}"; 28 | $cmd[] = self::escape($this->executable); 29 | $cmd[] = $this->useFileAsInput ? self::escape($this->image) : "-"; 30 | $cmd[] = $this->useFileAsOutput ? self::escape($this->getOutputFile(false)) : "-"; 31 | 32 | $version = $this->getTesseractVersion(); 33 | 34 | foreach ($this->options as $option) { 35 | $cmd[] = is_callable($option) ? $option($version) : "$option"; 36 | } 37 | if ($this->configFile) $cmd[] = $this->configFile; 38 | 39 | return join(' ', $cmd); 40 | } 41 | 42 | public function getOutputFile($withExt=true) 43 | { 44 | if (!$this->outputFile) 45 | $this->outputFile = $this->getTempDir() 46 | .DIRECTORY_SEPARATOR 47 | .basename(tempnam($this->getTempDir(), 'ocr')); 48 | if (!$withExt) return $this->outputFile; 49 | 50 | $hasCustomExt = array('hocr', 'tsv', 'pdf'); 51 | $ext = in_array($this->configFile, $hasCustomExt) ? $this->configFile : 'txt'; 52 | return "{$this->outputFile}.{$ext}"; 53 | } 54 | 55 | public function getTempDir() 56 | { 57 | return $this->tempDir ?: sys_get_temp_dir(); 58 | } 59 | 60 | public function getTesseractVersion() 61 | { 62 | exec(self::escape($this->executable).' --version 2>&1', $output); 63 | $outputParts = explode(' ', $output[0]); 64 | return $outputParts[1]; 65 | } 66 | 67 | public function getAvailableLanguages() 68 | { 69 | exec(self::escape($this->executable) . 
' --list-langs 2>&1', $output); 70 | array_shift($output); 71 | sort($output); 72 | return $output; 73 | } 74 | 75 | public static function escape($str) 76 | { 77 | $charlist = strtoupper(substr(PHP_OS, 0, 3)) == 'WIN' ? '$"`' : '$"\\`'; 78 | return '"'.addcslashes($str, $charlist).'"'; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/FeatureNotAvailableException.php: -------------------------------------------------------------------------------- 1 | NUL 2>&1' 25 | : 'type '.Command::escape($executable).' > /dev/null 2>&1'; 26 | system($cmd, $exitCode); 27 | 28 | if ($exitCode == 0) return; 29 | 30 | $currentPath = getenv('PATH'); 31 | $msg = array(); 32 | $msg[] = "Error! The command \"$executable\" was not found."; 33 | $msg[] = ''; 34 | $msg[] = 'Make sure you have Tesseract OCR installed on your system:'; 35 | $msg[] = 'https://github.com/tesseract-ocr/tesseract'; 36 | $msg[] = ''; 37 | $msg[] = "The current \$PATH is $currentPath"; 38 | $msg = join(PHP_EOL, $msg); 39 | 40 | throw new TesseractNotFoundException($msg); 41 | } 42 | 43 | public static function checkCommandExecution($command, $stdout, $stderr) 44 | { 45 | if ($command->useFileAsOutput) { 46 | $file = $command->getOutputFile(); 47 | if (file_exists($file) && filesize($file) > 0) return; 48 | } 49 | 50 | if (!$command->useFileAsOutput && $stdout) { 51 | return; 52 | } 53 | 54 | $msg = array(); 55 | $msg[] = 'Error! 
The command did not produce any output.'; 56 | $msg[] = ''; 57 | $msg[] = 'Generated command:'; 58 | $msg[] = "$command"; 59 | $msg[] = ''; 60 | $msg[] = 'Returned message:'; 61 | $arrayStderr = explode(PHP_EOL, $stderr); 62 | array_pop($arrayStderr); 63 | $msg = array_merge($msg, $arrayStderr); 64 | $msg = join(PHP_EOL, $msg); 65 | 66 | throw new UnsuccessfulCommandException($msg); 67 | } 68 | 69 | public static function checkProcessCreation($processHandle, $command) 70 | { 71 | if ($processHandle !== FALSE) return; 72 | 73 | $msg = array(); 74 | $msg[] = 'Error! The command could not be launched.'; 75 | $msg[] = ''; 76 | $msg[] = 'Generated command:'; 77 | $msg[] = "$command"; 78 | $msg = join(PHP_EOL, $msg); 79 | 80 | throw new UnsuccessfulCommandException($msg); 81 | } 82 | 83 | public static function checkTesseractVersion($expected, $action, $command) 84 | { 85 | $actual = $command->getTesseractVersion(); 86 | 87 | if ($actual[0] === 'v') 88 | $actual = substr($actual, 1); 89 | 90 | if (version_compare($actual, $expected, ">=")) return; 91 | 92 | $msg = array(); 93 | $msg[] = "Error! $action is not available this tesseract version"; 94 | $msg[] = "Required version is $expected, actual version is $actual"; 95 | $msg[] = ''; 96 | $msg[] = 'Generated command:'; 97 | $msg[] = "$command"; 98 | $msg = join(PHP_EOL, $msg); 99 | 100 | throw new FeatureNotAvailableException($msg); 101 | } 102 | 103 | public static function checkWritePermissions($path) 104 | { 105 | if (!is_dir(dirname($path))) mkdir(dirname($path)); 106 | $writableDirectory = is_writable(dirname($path)); 107 | $writableFile = true; 108 | if (file_exists($path)) $writableFile = is_writable($path); 109 | if ($writableFile && $writableDirectory) return; 110 | 111 | $msg = array(); 112 | $msg[] = "Error! 
No permission to write to $path"; 113 | $msg[] = "Make sure you have the right outputFile and permissions " 114 | ."to write to the folder"; 115 | $msg[] = ''; 116 | $msg = join(PHP_EOL, $msg); 117 | 118 | throw new NoWritePermissionsForOutputFile($msg); 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/ImageNotFoundException.php: -------------------------------------------------------------------------------- 1 | =') ? '-' : '')."-psm $psm"; 10 | }; 11 | } 12 | 13 | public static function oem($oem) 14 | { 15 | return function($version) use ($oem) { 16 | Option::checkMinVersion('3.05', $version, 'oem'); 17 | return "--oem $oem"; 18 | }; 19 | } 20 | 21 | public static function dpi($dpi) 22 | { 23 | return function() use ($dpi) { 24 | return "--dpi $dpi"; 25 | }; 26 | } 27 | 28 | public static function userWords($path) 29 | { 30 | return function($version) use ($path) { 31 | Option::checkMinVersion('3.04', $version, 'user-words'); 32 | return '--user-words "'.addcslashes($path, '\\"').'"'; 33 | }; 34 | } 35 | 36 | public static function userPatterns($path) 37 | { 38 | return function($version) use ($path) { 39 | Option::checkMinVersion('3.04', $version, 'user-patterns'); 40 | return '--user-patterns "'.addcslashes($path, '\\"').'"'; 41 | }; 42 | } 43 | 44 | public static function tessdataDir($path) 45 | { 46 | return function() use ($path) { 47 | return '--tessdata-dir "'.addcslashes($path, '\\"').'"'; 48 | }; 49 | } 50 | 51 | public static function lang() 52 | { 53 | $languages = func_get_args(); 54 | return function() use ($languages) { 55 | return '-l '.join('+', $languages); 56 | }; 57 | } 58 | 59 | public static function config($var, $value) 60 | { 61 | return function() use($var, $value) { 62 | $snakeCase = function($str) { 63 | return strtolower(preg_replace('/([A-Z])+/', '_$1', $str)); 64 | }; 65 | $pair = $snakeCase($var).'='.$value; 66 | return '-c "'.addcslashes($pair, '\\"').'"'; 67 | }; 68 | } 69 
| 70 | public static function checkMinVersion($minVersion, $currVersion, $option) 71 | { 72 | $minVersion = preg_replace('/^v/', '', $minVersion); 73 | $currVersion = preg_replace('/^v/', '', $currVersion); 74 | if (!version_compare($currVersion, $minVersion, '<')) return; 75 | $msg = "$option option is only available on Tesseract $minVersion or later."; 76 | $msg.= PHP_EOL."Your version of Tesseract is $currVersion"; 77 | throw new \Exception($msg); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/Process.php: -------------------------------------------------------------------------------- 1 | startTime = microtime(true); 14 | $streamDescriptors = [ 15 | array("pipe", "r"), 16 | array("pipe", "w"), 17 | array("pipe", "w") 18 | ]; 19 | $this->handle = proc_open($command, $streamDescriptors, $pipes, NULL, NULL, ["bypass_shell" => true]); 20 | list($this->stdin, $this->stdout, $this->stderr) = $pipes; 21 | 22 | FriendlyErrors::checkProcessCreation($this->handle, $command); 23 | 24 | //This is can avoid deadlock on some cases (when stderr buffer is filled up before writing to stdout and vice-versa) 25 | stream_set_blocking($this->stdout, 0); 26 | stream_set_blocking($this->stderr, 0); 27 | } 28 | 29 | public function write($data, $len) 30 | { 31 | $total = 0; 32 | do 33 | { 34 | $res = fwrite($this->stdin, substr($data, $total)); 35 | } while($res && $total += $res < $len); 36 | return $total === $len; 37 | } 38 | 39 | 40 | public function wait($timeout = 0) 41 | { 42 | $running = true; 43 | $data = ["out" => "", "err" => ""]; 44 | while (($running === true) && !$this->hasTimedOut($timeout)) 45 | { 46 | $data["out"] .= fread($this->stdout, 8192); 47 | $data["err"] .= fread($this->stderr, 8192); 48 | $procInfo = proc_get_status($this->handle); 49 | $running = $procInfo["running"]; 50 | if ($running) { 51 | usleep(1000); // Sleep 1ms to yield CPU time 52 | } 53 | } 54 | return $data; 55 | } 56 | 57 | public 
function close() 58 | { 59 | $this->closeStream($this->stdin); 60 | $this->closeStream($this->stdout); 61 | $this->closeStream($this->stderr); 62 | return proc_close($this->handle); 63 | } 64 | 65 | public function closeStdin() 66 | { 67 | $this->closeStream($this->stdin); 68 | } 69 | 70 | private function hasTimedOut($timeout) 71 | { 72 | return (($timeout > 0) && ($this->startTime + $timeout < microtime(true))); 73 | } 74 | 75 | private function closeStream(&$stream) 76 | { 77 | if ($stream !== NULL) 78 | { 79 | fclose($stream); 80 | $stream = NULL; 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/TesseractNotFoundException.php: -------------------------------------------------------------------------------- 1 | command = $command ?: new Command; 15 | $this->image("$image"); 16 | } 17 | 18 | public function run($timeout = 0) 19 | { 20 | try { 21 | if ($this->outputFile !== null) { 22 | FriendlyErrors::checkWritePermissions($this->outputFile); 23 | $this->command->useFileAsOutput = true; 24 | } 25 | 26 | FriendlyErrors::checkTesseractPresence($this->command->executable); 27 | if ($this->command->useFileAsInput) { 28 | FriendlyErrors::checkImagePath($this->command->image); 29 | } 30 | 31 | $process = new Process("{$this->command}"); 32 | 33 | if (!$this->command->useFileAsInput) { 34 | $process->write($this->command->image, $this->command->imageSize); 35 | $process->closeStdin(); 36 | } 37 | $output = $process->wait($timeout); 38 | 39 | FriendlyErrors::checkCommandExecution($this->command, $output["out"], $output["err"]); 40 | } 41 | catch (TesseractOcrException $e) { 42 | if ($this->command->useFileAsOutput) $this->cleanTempFiles(); 43 | throw $e; 44 | } 45 | 46 | if ($this->command->useFileAsOutput) { 47 | $text = file_get_contents($this->command->getOutputFile()); 48 | 49 | if ($this->outputFile !== null) { 50 | rename($this->command->getOutputFile(), $this->outputFile); 51 | } 52 | 53 | 
$this->cleanTempFiles(); 54 | } 55 | else 56 | $text = $output["out"]; 57 | 58 | return trim($text, " \t\n\r\0\x0A\x0B\x0C"); 59 | } 60 | 61 | public function imageData($image, $size) 62 | { 63 | FriendlyErrors::checkTesseractVersion("3.03-rc1", "Reading image data from stdin", $this->command); 64 | $this->command->useFileAsInput = false; 65 | $this->command->image = $image; 66 | $this->command->imageSize = $size; 67 | return $this; 68 | } 69 | 70 | public function withoutTempFiles() 71 | { 72 | FriendlyErrors::checkTesseractVersion("3.03-rc1", "Writing to stdout (without using temp files)", $this->command); 73 | $this->command->useFileAsOutput = false; 74 | return $this; 75 | } 76 | 77 | public function image($image) 78 | { 79 | $this->command->image = $image; 80 | return $this; 81 | } 82 | 83 | public function executable($executable) 84 | { 85 | FriendlyErrors::checkTesseractPresence($executable); 86 | $this->command->executable = $executable; 87 | return $this; 88 | } 89 | 90 | public function configFile($configFile) 91 | { 92 | $this->command->configFile = $configFile; 93 | return $this; 94 | } 95 | 96 | public function tempDir($tempDir) 97 | { 98 | $this->command->tempDir = $tempDir; 99 | return $this; 100 | } 101 | 102 | public function threadLimit($limit) 103 | { 104 | $this->command->threadLimit = $limit; 105 | return $this; 106 | } 107 | 108 | // @deprecated 109 | public function format($fmt) { return $this->configFile($fmt); } 110 | 111 | public function setOutputFile($path) { 112 | $this->outputFile = $path; 113 | return $this; 114 | } 115 | 116 | public function allowlist() 117 | { 118 | $concat = function ($arg) { return is_array($arg) ? 
join('', $arg) : $arg; }; 119 | $allowlist = join('', array_map($concat, func_get_args())); 120 | $this->command->options[] = Option::config('tessedit_char_whitelist', $allowlist); 121 | return $this; 122 | } 123 | 124 | public function whitelist() 125 | { 126 | $warningMsg = 'Notice: whitelist is deprecated, use allowlist instead.'; 127 | trigger_error($warningMsg, E_USER_NOTICE); 128 | 129 | $concat = function ($arg) { return is_array($arg) ? join('', $arg) : $arg; }; 130 | $allowlist = join('', array_map($concat, func_get_args())); 131 | return $this->allowlist($allowlist); 132 | } 133 | 134 | public function version() 135 | { 136 | return $this->command->getTesseractVersion(); 137 | } 138 | 139 | public function availableLanguages() 140 | { 141 | return $this->command->getAvailableLanguages(); 142 | } 143 | 144 | public function __call($method, $args) 145 | { 146 | if ($this->isConfigFile($method)) return $this->configFile($method); 147 | if ($this->isOption($method)) { 148 | $option = $this->getOptionClassName().'::'.$method; 149 | $this->command->options[] = call_user_func_array($option, $args); 150 | return $this; 151 | } 152 | $arg = empty($args) ? 
null : $args[0]; 153 | $this->command->options[] = Option::config($method, $arg); 154 | return $this; 155 | } 156 | 157 | private function isConfigFile($name) 158 | { 159 | return in_array($name, array('digits', 'hocr', 'pdf', 'quiet', 'tsv', 'txt')); 160 | } 161 | 162 | private function isOption($name) 163 | { 164 | return in_array($name, get_class_methods($this->getOptionClassName())); 165 | } 166 | 167 | private function getOptionClassName() 168 | { 169 | return __NAMESPACE__.'\\Option'; 170 | } 171 | 172 | private function cleanTempFiles() 173 | { 174 | if (file_exists($this->command->getOutputFile(false))) { 175 | unlink($this->command->getOutputFile(false)); 176 | } 177 | if (file_exists($this->command->getOutputFile(true))) { 178 | unlink($this->command->getOutputFile(true)); 179 | } 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /src/TesseractOcrException.php: -------------------------------------------------------------------------------- 1 |