├── .gitattributes ├── .gitignore ├── .php-cs-fixer.cache ├── LICENSE.md ├── README.md ├── _config.yml ├── blobs ├── danger.png ├── sticky-notes.png ├── ubuntu.png ├── warning.png └── windows.png ├── composer.json ├── composer.lock ├── config └── textract.php ├── index.md ├── lang └── en │ ├── extractor.php │ ├── file.php │ ├── processor.php │ └── tesseract.php ├── src ├── Concerns │ └── TextractOutput.php ├── Exceptions │ └── TextractException.php ├── ExtractorService │ ├── Contracts │ │ ├── AbstractExtractor.php │ │ ├── AbstractTextExtractor.php │ │ ├── HasPhpWord.php │ │ └── TextProcessorHaveFilter.php │ ├── ExtractorCommonProcessors │ │ ├── PhpPowerPointProcessor.php │ │ ├── PhpSheetProcessor.php │ │ ├── PhpWordProcessor.php │ │ └── TextProcessor.php │ ├── Extractors │ │ ├── HtmlExtractor.php │ │ ├── ImageExtractor.php │ │ ├── MsOfficeDocExtractor.php │ │ ├── MsOfficeDocxExtractor.php │ │ ├── MsOfficeExcelExtractor.php │ │ ├── MsOfficePptxExtractor.php │ │ ├── OpenOfficeDocument.php │ │ ├── OpenOfficeSpreadSheet.php │ │ ├── PdfExtractor.php │ │ ├── RtfExtractor.php │ │ └── TxtExtractor.php │ └── Ocr │ │ ├── Contracts │ │ └── TesseractOcrOptions.php │ │ └── TesseractOcrRun.php ├── Providers │ └── ServiceProvider.php ├── Services │ ├── ConsoleExtractionService.php │ ├── ExtractService.php │ └── UtilsService.php └── Textract.php ├── storage ├── example-multi-languages.png ├── example.doc ├── example.docx ├── example.epub ├── example.ods ├── example.odt ├── example.pdf ├── example.png ├── example.rtf ├── example.txt ├── example.xls ├── example.xlsx └── exmple-mix-ben.pdf └── tests └── ExtractionTest.php /.gitattributes: -------------------------------------------------------------------------------- 1 | *.doc linguist-detectable=false 2 | *.docx linguist-detectable=false 3 | *.ods linguist-detectable=false 4 | *.odt linguist-detectable=false 5 | *.pdf linguist-detectable=false 6 | *.png linguist-detectable=false 7 | *.rtf linguist-detectable=false 8 | *.txt 
linguist-detectable=false 9 | *.xls linguist-detectable=false 10 | *.xlsx linguist-detectable=false 11 | *.php linguist-detectable=true 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Composer template 2 | composer.phar 3 | .idea/* 4 | /vendor/ 5 | 6 | # Commit your application's lock file https://getcomposer.org/doc/01-basic-usage.md#commit-your-composer-lock-file-to-version-control 7 | # You may choose to ignore a library lock file http://getcomposer.org/doc/02-libraries.md#lock-file 8 | # composer.lock 9 | 10 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright 2022 | Niladri Shekhar Mondal 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Packagist](https://img.shields.io/packagist/v/nilgems/laravel-textract)](https://packagist.org/packages/nilgems/laravel-textract) 2 | # Laravel Textract 3 | A [Laravel](https://laravel.com) package to extract text from files like DOC, Excel, Image, Pdf and more. 4 | 5 | # Versions and compatibility 6 | 7 | - [Laravel 10](https://laravel.com) or higher is required. 8 | - [Php 8.2]() or higher is required 9 | 10 | ### Supported file formats 11 | Following file formats is supported currently. You need to install proper extensions 12 | to your server to work with all the following extension related files. The package will 13 | check file content MIME type before execute. 14 | - **HTML** 15 | - **TEXT** 16 | - **DOC** 17 | - **DOCX** 18 | - **XLS**, **XLSX**, **XLSM**, **XLTX**, **XLTM**, **XLT** 19 | - **CSV** 20 | - **PDF** 21 | - **Image** 22 | - _jpeg_ 23 | - _png_ 24 | - _gif_ 25 | - **ODT** 26 | - **ODS** 27 | - **RTF** 28 | - **PPTX** (NEW) 29 | 30 | **We are working hard to make this laravel plugin useful. 
If you found any issue please add a post on discussion.** 31 | 32 | ### Installation 33 | 34 | ``` 35 | composer require nilgems/laravel-textract 36 | ``` 37 | Once installed you can do stuff like this: 38 | ``` 39 | # Run the extractor 40 | $output = Textract::run('/path/to/file.extension'); 41 | 42 | # Display the extracted text 43 | echo $output->text; 44 | 45 | # Display the extracted text word count 46 | echo $output->word_count; 47 | 48 | # Display the extracted text with direct string conversion 49 | echo (string) $output; 50 | ``` 51 | Run the extractor to any supported file: 52 | ``` 53 | Textract::run(string $file_path, [string $job_id],[TesseractOcrOptions $extra_data]); 54 | ``` 55 | | Option | Type | Default value | Required | Description | 56 | |:-----------:|:-------------------------:|:------------------:|:--------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| 57 | | $file_path | ```String``` | _No default value_ | Yes | Text extractable file absolute path. | 58 | | $job_id | ```String``` | ```NULL``` | No | It's a optional parameter. Extraction **job id**. If this option is blank the plugin will auto create the **ID** | 59 | | $extra_data | ```TesseractOcrOptions``` | ```NULL``` | No | It's a optional parameter. To pass extra parameter. If you are extracting a image file, you can mention languages and more by this ```Nilgems\PhpTextract\ExtractorService\Ocr\Contracts\TesseractOcrOptions``` **parameter**. | 60 | 61 | ### Configuration 62 | 63 | - You can add **provider** in ```app.php``` under the ```config``` folder of your 64 | [Laravel](https://laravel.com) project. It's optional, the package automatically load the service provider in your application. 65 | ``` 66 | 'providers' => [ 67 | ... 68 | Nilgems\PhpTextract\Providers\ServiceProvider, 69 | ... 
70 | ] 71 | ``` 72 | - Add **alias** in ```app.php``` under the ```config``` folder of your 73 | [Laravel](https://laravel.com) project. It's optional, the package automatically loads the ```facade``` in your application. 74 | ``` 75 | 'aliases' => [ 76 | ... 77 | 'Textract' => Nilgems\PhpTextract\Textract::class, 78 | ... 79 | ] 80 | ``` 81 | - To publish the ```config``` file, run: 82 | ``` 83 | php artisan vendor:publish --tag=textract 84 | ``` 85 | ### Example 86 | 87 | ##### Example 1: 88 | You can extract text from any supported file format. 89 | 90 | It is recommended to use the extractor with [Laravel Queue Job](https://laravel.com/docs/9.x/queues#creating-jobs) for better performance.

91 | In ```php``` there have a restriction of execution time and memory limit defined in ```php.ini``` file with the option ```max_execution_time``` and ```memory_limit```. If file size is big, the process may kill forcefully when exceed the limit. You can use ```queue - database/redis``` or ```Laravel horizon``` to run the process in background. 92 | ``` 93 | ........ 94 | Route::get('/textract', function(){ 95 | return Textract::run('/path/to/image/example.png'); 96 | }); 97 | ........ 98 | ``` 99 | 100 | ##### Example 2: 101 | If you need to specify languages in image file for better extraction output from image file. 102 | ``` 103 | ........ 104 | Route::get('/textract', function(){ 105 | return Textract::run('/path/to/image/example.png', null, [ 106 | 'lang' => ['eng', 'jpn', 'spa'] 107 | ]); 108 | }); 109 | ........ 110 | ``` 111 | ### Dependencies 112 | - To enable the image extraction feature you need to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) 113 | - To enable the PDF extraction feature you need to install [pdftotext](http://www.xpdfreader.com/download.html) 114 | - To work properly, your server must have following php extensions installed - 115 | - **ext-fileinfo** 116 | - **ext-zip** 117 | - **ext-gd** or **ext-imagick** 118 | - **ext-xml** 119 | ### Tesseract OCR Installation 120 | #### Ubuntu Ubuntu 121 | - Update the system: ```sudo apt update``` 122 | - Add Tesseract OCR 5 PPA to your system: ```sudo add-apt-repository ppa:alex-p/tesseract-ocr-devel``` 123 | - Install Tesseract on Ubuntu 20.04 | 18.04: ```sudo apt install -y tesseract-ocr``` 124 | - Once installation is complete update your system: ```sudo apt update``` 125 | - Verify the installation: ```tesseract --version``` 126 | #### Ubuntu Windows 127 | - There are many [ways](https://github.com/tesseract-ocr/tesseract/wiki#windows) to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) on your system, but if you just want something quick to get up 
and running, I recommend installing the [Capture2Text](https://chocolatey.org/packages/capture2text) package with [Chocolatey](https://chocolatey.org/). 128 | - Choco installation: ```choco install capture2text --version 5.0``` 129 | 130 | **Note: Recent versions of [Capture2Text](https://chocolatey.org/packages/capture2text) stopped shipping the ```tesseract``` binary** 131 | 132 | ### PdfToText Installation 133 | #### Ubuntu Ubuntu 134 | - Update the system: ```sudo apt update``` 135 | - Install PdfToText on Ubuntu 20.04 | 18.04: ```sudo apt-get install poppler-utils``` 136 | - Verify the installation: ```pdftotext -v``` 137 | #### Ubuntu Windows 138 | - Sorry, but ```pdftotext``` is distributed via [poppler](https://poppler.freedesktop.org/), and [poppler](https://poppler.freedesktop.org/) is not yet available for Windows. However, you can install and [use the library through the Windows Subsystem for Linux (WSL)](https://towardsdatascience.com/poppler-on-windows-179af0e50150). Alternatively, you can install [Laravel Homestead](https://laravel.com/docs/9.x/homestead) in your project and, using Vagrant virtualization, run the project on an Ubuntu virtual server. 
139 | 140 | ## License 141 | 142 | [MIT](https://choosealicense.com/licenses/mit/) 143 | 144 | --- 145 | ## 💻 Tech Stack 146 | ![CSS3](https://img.shields.io/badge/css3-%231572B6.svg?style=plastic&logo=css3&logoColor=white) ![PHP](https://img.shields.io/badge/php-%23777BB4.svg?style=plastic&logo=php&logoColor=white) ![HTML5](https://img.shields.io/badge/html5-%23E34F26.svg?style=plastic&logo=html5&logoColor=white) ![JavaScript](https://img.shields.io/badge/javascript-%23323330.svg?style=plastic&logo=javascript&logoColor=%23F7DF1E) ![AWS](https://img.shields.io/badge/AWS-%23FF9900.svg?style=plastic&logo=amazon-aws&logoColor=white) ![Vue.js](https://img.shields.io/badge/vuejs-%2335495e.svg?style=plastic&logo=vuedotjs&logoColor=%234FC08D) ![Vuetify](https://img.shields.io/badge/Vuetify-1867C0?style=plastic&logo=vuetify&logoColor=AEDDFF) ![NPM](https://img.shields.io/badge/NPM-%23000000.svg?style=plastic&logo=npm&logoColor=white) ![jQuery](https://img.shields.io/badge/jquery-%230769AD.svg?style=plastic&logo=jquery&logoColor=white) ![Express.js](https://img.shields.io/badge/express.js-%23404d59.svg?style=plastic&logo=express&logoColor=%2361DAFB) ![Laravel](https://img.shields.io/badge/laravel-%23FF2D20.svg?style=plastic&logo=laravel&logoColor=white) ![NuxtJS](https://img.shields.io/badge/Nuxt-black?style=plastic&logo=nuxt.js&logoColor=white) ![Socket.io](https://img.shields.io/badge/Socket.io-black?style=plastic&logo=socket.io&badgeColor=010101) ![Apache](https://img.shields.io/badge/apache-%23D42029.svg?style=plastic&logo=apache&logoColor=white) ![MariaDB](https://img.shields.io/badge/MariaDB-003545?style=plastic&logo=mariadb&logoColor=white) ![MongoDB](https://img.shields.io/badge/MongoDB-%234ea94b.svg?style=plastic&logo=mongodb&logoColor=white) ![MySQL](https://img.shields.io/badge/mysql-%2300f.svg?style=plastic&logo=mysql&logoColor=white) ![SQLite](https://img.shields.io/badge/sqlite-%2307405e.svg?style=plastic&logo=sqlite&logoColor=white) 
![Inkscape](https://img.shields.io/badge/Inkscape-e0e0e0?style=plastic&logo=inkscape&logoColor=080A13) ![Jira](https://img.shields.io/badge/jira-%230A0FFF.svg?style=plastic&logo=jira&logoColor=white) ![Vagrant](https://img.shields.io/badge/vagrant-%231563FF.svg?style=plastic&logo=vagrant&logoColor=white) 147 | 148 | --- 149 | [![](https://visitcount.itsvg.in/api?id=NilGems&icon=0&color=0)](https://visitcount.itsvg.in) 150 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-merlot -------------------------------------------------------------------------------- /blobs/danger.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/danger.png -------------------------------------------------------------------------------- /blobs/sticky-notes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/sticky-notes.png -------------------------------------------------------------------------------- /blobs/ubuntu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/ubuntu.png -------------------------------------------------------------------------------- /blobs/warning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/warning.png -------------------------------------------------------------------------------- /blobs/windows.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/windows.png -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nilgems/laravel-textract", 3 | "description": "A Laravel package to extract text from files like DOC, XL, Image, Pdf and more. I've developed this package by inspiring \"npm textract\".", 4 | "type": "library", 5 | "keywords": [ 6 | "laravel", 7 | "plugin", 8 | "package", 9 | "text", 10 | "textract", 11 | "extract", 12 | "html", 13 | "csv", 14 | "text", 15 | "pdf", 16 | "docx", 17 | "doc", 18 | "xls", 19 | "xlsx", 20 | "png", 21 | "jpg", 22 | "rtf", 23 | "xml", 24 | "odt", 25 | "ott", 26 | "xlsb", 27 | "xlsm", 28 | "xltx", 29 | "ods" 30 | ], 31 | "require": { 32 | "php": "^8.2", 33 | "ext-fileinfo": "*", 34 | "ext-zip": "*", 35 | "ext-xml": "*", 36 | "ext-gd": "*", 37 | "symfony/process": "^6.4.3", 38 | "phpoffice/phpspreadsheet": "^1.23", 39 | "phpoffice/phpword": "^0.18", 40 | "laravel/framework": "^12.0", 41 | "thiagoalessio/tesseract_ocr": "^2.12", 42 | "html2text/html2text": "^4.3", 43 | "phpoffice/phppresentation": "^1.0" 44 | }, 45 | "require-dev": { 46 | "phpunit/phpunit": "^9.5" 47 | }, 48 | "license": "MIT", 49 | "autoload": { 50 | "psr-4": { 51 | "Nilgems\\PhpTextract\\": "src/" 52 | } 53 | }, 54 | "extra": { 55 | "laravel": { 56 | "providers": [ 57 | "Nilgems\\PhpTextract\\Providers\\ServiceProvider" 58 | ], 59 | "aliases": { 60 | "Textract":"Nilgems\\PhpTextract\\Textract" 61 | } 62 | } 63 | }, 64 | "authors": [ 65 | { 66 | "name": "Niladri Shekhar Mondal", 67 | "email": "nldrmondal35@gmail.com" 68 | } 69 | ], 70 | "minimum-stability": "stable" 71 | } 72 | -------------------------------------------------------------------------------- /config/textract.php: 
-------------------------------------------------------------------------------- 1 | [ 17 | /* 18 | | ------------------------------------------------------------------------------------------------------------- 19 | | OCR enabled or disabled: 20 | | ------------------------------------------------------------------------------------------------------------- 21 | | 22 | | 23 | | Enable or disable the OCR functionality here. By default, the OCR is enabled and the code will check the plugin 24 | | is already installed or not in your server before do any operation. If the plugin is not installed/disabled the image 25 | | file extraction will not work. 26 | | 27 | */ 28 | 'enabled' => env('TEXTRACT_OCR_ENABLED', true), 29 | /* 30 | | ------------------------------------------------------------------------------------------------------------- 31 | | OCR custom executable path 32 | | ------------------------------------------------------------------------------------------------------------- 33 | | 34 | | For more details please visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#executable 35 | | 36 | */ 37 | 'executable_path' => env('TEXTRACT_OCR_EXEC_PATH', null), 38 | 39 | /* 40 | | ------------------------------------------------------------------------------------------------------------- 41 | | OCR inducing recognition 42 | | ------------------------------------------------------------------------------------------------------------- 43 | | 44 | | 45 | | By default, the value is 'null' and OCR will automatically recognise the text and try to extract whole text. 46 | | If you defined the path, the OCR will be able to extract those text that will match with the patterns inside 47 | | the text file. 
48 | | 49 | | 50 | | Pattern example you can write inside the text file: 51 | | 1-\d\d\d-GOOG-441 52 | | www.\n\\\*.com 53 | | 54 | | For more details please visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#inducing-recognition 55 | | 56 | */ 57 | 'text_patterns_path' => env('TEXTRACT_OCR_TEXT_PATTERNS_PATH', null), 58 | 59 | /* 60 | | ------------------------------------------------------------------------------------------------------------- 61 | | OCR thread limit 62 | | ------------------------------------------------------------------------------------------------------------- 63 | | 64 | | 65 | | The value of limit will be a integer value. 0 - Mean all available thread. 66 | | For more details please visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#thread-limit 67 | | 68 | */ 69 | 'thread_limit' => env('TEXTRACT_OCR_THREAD_LIMIT', 0), 70 | 71 | /* 72 | | ------------------------------------------------------------------------------------------------------------- 73 | | OCR custom dictionary text file path. 
74 | | ------------------------------------------------------------------------------------------------------------- 75 | | 76 | | 77 | | By default, the value is 'null' 78 | | Fore more details pleases visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#userpatterns 79 | | 80 | */ 81 | 'text_dictionary_path' => env('TEXTRACT_OCR_TEXT_DICTIONARY_PATH', null), 82 | 83 | /* 84 | |------------------------------------------------------------------------------------------------------------- 85 | | OCR other custom configurations 86 | |------------------------------------------------------------------------------------------------------------- 87 | | 88 | | 89 | | For more details please visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#other-options 90 | | 91 | */ 92 | 'config' => [], 93 | 94 | /* 95 | |------------------------------------------------------------------------------------------------------------- 96 | | OCR Temporary file storage directory 97 | |------------------------------------------------------------------------------------------------------------- 98 | | 99 | | OCR custom temporary folder storage path. Make sure the path have proper permissions to access by PHP. 100 | */ 101 | 'temp_dir' => null 102 | ] 103 | 104 | ]; 105 | -------------------------------------------------------------------------------- /index.md: -------------------------------------------------------------------------------- 1 | [![Packagist](https://img.shields.io/packagist/v/nilgems/laravel-textract)](https://packagist.org/packages/nilgems/laravel-textract) 2 | # Laravel Textract 3 | A [Laravel](https://laravel.com) package to extract text from files like DOC, Excel, Image, Pdf and more. 4 | 5 | # Versions and compatibility 6 | 7 | - [Laravel 8](https://laravel.com) or higher is required. 8 | - [Php 7.4]() or higher is required 9 | 10 | ### Note [Laravel 9](https://laravel.com) support is added. 
11 | 12 | ### Supported file formats 13 | Following file formats is supported currently. You need to install proper extensions 14 | to your server to work with all the following extension related files. The package will 15 | check file content MIME type before execute. 16 | - **HTML** 17 | - **TEXT** 18 | - **DOC** 19 | - **DOCX** 20 | - **XLS**, **XLSX**, **XLSM**, **XLTX**, **XLTM**, **XLT** 21 | - **CSV** 22 | - **PDF** 23 | - **Image** 24 | - _jpeg_ 25 | - _png_ 26 | - **ODT** 27 | - **ODS** 28 | - **RTF** 29 | 30 | Note***GIF*** and ***PPT*** support is under development. 31 | 32 | **We are working hard to make this laravel plugin useful. If you found any issue please add a post on discussion.** 33 | 34 | ### Installation 35 | 36 | ``` 37 | composer require nilgems/laravel-textract 38 | ``` 39 | Once installed you can do stuff like this: 40 | ``` 41 | # Run the extractor 42 | $output = Textract::run('/path/to/file.extension'); 43 | 44 | # Display the extracted text 45 | echo $output->text; 46 | 47 | # Display the extracted text word count 48 | echo $output->word_count; 49 | 50 | # Display the extracted text with direct string conversion 51 | echo (string) $output; 52 | ``` 53 | Run the extractor to any supported file: 54 | ``` 55 | Textract::run(string $file_path, [string $job_id],[array $extra_data]); 56 | ``` 57 | | Option | Type | Default value | Required | Description | 58 | |:-----------:|:------:|:------------------:|:--------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| 59 | | $file_path | String | _No default value_ | Yes | Text extractable file absolute path. | 60 | | $job_id | String | ```NULL``` | No | It's a optional parameter. Extraction **job id**. If this option is blank the plugin will auto create the **ID** | 61 | | $extra_data | array | [] | No | It's a optional parameter. To pass extra parameter. 
If you are extracting a image file, you can mention languages by this **parameter**. ``` ['lang' => ['eng', 'jpn', 'spa']] ``` | 62 | 63 | ### Configuration 64 | 65 | - You can add **provider** in ```app.php``` under the ```config``` folder of your 66 | [Laravel](https://laravel.com) project. It's optional, the package automatically load the service provider in your application. 67 | ``` 68 | 'providers' => [ 69 | ... 70 | Nilgems\PhpTextract\Providers\ServiceProvider, 71 | ... 72 | ] 73 | ``` 74 | - Add **alias** in ```app.php``` under the ```config``` folder of your 75 | [Laravel](https://laravel.com) project. It's optional, the package automatically load the ```facade``` in your application. 76 | ``` 77 | 'aliases' => [ 78 | ... 79 | 'Textract' => Nilgems\PhpTextract\Textract::class, 80 | ... 81 | ] 82 | ``` 83 | ### Example 84 | 85 | ##### Example 1: 86 | You can extract text from supported file format. 87 | 88 | It is recommended to use the extractor with [Laravel Queue Job](https://laravel.com/docs/9.x/queues#creating-jobs) from better performance.

89 | In ```php``` there have a restriction of execution time and memory limit defined in ```php.ini``` file with the option ```max_execution_time``` and ```memory_limit```. If file size is big, the process may kill forcefully when exceed the limit. You can use ```queue - database/redis``` or ```Laravel horizon``` to run the process in background. 90 | ``` 91 | ........ 92 | Route::get('/textract', function(){ 93 | return Textract::run('/path/to/image/example.png'); 94 | }); 95 | ........ 96 | ``` 97 | 98 | ##### Example 2: 99 | If you need to specify languages in image file for better extraction output from image file. 100 | ``` 101 | ........ 102 | Route::get('/textract', function(){ 103 | return Textract::run('/path/to/image/example.png', null, [ 104 | 'lang' => ['eng', 'jpn', 'spa'] 105 | ]); 106 | }); 107 | ........ 108 | ``` 109 | ### Dependencies 110 | - To enable the image extraction feature you need to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) 111 | - To enable the PDF extraction feature you need to install [pdftotext](http://www.xpdfreader.com/download.html) 112 | - To work properly, your server must have following php extensions installed - 113 | - **ext-fileinfo** 114 | - **ext-zip** 115 | - **ext-gd** or **ext-imagick** 116 | - **ext-xml** 117 | ### Tesseract OCR Installation 118 | #### Ubuntu Ubuntu 119 | - Update the system: ```sudo apt update``` 120 | - Add Tesseract OCR 5 PPA to your system: ```sudo add-apt-repository ppa:alex-p/tesseract-ocr-devel``` 121 | - Install Tesseract on Ubuntu 20.04 | 18.04: ```sudo apt install -y tesseract-ocr``` 122 | - Once installation is complete update your system: ```sudo apt update``` 123 | - Verify the installation: ```tesseract --version``` 124 | #### Ubuntu Windows 125 | - There are many [ways](https://github.com/tesseract-ocr/tesseract/wiki#windows) to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) on your system, but if you just want something quick to get up and 
running, I recommend installing the [Capture2Text](https://chocolatey.org/packages/capture2text) package with [Chocolatey](https://chocolatey.org/). 126 | - Choco installation: ```choco install capture2text --version 5.0``` 127 | 128 | **Note: Recent versions of [Capture2Text](https://chocolatey.org/packages/capture2text) stopped shipping the ```tesseract``` binary** 129 | 130 | ### PdfToText Installation 131 | #### Ubuntu Ubuntu 132 | - Update the system: ```sudo apt update``` 133 | - Install PdfToText on Ubuntu 20.04 | 18.04: ```sudo apt-get install poppler-utils``` 134 | - Verify the installation: ```pdftotext -v``` 135 | #### Ubuntu Windows 136 | - Sorry, but ```pdftotext``` is distributed via [poppler](https://poppler.freedesktop.org/), and [poppler](https://poppler.freedesktop.org/) is not yet available for Windows. However, you can install and [use the library through the Windows Subsystem for Linux (WSL)](https://towardsdatascience.com/poppler-on-windows-179af0e50150). Alternatively, you can install [Laravel Homestead](https://laravel.com/docs/9.x/homestead) in your project and, using Vagrant virtualization, run the project on an Ubuntu virtual server. 
137 | 138 | ## License 139 | 140 | [MIT](https://choosealicense.com/licenses/mit/) 141 | 142 | --- 143 | ## 💻 Tech Stack 144 | ![CSS3](https://img.shields.io/badge/css3-%231572B6.svg?style=plastic&logo=css3&logoColor=white) ![PHP](https://img.shields.io/badge/php-%23777BB4.svg?style=plastic&logo=php&logoColor=white) ![HTML5](https://img.shields.io/badge/html5-%23E34F26.svg?style=plastic&logo=html5&logoColor=white) ![JavaScript](https://img.shields.io/badge/javascript-%23323330.svg?style=plastic&logo=javascript&logoColor=%23F7DF1E) ![AWS](https://img.shields.io/badge/AWS-%23FF9900.svg?style=plastic&logo=amazon-aws&logoColor=white) ![Vue.js](https://img.shields.io/badge/vuejs-%2335495e.svg?style=plastic&logo=vuedotjs&logoColor=%234FC08D) ![Vuetify](https://img.shields.io/badge/Vuetify-1867C0?style=plastic&logo=vuetify&logoColor=AEDDFF) ![NPM](https://img.shields.io/badge/NPM-%23000000.svg?style=plastic&logo=npm&logoColor=white) ![jQuery](https://img.shields.io/badge/jquery-%230769AD.svg?style=plastic&logo=jquery&logoColor=white) ![Express.js](https://img.shields.io/badge/express.js-%23404d59.svg?style=plastic&logo=express&logoColor=%2361DAFB) ![Laravel](https://img.shields.io/badge/laravel-%23FF2D20.svg?style=plastic&logo=laravel&logoColor=white) ![NuxtJS](https://img.shields.io/badge/Nuxt-black?style=plastic&logo=nuxt.js&logoColor=white) ![Socket.io](https://img.shields.io/badge/Socket.io-black?style=plastic&logo=socket.io&badgeColor=010101) ![Apache](https://img.shields.io/badge/apache-%23D42029.svg?style=plastic&logo=apache&logoColor=white) ![MariaDB](https://img.shields.io/badge/MariaDB-003545?style=plastic&logo=mariadb&logoColor=white) ![MongoDB](https://img.shields.io/badge/MongoDB-%234ea94b.svg?style=plastic&logo=mongodb&logoColor=white) ![MySQL](https://img.shields.io/badge/mysql-%2300f.svg?style=plastic&logo=mysql&logoColor=white) ![SQLite](https://img.shields.io/badge/sqlite-%2307405e.svg?style=plastic&logo=sqlite&logoColor=white) 
![Inkscape](https://img.shields.io/badge/Inkscape-e0e0e0?style=plastic&logo=inkscape&logoColor=080A13) ![Jira](https://img.shields.io/badge/jira-%230A0FFF.svg?style=plastic&logo=jira&logoColor=white) ![Vagrant](https://img.shields.io/badge/vagrant-%231563FF.svg?style=plastic&logo=vagrant&logoColor=white) 145 | 146 | --- 147 | [![](https://visitcount.itsvg.in/api?id=NilGems&icon=0&color=0)](https://visitcount.itsvg.in) 148 | -------------------------------------------------------------------------------- /lang/en/extractor.php: -------------------------------------------------------------------------------- 1 | 'The content of file in {path} path with extension .{extension} is|are match with MIME type {mime_types}', 4 | 'error_supported_extension_not_defined' => 'Supported extension is not defined in extractor.', 5 | 'error_pdf_of_extension_not_installed' => '\'pdftotext\' does not appear to be installed. Please check the documentation - https://github.com/NilGems/laravel-textract#pdftotext-installation' 6 | ]; 7 | -------------------------------------------------------------------------------- /lang/en/file.php: -------------------------------------------------------------------------------- 1 | 'The provided file path is invalid, any file is not available in {path} path. Please can check the file have proper permission or exists.' 4 | ]; 5 | -------------------------------------------------------------------------------- /lang/en/processor.php: -------------------------------------------------------------------------------- 1 | 'The provided file of {path} path is not readable.' 4 | ]; 5 | -------------------------------------------------------------------------------- /lang/en/tesseract.php: -------------------------------------------------------------------------------- 1 | '\'tesseract\' does not appear to be installed. 
Please check the document - https://github.com/NilGems/laravel-textract#tesseract-ocr-installation', 4 | 'error_file_extension_txt_required' => 'Only \'text\' file is supported. Please provide file with .txt extension.', 5 | 'error_input_invalid' => 'Invalid value is provided' 6 | ]; -------------------------------------------------------------------------------- /src/Concerns/TextractOutput.php: -------------------------------------------------------------------------------- 1 | collection = new Collection([ 22 | 'text' => htmlspecialchars($raw_output, ENT_NOQUOTES, "UTF-8"), 23 | 'word_count' => str_word_count(utf8_decode($raw_output), 0) 24 | ]); 25 | } 26 | 27 | /** 28 | * To array 29 | * @return array 30 | */ 31 | public function toArray(): array 32 | { 33 | return $this->collection->toArray(); 34 | } 35 | 36 | public function __get(string $key) 37 | { 38 | return $this->collection->get($key); 39 | } 40 | 41 | public function __set(string $key, string $value) 42 | { 43 | $this->collection->put($key, $value); 44 | } 45 | 46 | public function __isset(string $key) 47 | { 48 | return $this->collection->has($key); 49 | } 50 | 51 | /** 52 | * To string 53 | * @return string 54 | */ 55 | public function __toString(): string 56 | { 57 | return $this->collection->get('text'); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/Exceptions/TextractException.php: -------------------------------------------------------------------------------- 1 | data = new Collection([]); 42 | } 43 | 44 | /** 45 | * Set data 46 | * @param $key 47 | * @param $value 48 | * @return $this 49 | */ 50 | public function setData($key, $value): self 51 | { 52 | $this->data->put($key, $value); 53 | return $this; 54 | } 55 | /** 56 | * Get accept mime types 57 | * @return array 58 | */ 59 | public function getAcceptMimeTypes(): array 60 | { 61 | if (method_exists($this, 'mimeAccepts')) { 62 | return $this->mimeAccepts(); 63 | } 64 | return 
$this->mime_accepts; 65 | } 66 | 67 | /** 68 | * Get acceptable extensions 69 | * @return array 70 | */ 71 | public function getAcceptExtensions(): array 72 | { 73 | return $this->extractor_supported_extension; 74 | } 75 | /** 76 | * Has match mime type 77 | * @param string $mime_type 78 | * @return bool 79 | */ 80 | public function hasMatchMimeType(string $mime_type): bool 81 | { 82 | $acceptable_mime_type = $this->getAcceptMimeTypes(); 83 | if (empty($acceptable_mime_type)) { 84 | return true; 85 | } 86 | return in_array(strtolower($mime_type), $acceptable_mime_type, true); 87 | } 88 | 89 | /** 90 | * @param string $file_path 91 | * @param array $data 92 | * @return string|null 93 | * @throws TextractException 94 | */ 95 | public function boot(string $file_path, array $data = []): ?string 96 | { 97 | $this->file_path = $file_path; 98 | $this->data = $this->data->merge($data); 99 | $utilsService = app(UtilsService::class)->setFilePath($file_path); 100 | $utilsService->setFilePath($file_path); 101 | $this->current_mime_type = $utilsService->getFileMimeType(); 102 | if (!$this->hasMatchMimeType($this->current_mime_type)) { 103 | throw new TextractException( 104 | $this->extractor_name . 105 | ' unable to process the file. Please ensure the content of file is a ' . 106 | implode('/', $this->extractor_supported_extension) . 'file.' 
107 | ); 108 | } 109 | $has_valid = $this->checkHaveProviderPackage(); 110 | if ($has_valid) { 111 | return $this->getTextFromFile(); 112 | } 113 | throw new TextractException($this->error_message); 114 | } 115 | 116 | abstract protected function checkHaveProviderPackage(); 117 | 118 | abstract protected function getTextFromFile(); 119 | } 120 | -------------------------------------------------------------------------------- /src/ExtractorService/Contracts/AbstractTextExtractor.php: -------------------------------------------------------------------------------- 1 | utilsService = $utilsService; 36 | if ($this->hasSupportedExtensionDefined() && $this->utilsService->getFilePath() && $this->hasMatchMimeType()) { 37 | return $this->getExtractedText(); 38 | } 39 | return ""; 40 | } 41 | 42 | /** 43 | * Check the supported file format is defined or not. 44 | * @return bool 45 | * @throws TextractException 46 | */ 47 | private function hasSupportedExtensionDefined(): bool 48 | { 49 | if (!empty($this->supported_extension)) { 50 | return true; 51 | } 52 | throw new TextractException(trans('textract::extractor.error_supported_extension_not_defined')); 53 | } 54 | 55 | /** 56 | * Check the mime type of file provided via path is match or not 57 | * @return bool 58 | * @throws TextractException 59 | */ 60 | private function hasMatchMimeType(): bool 61 | { 62 | $current_file_mime_type = strtolower($this->utilsService->getFileMimeType()); 63 | $is_match_mime_type = collect($this->supported_mime_types) 64 | ->transform(function ($mime_type) { 65 | return strtolower($mime_type); 66 | }) 67 | ->filter(function ($mime_type) use ($current_file_mime_type) { 68 | return Str::of($mime_type)->exactly($current_file_mime_type); 69 | }) 70 | ->count() > 0; 71 | if (!$is_match_mime_type) { 72 | throw new TextractException(trans_choice('textract::extractor.error_mime_mismatch', count($this->supported_extension), [ 73 | 'path' => $this->utilsService->getFilePath(), 74 | 'extension' => 
implode(', .', $this->supported_extension), 75 | 'mime_types' => implode(', ', $this->supported_mime_types) 76 | ])); 77 | } 78 | return true; 79 | } 80 | 81 | abstract protected function getExtractedText(): string; 82 | } 83 | -------------------------------------------------------------------------------- /src/ExtractorService/Contracts/HasPhpWord.php: -------------------------------------------------------------------------------- 1 | getSections() as $section) { 21 | $elements = $section->getElements(); 22 | $data = [...$data, ...$this->getElementText($elements)]; 23 | } 24 | return implode(" ", array_filter($data)); 25 | } 26 | 27 | /** 28 | * @param array $elements 29 | * @return array 30 | */ 31 | protected function getElementText(array $elements): array 32 | { 33 | $docs = []; 34 | foreach ($elements as $element) { 35 | if ($element instanceof PhpWordElementText) { 36 | $docs[] = trim($element->getText()); 37 | } 38 | if ($element instanceof PhpWordElementTextRun) { 39 | $nested_data = $this->getElementText($element->getElements()); 40 | $docs = [...$docs, ...$nested_data]; 41 | } 42 | } 43 | return $docs; 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/ExtractorService/Contracts/TextProcessorHaveFilter.php: -------------------------------------------------------------------------------- 1 | utilsService->getFilePath(); 20 | $reader = IOFactory::createReader($this->readerType); 21 | try { 22 | $presentation = $reader->load($file_path); 23 | return count($presentation->getAllSlides()) > 0; 24 | } catch (\Exception $exception) { 25 | throw $exception; 26 | report($exception); 27 | throw new TextractException(trans('textract::processor.error_unable_to_read', [ 28 | 'path' => $this->utilsService->getFilePath() 29 | ])); 30 | } 31 | 32 | } 33 | 34 | /** 35 | * @throws TextractException 36 | * @throws \PhpOffice\PhpSpreadsheet\Reader\Exception 37 | */ 38 | protected function getExtractedText(): string 
39 | { 40 | if ($this->hasReadable()) { 41 | $data_iterable = []; 42 | $reader = IOFactory::createReader($this->readerType); 43 | $presentation = $reader->load($this->utilsService->getFilePath()); 44 | foreach ($presentation->getAllSlides() as $slide) { 45 | $shapes = $slide->getShapeCollection(); 46 | foreach ($shapes as $shape_k => $shape_v) { 47 | $shape = $shapes[$shape_k]; 48 | if($shape instanceof Shape\RichText){ 49 | $paragraphs = $shapes[$shape_k]->getParagraphs(); 50 | foreach ($paragraphs as $paragraph_k => $paragraph_v) { 51 | $text_elements = $paragraph_v->getRichTextElements(); 52 | foreach ($text_elements as $text_element_k => $text_element_v) { 53 | $data_iterable[] = $text_element_v->getText(); 54 | } 55 | } 56 | } 57 | } 58 | } 59 | return implode("\n", $data_iterable); 60 | } 61 | return ""; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/ExtractorService/ExtractorCommonProcessors/PhpSheetProcessor.php: -------------------------------------------------------------------------------- 1 | utilsService->getFilePath(); 19 | $has_readable = IOFactory::createReaderForFile($file_path)->canRead($file_path); 20 | if ($has_readable) { 21 | return true; 22 | } 23 | throw new TextractException(trans('textract::processor.error_unable_to_read', [ 24 | 'path' => $this->utilsService->getFilePath() 25 | ])); 26 | } 27 | 28 | /** 29 | * @throws TextractException 30 | * @throws \PhpOffice\PhpSpreadsheet\Reader\Exception 31 | */ 32 | protected function getExtractedText(): string 33 | { 34 | if ($this->hasReadable()) { 35 | $data_iterable = []; 36 | $spreadsheet = IOFactory::load($this->utilsService->getFilePath()); 37 | foreach ($spreadsheet->getAllSheets() as $sheet) { 38 | foreach ($sheet->toArray() as $item) { 39 | $data_iterable[] = implode(',', array_filter($item)); 40 | } 41 | } 42 | return implode("\n", $data_iterable); 43 | } 44 | return ""; 45 | } 46 | } 47 | 
-------------------------------------------------------------------------------- /src/ExtractorService/ExtractorCommonProcessors/PhpWordProcessor.php: -------------------------------------------------------------------------------- 1 | hasReadable()) { 23 | return $this->getSectionsText(); 24 | } 25 | return ""; 26 | } 27 | 28 | /** 29 | * Has the file is readable 30 | * @return bool 31 | * @throws TextractException 32 | * @throws PhpWordException|TextractException 33 | */ 34 | private function hasReadable(): bool 35 | { 36 | $has_read_permission = IOFactory::createReader($this->reader_name) 37 | ->canRead($this->utilsService->getFilePath()); 38 | if ($has_read_permission) { 39 | return true; 40 | } 41 | throw new TextractException(trans('textract::processor.error_unable_to_read', [ 42 | 'path' => $this->utilsService->getFilePath() 43 | ])); 44 | } 45 | 46 | /** 47 | * Collect section wise text from the Word file 48 | * @return string 49 | * @throws TextractException 50 | */ 51 | protected function getSectionsText(): string 52 | { 53 | $output = []; 54 | $phpWord = IOFactory::load($this->utilsService->getFilePath(), $this->reader_name); 55 | foreach ($phpWord->getSections() as $section) { 56 | $elements = $section->getElements(); 57 | $output[] = $this->getElementText($elements); 58 | } 59 | return implode(" ", array_filter($output)); 60 | } 61 | 62 | /** 63 | * @param array $elements 64 | * @return string 65 | */ 66 | protected function getElementText(array $elements): string 67 | { 68 | $output = []; 69 | foreach ($elements as $element) { 70 | if ($element instanceof PhpWordElementText) { 71 | $output[] = trim($element->getText()); 72 | } 73 | if ($element instanceof PhpWordElementTextRun) { 74 | $output[] = $this->getElementText($element->getElements()); 75 | } 76 | } 77 | return implode(" ", $output); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- 
/src/ExtractorService/ExtractorCommonProcessors/TextProcessor.php: -------------------------------------------------------------------------------- 1 | hasReadable()) { 18 | $file_size = filesize($this->utilsService->getFilePath()); 19 | $read_data = fread($file_resource, $file_size); 20 | fclose($file_resource); 21 | if ($this instanceof TextProcessorHaveFilter) { 22 | return $this->getExtractedText($read_data); 23 | } 24 | return $read_data; 25 | } 26 | return ""; 27 | } 28 | 29 | /** 30 | * @return resource 31 | * @throws TextractException 32 | */ 33 | private function hasReadable() 34 | { 35 | if ($file_resource = fopen($this->utilsService->getFilePath(), 'rb')) { 36 | return $file_resource; 37 | } 38 | throw new TextractException(trans('textract::processor.error_unable_to_read', [ 39 | 'path' => $this->utilsService->getFilePath() 40 | ])); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/ExtractorService/Extractors/HtmlExtractor.php: -------------------------------------------------------------------------------- 1 | getText(); 26 | } 27 | return ""; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/ExtractorService/Extractors/ImageExtractor.php: -------------------------------------------------------------------------------- 1 | ocrOptions = $ocrOptions; 33 | return $this; 34 | } 35 | 36 | /** 37 | * @return string 38 | * @throws \Nilgems\PhpTextract\Exceptions\TextractException 39 | * @throws \thiagoalessio\TesseractOCR\TesseractOcrException 40 | */ 41 | protected function getExtractedText(): string 42 | { 43 | return app(TesseractOcrRun::class) 44 | ->boot($this->utilsService, $this->ocrOptions); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/ExtractorService/Extractors/MsOfficeDocExtractor.php: -------------------------------------------------------------------------------- 1 | 
hasOsExtensionInstalled()) { 24 | $file_path = $this->utilsService->getFilePath(); 25 | $process = new Process(['pdftotext', '-layout', $file_path , '-']); 26 | $process->start(); 27 | $process->wait(); 28 | return $this->getFilteredOutput($process); 29 | } 30 | return ""; 31 | } 32 | 33 | /** 34 | * Has 'pdftotext' extension is installed or enabled in OS. 35 | * @return bool 36 | * @throws TextractException 37 | */ 38 | private function hasOsExtensionInstalled(): bool 39 | { 40 | $process = new Process(['pdftotext', '-v']); 41 | $process->start(); 42 | $process->wait(); 43 | $output = $this->getFilteredOutput($process); 44 | $has_extension = (bool) preg_match('/pdftotext([\s]+)version/', $output); 45 | if ($has_extension) { 46 | return true; 47 | } 48 | throw new TextractException(trans('extractor.error_pdf_of_extension_not_installed')); 49 | } 50 | 51 | /** 52 | * @param Process $process 53 | * @return string 54 | */ 55 | private function getFilteredOutput(Process $process): string 56 | { 57 | $output = $process->getOutput(); 58 | $output_error = $process->getErrorOutput(); 59 | if (!empty($output)) { 60 | return $output; 61 | } 62 | return $output_error; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/ExtractorService/Extractors/RtfExtractor.php: -------------------------------------------------------------------------------- 1 | options = [ 16 | 'executable' => config('textract.ocr.executable_path', null), 17 | 'tempDir' => config('textract.ocr.temp_dir', null), 18 | 'userWords' => config('textract.ocr.text_dictionary_path'), 19 | 'userPatterns' => config('textract.ocr.text_patterns_path'), 20 | 'lang' => [], 21 | 'allowlist' => [], 22 | 'configVar' => config('textract.ocr.config'), 23 | 'psm' => null, 24 | 'dpi' => null, 25 | 'threadLimit' => config('textract.ocr.thread_limit'), 26 | 27 | ]; 28 | } 29 | 30 | /** 31 | * @param string $path 32 | * @return $this 33 | */ 34 | public function 
setTempDir(string $path): self 35 | { 36 | $this->options['tempDir'] = $path; 37 | return $this; 38 | } 39 | 40 | /** 41 | * Add languages 42 | * @param array $language 43 | * @return $this 44 | */ 45 | public function setLanguage(array $language): self 46 | { 47 | $this->options['lang'] = $language; 48 | return $this; 49 | } 50 | 51 | /** 52 | * @param int $psm 53 | * @return $this 54 | */ 55 | public function setPsm(int $psm): self 56 | { 57 | $this->options['psm'] = $psm; 58 | return $this; 59 | } 60 | 61 | /** 62 | * @param array $list 63 | * @return $this 64 | */ 65 | public function setAllowList(array $list): self 66 | { 67 | $this->options['allowlist'] = $list; 68 | return $this; 69 | } 70 | 71 | public function toArray(): array 72 | { 73 | return array_filter($this->options, static function ($option_value) { 74 | return !empty($option_value); 75 | }); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/ExtractorService/Ocr/TesseractOcrRun.php: -------------------------------------------------------------------------------- 1 | utilsService = $utilsService; 25 | $is_enabled = config('textract.ocr.enabled', false); 26 | if ($is_enabled && $this->hasOsExtension() && $this->utilsService->getFilePath()) { 27 | return $this->getOcr($ocrOptions)->run(); 28 | } 29 | return ""; 30 | } 31 | 32 | protected function getOcr(TesseractOcrOptions $ocrOptions = null): TesseractOCR 33 | { 34 | if ($ocrOptions === null) { 35 | $ocrOptions = new TesseractOcrOptions(); 36 | } 37 | $ocr = new TesseractOCR($this->utilsService->getFilePath()); 38 | $ocr->withoutTempFiles(); 39 | if ($ocrOptions) { 40 | foreach ($ocrOptions->toArray() as $option_key => $option_value) { 41 | if (is_array($option_value) || is_iterable($option_value)) { 42 | $ocr->{$option_key}(...$option_value); 43 | } else { 44 | $ocr->{$option_key}($option_value); 45 | } 46 | } 47 | } 48 | return $ocr; 49 | } 50 | 51 | /** 52 | * @return bool 53 | * @throws 
TextractException 54 | */ 55 | protected function hasOsExtension(): bool 56 | { 57 | $tesseractPath = config('textract.ocr.executable_path', 'tesseract'); // C:\Program Files\Tesseract-OCR\tesseract.exe 58 | $process = new Process([$tesseractPath, '-v']); 59 | $process->start(); 60 | $process->wait(); 61 | $output = $this->getConsoleOutput($process); 62 | $has_installed = (bool) preg_match('/tesseract([\s]+)((v)?[0-9.]+)/', $output); 63 | if ($has_installed) { 64 | return true; 65 | } 66 | throw new TextractException(trans('textract::tesseract.error_not_installed')); 67 | } 68 | 69 | /** 70 | * @param Process $process 71 | * @return string 72 | */ 73 | protected function getConsoleOutput(Process $process): string 74 | { 75 | if ($output = $process->getOutput()) { 76 | return $output; 77 | } 78 | return $process->getErrorOutput(); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/Providers/ServiceProvider.php: -------------------------------------------------------------------------------- 1 | publishes([ 25 | __DIR__ . '/../../config/textract.php' => config_path('textract.php') 26 | ], 'textract'); 27 | } 28 | 29 | /** 30 | * Register services 31 | * @return void 32 | */ 33 | public function register(): void 34 | { 35 | $this->mergeConfigFrom(__DIR__ . '/../../config/textract.php', 'textract'); 36 | $this->loadTranslationsFrom(__DIR__ . 
'/../../lang', 'textract'); 37 | 38 | $this->app->bind(UtilsService::class); 39 | $this->app->bind('textract', ExtractService::class); 40 | $this->app->bind(ConsoleExtractionService::class); 41 | 42 | $this->registerExtractors(); 43 | } 44 | 45 | /** 46 | * Register extractors to the application 47 | * @return void 48 | */ 49 | protected function registerExtractors(): void 50 | { 51 | $extractors = [ 52 | HtmlExtractor::class, 53 | ImageExtractor::class, 54 | MsOfficeDocExtractor::class, 55 | MsOfficeDocxExtractor::class, 56 | MsOfficePptxExtractor::class, 57 | OpenOfficeDocument::class, 58 | OpenOfficeSpreadSheet::class, 59 | PdfExtractor::class, 60 | RtfExtractor::class, 61 | TxtExtractor::class 62 | ]; 63 | foreach ($extractors as $extractor) { 64 | $this->app->bind($extractor); 65 | } 66 | 67 | $this->app->tag($extractors, 'extractors'); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/Services/ConsoleExtractionService.php: -------------------------------------------------------------------------------- 1 | file_path = $file_path; 40 | $this->job_id = (string) ($job_id ?? Str::uuid()); 41 | $this->utilsService = app(UtilsService::class); 42 | $this->utilsService->setFilePath($this->file_path); 43 | $output = $this->utilsService->getExtractor()->boot($this->utilsService); 44 | return new TextractOutput($output); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/Services/ExtractService.php: -------------------------------------------------------------------------------- 1 | file_path = $file_path; 20 | $this->job_id = (string) ($job_id ?? 
Str::uuid()); 21 | return app(ConsoleExtractionService::class) 22 | ->boot($this->file_path, $this->job_id); 23 | } 24 | 25 | /** 26 | * Get file path 27 | * @return string 28 | */ 29 | public function getFilePath(): string 30 | { 31 | return $this->file_path; 32 | } 33 | 34 | /** 35 | * Get job id 36 | * @return string 37 | */ 38 | public function getJobId(): string 39 | { 40 | return $this->job_id; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/Services/UtilsService.php: -------------------------------------------------------------------------------- 1 | file_path = $file_path; 52 | $this->file_name = $this->getFileName(); 53 | $this->file_extension = $this->getFileExtension(); 54 | $this->file_mime_type = $this->getFileMimeType(); 55 | $this->extractor_collection = collect(app()->tagged('extractors')); 56 | $this->supported_file_extensions = (clone $this->extractor_collection) 57 | ->transform(function (AbstractTextExtractor $extractor) { 58 | return $extractor->supported_extension; 59 | }) 60 | ->flatten() 61 | ->toArray(); 62 | return $this; 63 | } 64 | 65 | /** 66 | * Get the file path 67 | * @return string 68 | * @throws TextractException 69 | */ 70 | public function getFilePath(): string 71 | { 72 | if ($this->fileIsExists()) { 73 | return $this->file_path; 74 | } 75 | throw new TextractException(trans('textract::file.error_not_exists', ['path' => $this->file_path])); 76 | } 77 | 78 | /** 79 | * Is the extractable file exists/file path is valid or not 80 | * @return bool 81 | */ 82 | protected function fileIsExists(): bool 83 | { 84 | if (isset($this->file_path)) { 85 | return file_exists($this->file_path); 86 | } 87 | return false; 88 | } 89 | 90 | /** 91 | * Get the extractor 92 | * @return AbstractTextExtractor 93 | * @throws TextractException 94 | */ 95 | public function getExtractor(): AbstractTextExtractor 96 | { 97 | if (isset($this->file_mime_type)) { 98 | $selected_extractor = (clone 
$this->extractor_collection) 99 | ->filter(function (AbstractTextExtractor $extractor) { 100 | return in_array($this->file_extension, $extractor->supported_extension, true); 101 | }); 102 | if ($selected_extractor->count() > 0) { 103 | return $selected_extractor->first(); 104 | } 105 | throw new TextractException( 106 | "Invalid file format. Only support ". 107 | implode('/', $this->supported_file_extensions). 108 | " files" 109 | ); 110 | } 111 | throw new TextractException("Please provide a file to extract text from that."); 112 | } 113 | 114 | /** 115 | * Get the file name from the file path 116 | * @return string|null 117 | */ 118 | public function getFileName(): ?string 119 | { 120 | if (isset($this->file_path)) { 121 | return basename($this->file_path); 122 | } 123 | return null; 124 | } 125 | 126 | /** 127 | * Get file extension from the file path 128 | * @return string|null 129 | */ 130 | public function getFileExtension(): ?string 131 | { 132 | if (isset($this->file_path)) { 133 | return strtolower(pathinfo($this->file_path, PATHINFO_EXTENSION)); 134 | } 135 | return null; 136 | } 137 | 138 | /** 139 | * Get file mime type from the file 140 | * @return string|null 141 | */ 142 | public function getFileMimeType(): ?string 143 | { 144 | if (isset($this->file_path)) { 145 | return mime_content_type($this->file_path); 146 | } 147 | return null; 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /src/Textract.php: -------------------------------------------------------------------------------- 1 | run($path); 15 | $this->assertIsInt($output->word_count); 16 | $this->assertIsString($output->text); 17 | $this->assertNotEmpty($output->text); 18 | } 19 | 20 | public function addExtractionData(): array 21 | { 22 | return [ 23 | 'extracting doc' => [__DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 
'storage/example.xlsx'] 24 | ]; 25 | } 26 | } 27 | --------------------------------------------------------------------------------