├── .gitattributes
├── .gitignore
├── .php-cs-fixer.cache
├── LICENSE.md
├── README.md
├── _config.yml
├── blobs
    ├── danger.png
    ├── sticky-notes.png
    ├── ubuntu.png
    ├── warning.png
    └── windows.png
├── composer.json
├── composer.lock
├── config
    └── textract.php
├── index.md
├── lang
    └── en
    │   ├── extractor.php
    │   ├── file.php
    │   ├── processor.php
    │   └── tesseract.php
├── src
    ├── Concerns
    │   └── TextractOutput.php
    ├── Exceptions
    │   └── TextractException.php
    ├── ExtractorService
    │   ├── Contracts
    │   │   ├── AbstractExtractor.php
    │   │   ├── AbstractTextExtractor.php
    │   │   ├── HasPhpWord.php
    │   │   └── TextProcessorHaveFilter.php
    │   ├── ExtractorCommonProcessors
    │   │   ├── PhpPowerPointProcessor.php
    │   │   ├── PhpSheetProcessor.php
    │   │   ├── PhpWordProcessor.php
    │   │   └── TextProcessor.php
    │   ├── Extractors
    │   │   ├── HtmlExtractor.php
    │   │   ├── ImageExtractor.php
    │   │   ├── MsOfficeDocExtractor.php
    │   │   ├── MsOfficeDocxExtractor.php
    │   │   ├── MsOfficeExcelExtractor.php
    │   │   ├── MsOfficePptxExtractor.php
    │   │   ├── OpenOfficeDocument.php
    │   │   ├── OpenOfficeSpreadSheet.php
    │   │   ├── PdfExtractor.php
    │   │   ├── RtfExtractor.php
    │   │   └── TxtExtractor.php
    │   └── Ocr
    │   │   ├── Contracts
    │   │       └── TesseractOcrOptions.php
    │   │   └── TesseractOcrRun.php
    ├── Providers
    │   └── ServiceProvider.php
    ├── Services
    │   ├── ConsoleExtractionService.php
    │   ├── ExtractService.php
    │   └── UtilsService.php
    └── Textract.php
├── storage
    ├── example-multi-languages.png
    ├── example.doc
    ├── example.docx
    ├── example.epub
    ├── example.ods
    ├── example.odt
    ├── example.pdf
    ├── example.png
    ├── example.rtf
    ├── example.txt
    ├── example.xls
    ├── example.xlsx
    └── exmple-mix-ben.pdf
└── tests
    └── ExtractionTest.php


/.gitattributes:
--------------------------------------------------------------------------------
 1 | *.doc linguist-detectable=false
 2 | *.docx linguist-detectable=false
 3 | *.ods linguist-detectable=false
 4 | *.odt linguist-detectable=false
 5 | *.pdf linguist-detectable=false
 6 | *.png linguist-detectable=false
 7 | *.rtf linguist-detectable=false
 8 | *.txt linguist-detectable=false
 9 | *.xls linguist-detectable=false
10 | *.xlsx linguist-detectable=false
11 | *.php linguist-detectable=true
12 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | ### Composer template
 2 | composer.phar
 3 | .idea/*
 4 | /vendor/
 5 | 
 6 | # Commit your application's lock file https://getcomposer.org/doc/01-basic-usage.md#commit-your-composer-lock-file-to-version-control
 7 | # You may choose to ignore a library lock file http://getcomposer.org/doc/02-libraries.md#lock-file
 8 | # composer.lock
 9 | 
10 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright 2022 | Niladri Shekhar Mondal
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 | 
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 | 
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Packagist](https://img.shields.io/packagist/v/nilgems/laravel-textract)](https://packagist.org/packages/nilgems/laravel-textract)
  2 | # Laravel Textract
  3 | A [Laravel](https://laravel.com) package to extract text from files like DOC, Excel, Image, Pdf and more.
  4 | 
  5 | # Versions and compatibility
  6 | 
  7 | - [Laravel 12](https://laravel.com) or higher is required.
  8 | - [Php 8.2]() or higher is required
  9 | 
 10 | ### Supported file formats
 11 | The following file formats are currently supported. You need to install the proper extensions
 12 | on your server to work with files of the corresponding types. The package will
 13 | check the file content's MIME type before executing.
 14 | - **HTML**
 15 | - **TEXT**
 16 | - **DOC**
 17 | - **DOCX**
 18 | - **XLS**, **XLSX**, **XLSM**, **XLTX**, **XLTM**, **XLT**
 19 | - **CSV**
 20 | - **PDF**
 21 | - **Image**
 22 |   - _jpeg_
 23 |   - _png_
 24 |   - _gif_
 25 | - **ODT**
 26 | - **ODS**
 27 | - **RTF**
 28 | - **PPTX** (NEW)
 29 | 
 30 | **We are working hard to make this Laravel plugin useful. If you find any issue, please add a post in the discussions.**
 31 | 
 32 | ### Installation
 33 | 
 34 | ``` 
 35 | composer require nilgems/laravel-textract
 36 | ```
 37 | Once installed you can do stuff like this:
 38 | ```
 39 | # Run the extractor
 40 | $output = Textract::run('/path/to/file.extension');
 41 | 
 42 | # Display the extracted text
 43 | echo $output->text;
 44 | 
 45 | # Display the extracted text word count
 46 | echo $output->word_count;
 47 | 
 48 | # Display the extracted text with direct string conversion
 49 | echo (string) $output;
 50 | ```
 51 | Run the extractor to any supported file:
 52 | ```
 53 | Textract::run(string $file_path, [string $job_id],[TesseractOcrOptions $extra_data]);
 54 | ```
 55 | |   Option    |           Type            |   Default value    | Required |                                                                                                          Description                                                                                                           |
 56 | |:-----------:|:-------------------------:|:------------------:|:--------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
 57 | | $file_path  |       ```String```        | _No default value_ |   Yes    |                                                                                              Text extractable file absolute path.                                                                                              |
 58 | |   $job_id   |       ```String```        |     ```NULL```     |    No    | It's an optional parameter. Extraction **job id**. If this option is blank, the plugin will automatically create the **ID**. |
 59 | | $extra_data | ```TesseractOcrOptions``` |     ```NULL```     |    No    | It's an optional parameter used to pass extra parameters. If you are extracting an image file, you can specify languages and more with this ```Nilgems\PhpTextract\ExtractorService\Ocr\Contracts\TesseractOcrOptions``` **parameter**. |
 60 | 
 61 | ### Configuration
 62 | 
 63 | - You can add the **provider** in ```app.php``` under the ```config``` folder of your
 64 | [Laravel](https://laravel.com) project. It's optional; the package automatically loads the service provider in your application. 
 65 |   ```
 66 |   'providers' => [
 67 |     ...
 68 |     Nilgems\PhpTextract\Providers\ServiceProvider::class,
 69 |     ...
 70 |   ]
 71 |   ```
 72 | - You can add the **alias** in ```app.php``` under the ```config``` folder of your
 73 |   [Laravel](https://laravel.com) project. It's optional; the package automatically loads the ```facade``` in your application.
 74 |   ```
 75 |   'aliases' => [
 76 |     ...
 77 |     'Textract' => Nilgems\PhpTextract\Textract::class,
 78 |     ...
 79 |   ]
 80 |   ```
 81 | - To publish the ```config``` file, run:
 82 |   ```
 83 |   php artisan vendor:publish --tag=textract
 84 |   ```
 85 | ### Example 
 86 | 
 87 | ##### Example 1: 
 88 | You can extract text from supported file format.
 89 | 
 90 | It is recommended to use the extractor with a [Laravel Queue Job](https://laravel.com/docs/9.x/queues#creating-jobs) for better performance. <br /><br />
 91 | In ```php```, execution time and memory usage are restricted by the ```max_execution_time``` and ```memory_limit``` options defined in the ```php.ini``` file. If the file is big, the process may be forcefully killed when it exceeds one of these limits. You can use ```queue - database/redis``` or ```Laravel horizon``` to run the process in the background.
 92 | ```
 93 | ........
 94 | Route::get('/textract', function(){
 95 |     return Textract::run('/path/to/image/example.png');
 96 | });
 97 | ........
 98 | ```
 99 | 
100 | ##### Example 2:
101 | If you need better extraction output from an image file, you can specify the languages it contains.
102 | ```
103 | ........
104 | Route::get('/textract', function(){
105 |     return Textract::run('/path/to/image/example.png', null, [
106 |       'lang' => ['eng', 'jpn', 'spa']
107 |     ]);
108 | });
109 | ........
110 | ```
111 | ### Dependencies
112 | - To enable the image extraction feature you need to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract)
113 | - To enable the PDF extraction feature you need to install [pdftotext](http://www.xpdfreader.com/download.html)
114 | - To work properly, your server must have following php extensions installed -
115 |   - **ext-fileinfo**
116 |   - **ext-zip**
117 |   - **ext-gd** or **ext-imagick**
118 |   - **ext-xml**
119 | ### Tesseract OCR Installation
120 | #### <img src="https://raw.githubusercontent.com/NilGems/laravel-textract/master/blobs/ubuntu.png" width="12"  alt="Ubuntu" /> Ubuntu
121 | - Update the system: ```sudo apt update```
122 | - Add Tesseract OCR 5 PPA to your system: ```sudo add-apt-repository ppa:alex-p/tesseract-ocr-devel```
123 | - Install Tesseract on Ubuntu 20.04 | 18.04: ```sudo apt install -y tesseract-ocr```
124 | - Once installation is complete update your system: ```sudo apt update```
125 | - Verify the installation: ```tesseract --version```
126 | #### <img src="https://raw.githubusercontent.com/NilGems/laravel-textract/master/blobs/windows.png" width="12"  alt="Windows" /> Windows
127 | - There are many [ways](https://github.com/tesseract-ocr/tesseract/wiki#windows) to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) on your system, but if you just want something quick to get up and running, I recommend installing the [Capture2Text](https://chocolatey.org/packages/capture2text) package with [Chocolatey](https://chocolatey.org/). 
128 | - Choco installation: ```choco install capture2text --version 5.0```
129 | 
130 | **Note: Recent versions of [Capture2Text](https://chocolatey.org/packages/capture2text) stopped shipping the ```tesseract``` binary**
131 | 
132 | ### PdfToText Installation
133 | #### <img src="https://raw.githubusercontent.com/NilGems/laravel-textract/master/blobs/ubuntu.png" width="12"  alt="Ubuntu" /> Ubuntu
134 | - Update the system: ```sudo apt update```
135 | - Install PdfToText on Ubuntu 20.04 | 18.04: ```sudo apt-get install poppler-utils```
136 | - Verify the installation: ```pdftotext -v```
137 | #### <img src="https://raw.githubusercontent.com/NilGems/laravel-textract/master/blobs/windows.png" width="12"  alt="Windows" /> Windows
138 |   - Sorry, but ```pdftotext``` is distributed via [poppler](https://poppler.freedesktop.org/), and [poppler](https://poppler.freedesktop.org/) is not yet available for Windows. However, you can install and [use the library through the Windows Subsystem for Linux (WSL)](https://towardsdatascience.com/poppler-on-windows-179af0e50150). Alternatively, you can install [Laravel Homestead](https://laravel.com/docs/9.x/homestead) in your project and, using Vagrant virtualization, run the project in an Ubuntu virtual server.
139 | 
140 | ## License
141 | 
142 | [MIT](https://choosealicense.com/licenses/mit/)
143 | 
144 | ---
145 | ## 💻 Tech Stack
146 | ![CSS3](https://img.shields.io/badge/css3-%231572B6.svg?style=plastic&logo=css3&logoColor=white) ![PHP](https://img.shields.io/badge/php-%23777BB4.svg?style=plastic&logo=php&logoColor=white) ![HTML5](https://img.shields.io/badge/html5-%23E34F26.svg?style=plastic&logo=html5&logoColor=white) ![JavaScript](https://img.shields.io/badge/javascript-%23323330.svg?style=plastic&logo=javascript&logoColor=%23F7DF1E) ![AWS](https://img.shields.io/badge/AWS-%23FF9900.svg?style=plastic&logo=amazon-aws&logoColor=white) ![Vue.js](https://img.shields.io/badge/vuejs-%2335495e.svg?style=plastic&logo=vuedotjs&logoColor=%234FC08D) ![Vuetify](https://img.shields.io/badge/Vuetify-1867C0?style=plastic&logo=vuetify&logoColor=AEDDFF) ![NPM](https://img.shields.io/badge/NPM-%23000000.svg?style=plastic&logo=npm&logoColor=white) ![jQuery](https://img.shields.io/badge/jquery-%230769AD.svg?style=plastic&logo=jquery&logoColor=white) ![Express.js](https://img.shields.io/badge/express.js-%23404d59.svg?style=plastic&logo=express&logoColor=%2361DAFB) ![Laravel](https://img.shields.io/badge/laravel-%23FF2D20.svg?style=plastic&logo=laravel&logoColor=white) ![NuxtJS](https://img.shields.io/badge/Nuxt-black?style=plastic&logo=nuxt.js&logoColor=white) ![Socket.io](https://img.shields.io/badge/Socket.io-black?style=plastic&logo=socket.io&badgeColor=010101) ![Apache](https://img.shields.io/badge/apache-%23D42029.svg?style=plastic&logo=apache&logoColor=white) ![MariaDB](https://img.shields.io/badge/MariaDB-003545?style=plastic&logo=mariadb&logoColor=white) ![MongoDB](https://img.shields.io/badge/MongoDB-%234ea94b.svg?style=plastic&logo=mongodb&logoColor=white) ![MySQL](https://img.shields.io/badge/mysql-%2300f.svg?style=plastic&logo=mysql&logoColor=white) ![SQLite](https://img.shields.io/badge/sqlite-%2307405e.svg?style=plastic&logo=sqlite&logoColor=white) ![Inkscape](https://img.shields.io/badge/Inkscape-e0e0e0?style=plastic&logo=inkscape&logoColor=080A13) 
![Jira](https://img.shields.io/badge/jira-%230A0FFF.svg?style=plastic&logo=jira&logoColor=white) ![Vagrant](https://img.shields.io/badge/vagrant-%231563FF.svg?style=plastic&logo=vagrant&logoColor=white)
147 | 
148 | ---
149 | [![](https://visitcount.itsvg.in/api?id=NilGems&icon=0&color=0)](https://visitcount.itsvg.in)
150 | 


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-merlot


--------------------------------------------------------------------------------
/blobs/danger.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/danger.png


--------------------------------------------------------------------------------
/blobs/sticky-notes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/sticky-notes.png


--------------------------------------------------------------------------------
/blobs/ubuntu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/ubuntu.png


--------------------------------------------------------------------------------
/blobs/warning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/warning.png


--------------------------------------------------------------------------------
/blobs/windows.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/windows.png


--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "nilgems/laravel-textract",
 3 |     "description": "A Laravel package to extract text from files like DOC, XL, Image, Pdf and more. This package was inspired by \"npm textract\".",
 4 |     "type": "library",
 5 |     "keywords": [
 6 |         "laravel",
 7 |         "plugin",
 8 |         "package",
 9 |         "text",
10 |         "textract",
11 |         "extract",
12 |         "html",
13 |         "csv",
14 |         "text",
15 |         "pdf",
16 |         "docx",
17 |         "doc",
18 |         "xls",
19 |         "xlsx",
20 |         "png",
21 |         "jpg",
22 |         "rtf",
23 |         "xml",
24 |         "odt",
25 |         "ott",
26 |         "xlsb",
27 |         "xlsm",
28 |         "xltx",
29 |         "ods"
30 |     ],
31 |     "require": {
32 |         "php": "^8.2",
33 |         "ext-fileinfo": "*",
34 |         "ext-zip": "*",
35 |         "ext-xml": "*",
36 |         "ext-gd": "*",
37 |         "symfony/process": "^6.4.3",
38 |         "phpoffice/phpspreadsheet": "^1.23",
39 |         "phpoffice/phpword": "^0.18",
40 |         "laravel/framework": "^12.0",
41 |         "thiagoalessio/tesseract_ocr": "^2.12",
42 |         "html2text/html2text": "^4.3",
43 |         "phpoffice/phppresentation": "^1.0"
44 |     },
45 |     "require-dev": {
46 |         "phpunit/phpunit": "^9.5"
47 |     },
48 |     "license": "MIT",
49 |     "autoload": {
50 |         "psr-4": {
51 |             "Nilgems\\PhpTextract\\": "src/"
52 |         }
53 |     },
54 |     "extra": {
55 |         "laravel": {
56 |             "providers": [
57 |                 "Nilgems\\PhpTextract\\Providers\\ServiceProvider"
58 |             ],
59 |             "aliases": {
60 |                 "Textract":"Nilgems\\PhpTextract\\Textract"
61 |             }
62 |         }
63 |     },
64 |     "authors": [
65 |         {
66 |             "name": "Niladri Shekhar Mondal",
67 |             "email": "nldrmondal35@gmail.com"
68 |         }
69 |     ],
70 |     "minimum-stability": "stable"
71 | }
72 | 


--------------------------------------------------------------------------------
/config/textract.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | return [
  4 | 
  5 |     /*
  6 |      | -----------------------------------------------------------------------------------------------------------------
  7 |      |  OCR configurations
  8 |      | -----------------------------------------------------------------------------------------------------------------
  9 |      |
 10 |      | Textract is using "Tesseract OCR" for OCR operations.
 11 |      | You can customize the OCR configuration from here. Generally you don't need to change the configuration;
 12 |      | if you want to, please check the "Tesseract OCR" documentation before making any changes.
 13 |      | To know more details you can visit - https://github.com/thiagoalessio
 14 |      |
 15 |      */
 16 |     'ocr' => [
 17 |         /*
 18 |          | -------------------------------------------------------------------------------------------------------------
 19 |          |  OCR enabled or disabled:
 20 |          | -------------------------------------------------------------------------------------------------------------
 21 |          |
 22 |          |
 23 |          | Enable or disable the OCR functionality here. By default, OCR is enabled and the code will check whether the plugin
 24 |          | is installed on your server before doing any operation. If the plugin is not installed or is disabled, image
 25 |          | file extraction will not work.
 26 |          |
 27 |          */
 28 |         'enabled' => env('TEXTRACT_OCR_ENABLED', true),
 29 |         /*
 30 |          | -------------------------------------------------------------------------------------------------------------
 31 |          | OCR custom executable path
 32 |          | -------------------------------------------------------------------------------------------------------------
 33 |          |
 34 |          | For more details please visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#executable
 35 |          |
 36 |          */
 37 |         'executable_path' => env('TEXTRACT_OCR_EXEC_PATH', null),
 38 | 
 39 |         /*
 40 |          | -------------------------------------------------------------------------------------------------------------
 41 |          |  OCR inducing recognition
 42 |          | -------------------------------------------------------------------------------------------------------------
 43 |          |
 44 |          |
 45 |          | By default, the value is 'null' and OCR will automatically recognise the text and try to extract the whole text.
 46 |          | If you define the path, the OCR will be able to extract the text that matches the patterns inside
 47 |          | the text file.
 48 |          |
 49 |          |
 50 |          | Pattern example you can write inside the text file:
 51 |          | 1-\d\d\d-GOOG-441
 52 |          | www.\n\\\*.com
 53 |          |
 54 |          | For more details please visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#inducing-recognition
 55 |          |
 56 |          */
 57 |         'text_patterns_path' => env('TEXTRACT_OCR_TEXT_PATTERNS_PATH', null),
 58 | 
 59 |         /*
 60 |          | -------------------------------------------------------------------------------------------------------------
 61 |          |  OCR thread limit
 62 |          | -------------------------------------------------------------------------------------------------------------
 63 |          |
 64 |          |
 65 |          | The limit must be an integer value. 0 means all available threads.
 66 |          | For more details please visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#thread-limit
 67 |          |
 68 |          */
 69 |         'thread_limit' => env('TEXTRACT_OCR_THREAD_LIMIT', 0),
 70 | 
 71 |         /*
 72 |          | -------------------------------------------------------------------------------------------------------------
 73 |          |  OCR custom dictionary text file path.
 74 |          | -------------------------------------------------------------------------------------------------------------
 75 |          |
 76 |          |
 77 |          | By default, the value is 'null'
 78 |          | For more details please visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#userpatterns
 79 |          |
 80 |          */
 81 |         'text_dictionary_path' => env('TEXTRACT_OCR_TEXT_DICTIONARY_PATH', null),
 82 | 
 83 |          /*
 84 |           |-------------------------------------------------------------------------------------------------------------
 85 |           | OCR other custom configurations
 86 |           |-------------------------------------------------------------------------------------------------------------
 87 |           |
 88 |           |
 89 |           | For more details please visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#other-options
 90 |           |
 91 |          */
 92 |         'config' => [],
 93 | 
 94 |         /*
 95 |          |-------------------------------------------------------------------------------------------------------------
 96 |          | OCR Temporary file storage directory
 97 |          |-------------------------------------------------------------------------------------------------------------
 98 |          |
 99 |          | OCR custom temporary folder storage path. Make sure PHP has proper permissions to access the path.
100 |          */
101 |         'temp_dir' => null
102 |     ]
103 | 
104 | ];
105 | 


--------------------------------------------------------------------------------
/index.md:
--------------------------------------------------------------------------------
  1 | [![Packagist](https://img.shields.io/packagist/v/nilgems/laravel-textract)](https://packagist.org/packages/nilgems/laravel-textract)
  2 | # Laravel Textract
  3 | A [Laravel](https://laravel.com) package to extract text from files like DOC, Excel, Image, Pdf and more.
  4 | 
  5 | # Versions and compatibility
  6 | 
  7 | - [Laravel 12](https://laravel.com) or higher is required.
  8 | - [Php 8.2]() or higher is required
  9 | 
 10 | ### <img src="./blobs/danger.png?raw=true" alt="Note" width="18"> [Laravel 9](https://laravel.com) support is added.
 11 | 
 12 | ### Supported file formats
 13 | The following file formats are currently supported. You need to install the proper extensions
 14 | on your server to work with files of the corresponding types. The package will
 15 | check the file content's MIME type before executing.
 16 | - **HTML**
 17 | - **TEXT**
 18 | - **DOC**
 19 | - **DOCX**
 20 | - **XLS**, **XLSX**, **XLSM**, **XLTX**, **XLTM**, **XLT**
 21 | - **CSV**
 22 | - **PDF**
 23 | - **Image**
 24 |     - _jpeg_
 25 |     - _png_
 26 | - **ODT**
 27 | - **ODS**
 28 | - **RTF**
 29 | 
 30 | <img src="./blobs/warning.png?raw=true" alt="Note" width="12">***GIF*** and ***PPT*** support is under development.
 31 | 
 32 | **We are working hard to make this Laravel plugin useful. If you find any issue, please add a post in the discussions.**
 33 | 
 34 | ### Installation
 35 | 
 36 | ``` 
 37 | composer require nilgems/laravel-textract
 38 | ```
 39 | Once installed you can do stuff like this:
 40 | ```
 41 | # Run the extractor
 42 | $output = Textract::run('/path/to/file.extension');
 43 | 
 44 | # Display the extracted text
 45 | echo $output->text;
 46 | 
 47 | # Display the extracted text word count
 48 | echo $output->word_count;
 49 | 
 50 | # Display the extracted text with direct string conversion
 51 | echo (string) $output;
 52 | ```
 53 | Run the extractor to any supported file:
 54 | ```
 55 | Textract::run(string $file_path, [string $job_id],[array $extra_data]);
 56 | ```
 57 | |   Option    |  Type  |   Default value    | Required |                                                                                    Description                                                                                     |
 58 | |:-----------:|:------:|:------------------:|:--------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
 59 | | $file_path  | String | _No default value_ |   Yes    |                                                                        Text extractable file absolute path.                                                                        |
 60 | |   $job_id   | String |     ```NULL```     |    No    | It's an optional parameter. Extraction **job id**. If this option is blank, the plugin will automatically create the **ID**. |
 61 | | $extra_data | array  |         []         |    No    | It's an optional parameter used to pass extra parameters. If you are extracting an image file, you can specify languages with this **parameter**. ``` ['lang' => ['eng', 'jpn', 'spa']] ``` |
 62 | 
 63 | ### Configuration
 64 | 
 65 | - You can add the **provider** in ```app.php``` under the ```config``` folder of your
 66 |   [Laravel](https://laravel.com) project. It's optional; the package automatically loads the service provider in your application.
 67 |   ```
 68 |   'providers' => [
 69 |     ...
 70 |     Nilgems\PhpTextract\Providers\ServiceProvider::class,
 71 |     ...
 72 |   ]
 73 |   ```
 74 | - You can add the **alias** in ```app.php``` under the ```config``` folder of your
 75 |   [Laravel](https://laravel.com) project. It's optional; the package automatically loads the ```facade``` in your application.
 76 |   ```
 77 |   'aliases' => [
 78 |     ...
 79 |     'Textract' => Nilgems\PhpTextract\Textract::class,
 80 |     ...
 81 |   ]
 82 |   ```
 83 | ### Example
 84 | 
 85 | ##### Example 1:
 86 | You can extract text from supported file format.
 87 | 
 88 | It is recommended to use the extractor with a [Laravel Queue Job](https://laravel.com/docs/9.x/queues#creating-jobs) for better performance. <br /><br />
 89 | In ```php```, execution time and memory usage are restricted by the ```max_execution_time``` and ```memory_limit``` options defined in the ```php.ini``` file. If the file is big, the process may be forcefully killed when it exceeds one of these limits. You can use ```queue - database/redis``` or ```Laravel horizon``` to run the process in the background.
 90 | ```
 91 | ........
 92 | Route::get('/textract', function(){
 93 |     return Textract::run('/path/to/image/example.png');
 94 | });
 95 | ........
 96 | ```
 97 | 
 98 | ##### Example 2:
 99 | If you need better extraction output from an image file, you can specify the languages it contains.
100 | ```
101 | ........
102 | Route::get('/textract', function(){
103 |     return Textract::run('/path/to/image/example.png', null, [
104 |       'lang' => ['eng', 'jpn', 'spa']
105 |     ]);
106 | });
107 | ........
108 | ```
109 | ### Dependencies
110 | - To enable the image extraction feature you need to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract)
111 | - To enable the PDF extraction feature you need to install [pdftotext](http://www.xpdfreader.com/download.html)
112 | - To work properly, your server must have following php extensions installed -
113 |     - **ext-fileinfo**
114 |     - **ext-zip**
115 |     - **ext-gd** or **ext-imagick**
116 |     - **ext-xml**
117 | ### Tesseract OCR Installation
118 | #### <img src="https://raw.githubusercontent.com/NilGems/laravel-textract/master/blobs/ubuntu.png" width="12"  alt="Ubuntu" /> Ubuntu
119 | - Update the system: ```sudo apt update```
120 | - Add Tesseract OCR 5 PPA to your system: ```sudo add-apt-repository ppa:alex-p/tesseract-ocr-devel```
121 | - Install Tesseract on Ubuntu 20.04 | 18.04: ```sudo apt install -y tesseract-ocr```
122 | - Once installation is complete update your system: ```sudo apt update```
123 | - Verify the installation: ```tesseract --version```
124 | #### <img src="https://raw.githubusercontent.com/NilGems/laravel-textract/master/blobs/windows.png" width="12"  alt="Windows" /> Windows
125 | - There are many [ways](https://github.com/tesseract-ocr/tesseract/wiki#windows) to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) on your system, but if you just want something quick to get up and running, I recommend installing the [Capture2Text](https://chocolatey.org/packages/capture2text) package with [Chocolatey](https://chocolatey.org/).
126 | - Choco installation: ```choco install capture2text --version 5.0```
127 | 
128 | **Note: Recent versions of [Capture2Text](https://chocolatey.org/packages/capture2text) stopped shipping the ```tesseract``` binary**
129 | 
130 | ### PdfToText Installation
131 | #### <img src="https://raw.githubusercontent.com/NilGems/laravel-textract/master/blobs/ubuntu.png" width="12"  alt="Ubuntu" /> Ubuntu
132 | - Update the system: ```sudo apt update```
133 | - Install PdfToText on Ubuntu 20.04 | 18.04: ```sudo apt-get install poppler-utils```
134 | - Verify the installation: ```pdftotext -v```
135 | #### <img src="https://raw.githubusercontent.com/NilGems/laravel-textract/master/blobs/windows.png" width="12"  alt="Windows" /> Windows
136 | - Unfortunately, ```pdftotext``` is distributed as part of [poppler](https://poppler.freedesktop.org/), and [poppler](https://poppler.freedesktop.org/) is not available for Windows yet. However, you can install and [use the library through the Windows Subsystem for Linux (WSL)](https://towardsdatascience.com/poppler-on-windows-179af0e50150). Alternatively, you can install [Laravel Homestead](https://laravel.com/docs/9.x/homestead) in your project and, using Vagrant virtualization, run the project on an Ubuntu virtual server.
137 | 
138 | ## License
139 | 
140 | [MIT](https://choosealicense.com/licenses/mit/)
141 | 
142 | ---
143 | ## 💻 Tech Stack
144 | ![CSS3](https://img.shields.io/badge/css3-%231572B6.svg?style=plastic&logo=css3&logoColor=white) ![PHP](https://img.shields.io/badge/php-%23777BB4.svg?style=plastic&logo=php&logoColor=white) ![HTML5](https://img.shields.io/badge/html5-%23E34F26.svg?style=plastic&logo=html5&logoColor=white) ![JavaScript](https://img.shields.io/badge/javascript-%23323330.svg?style=plastic&logo=javascript&logoColor=%23F7DF1E) ![AWS](https://img.shields.io/badge/AWS-%23FF9900.svg?style=plastic&logo=amazon-aws&logoColor=white) ![Vue.js](https://img.shields.io/badge/vuejs-%2335495e.svg?style=plastic&logo=vuedotjs&logoColor=%234FC08D) ![Vuetify](https://img.shields.io/badge/Vuetify-1867C0?style=plastic&logo=vuetify&logoColor=AEDDFF) ![NPM](https://img.shields.io/badge/NPM-%23000000.svg?style=plastic&logo=npm&logoColor=white) ![jQuery](https://img.shields.io/badge/jquery-%230769AD.svg?style=plastic&logo=jquery&logoColor=white) ![Express.js](https://img.shields.io/badge/express.js-%23404d59.svg?style=plastic&logo=express&logoColor=%2361DAFB) ![Laravel](https://img.shields.io/badge/laravel-%23FF2D20.svg?style=plastic&logo=laravel&logoColor=white) ![NuxtJS](https://img.shields.io/badge/Nuxt-black?style=plastic&logo=nuxt.js&logoColor=white) ![Socket.io](https://img.shields.io/badge/Socket.io-black?style=plastic&logo=socket.io&badgeColor=010101) ![Apache](https://img.shields.io/badge/apache-%23D42029.svg?style=plastic&logo=apache&logoColor=white) ![MariaDB](https://img.shields.io/badge/MariaDB-003545?style=plastic&logo=mariadb&logoColor=white) ![MongoDB](https://img.shields.io/badge/MongoDB-%234ea94b.svg?style=plastic&logo=mongodb&logoColor=white) ![MySQL](https://img.shields.io/badge/mysql-%2300f.svg?style=plastic&logo=mysql&logoColor=white) ![SQLite](https://img.shields.io/badge/sqlite-%2307405e.svg?style=plastic&logo=sqlite&logoColor=white) ![Inkscape](https://img.shields.io/badge/Inkscape-e0e0e0?style=plastic&logo=inkscape&logoColor=080A13) 
![Jira](https://img.shields.io/badge/jira-%230A0FFF.svg?style=plastic&logo=jira&logoColor=white) ![Vagrant](https://img.shields.io/badge/vagrant-%231563FF.svg?style=plastic&logo=vagrant&logoColor=white)
145 | 
146 | ---
147 | [![](https://visitcount.itsvg.in/api?id=NilGems&icon=0&color=0)](https://visitcount.itsvg.in)
148 | 


--------------------------------------------------------------------------------
/lang/en/extractor.php:
--------------------------------------------------------------------------------
1 | <?php
2 | return [
3 |     // Placeholders use Laravel's ":name" syntax. This line is resolved with trans_choice(),
4 |     // so it must not contain "|" (the plural-form separator, which would split the sentence).
5 |     'error_mime_mismatch' => 'The content of the file at :path does not match a MIME type accepted for the .:extension extension(s): :mime_types',
6 |     'error_supported_extension_not_defined' => 'Supported extension is not defined in extractor.',
7 |     'error_pdf_of_extension_not_installed' => '\'pdftotext\' does not appear to be installed. Please check the documentation - https://github.com/NilGems/laravel-textract#pdftotext-installation'
8 | ];
9 | 


--------------------------------------------------------------------------------
/lang/en/file.php:
--------------------------------------------------------------------------------
1 | <?php
2 | return [
3 |     // ":path" follows Laravel's placeholder syntax; a "{path}" token would be printed verbatim.
4 |     'error_not_exists' => 'The provided file path is invalid: no file is available at :path. Please check that the file exists and has the proper permissions.'
5 | ];
6 | 


--------------------------------------------------------------------------------
/lang/en/processor.php:
--------------------------------------------------------------------------------
1 | <?php
2 | return [
3 |     // ":path" is filled from the 'path' replacement the processors pass to trans().
4 |     'error_unable_to_read' => 'The provided file at :path is not readable.'
5 | ];
6 | 


--------------------------------------------------------------------------------
/lang/en/tesseract.php:
--------------------------------------------------------------------------------
1 | <?php
2 | return [ // Error messages for the Tesseract OCR integration; none of them takes a placeholder.
3 |     'error_not_installed' => '\'tesseract\' does not appear to be installed. Please check the document - https://github.com/NilGems/laravel-textract#tesseract-ocr-installation',
4 |     'error_file_extension_txt_required' => 'Only \'text\' file is supported. Please provide file with .txt extension.',
5 |     'error_input_invalid' => 'Invalid value is provided'
6 | ];


--------------------------------------------------------------------------------
/src/Concerns/TextractOutput.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\Concerns;
 4 | 
 5 | use Illuminate\Contracts\Support\Arrayable;
 6 | use Illuminate\Support\Collection;
 7 | 
 8 | /**
 9 |  * Holds the result of an extraction: the HTML-escaped text and its word count.
10 |  *
11 |  * @property-read string $text
12 |  * @property-read int $word_count
13 |  */
14 | class TextractOutput implements Arrayable
15 | {
16 |     /**
17 |      * Backing store for the 'text' and 'word_count' entries.
18 |      */
19 |     protected Collection $collection;
20 | 
21 |     /**
22 |      * @param string $raw_output raw text, handled as UTF-8
23 |      */
24 |     public function __construct(string $raw_output)
25 |     {
26 |         // utf8_decode() is deprecated as of PHP 8.2. mb_convert_encoding() performs the same
27 |         // UTF-8 to ISO-8859-1 conversion; the old call is kept only as a fallback for
28 |         // installations without ext-mbstring.
29 |         $single_byte_text = function_exists('mb_convert_encoding')
30 |             ? mb_convert_encoding($raw_output, 'ISO-8859-1', 'UTF-8')
31 |             : utf8_decode($raw_output);
32 |         $this->collection = new Collection([
33 |             'text' => htmlspecialchars($raw_output, ENT_NOQUOTES, "UTF-8"),
34 |             'word_count' => str_word_count($single_byte_text, 0)
35 |         ]);
36 |     }
37 | 
38 |     /**
39 |      * To array
40 |      * @return array
41 |      */
42 |     public function toArray(): array
43 |     {
44 |         return $this->collection->toArray();
45 |     }
46 | 
47 |     /**
48 |      * Read an entry such as 'text' or 'word_count' from the backing collection.
49 |      */
50 |     public function __get(string $key)
51 |     {
52 |         return $this->collection->get($key);
53 |     }
54 | 
55 |     /**
56 |      * Store or overwrite an entry in the backing collection.
57 |      */
58 |     public function __set(string $key, string $value)
59 |     {
60 |         $this->collection->put($key, $value);
61 |     }
62 | 
63 |     /**
64 |      * Tell whether the backing collection has an entry for the key.
65 |      */
66 |     public function __isset(string $key)
67 |     {
68 |         return $this->collection->has($key);
69 |     }
70 | 
71 |     /**
72 |      * To string
73 |      * @return string the HTML-escaped text
74 |      */
75 |     public function __toString(): string
76 |     {
77 |         // The cast keeps the declared return type honest whatever the collection holds.
78 |         return (string) $this->collection->get('text');
79 |     }
80 | }
81 | 


--------------------------------------------------------------------------------
/src/Exceptions/TextractException.php:
--------------------------------------------------------------------------------
1 | <?php
2 | 
3 | namespace Nilgems\PhpTextract\Exceptions;
4 | 
5 | /** Package-specific exception type thrown by the extractors and processors. */
6 | class TextractException extends \Exception
7 | {
8 | }
9 | 


--------------------------------------------------------------------------------
/src/ExtractorService/Contracts/AbstractExtractor.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Nilgems\PhpTextract\ExtractorService\Contracts;
  4 | 
  5 | use Illuminate\Support\Collection;
  6 | use Nilgems\PhpTextract\Exceptions\TextractException;
  7 | use Nilgems\PhpTextract\Services\UtilsService;
  8 | 
  9 | /**
 10 |  * Base class for extractors that validate the file's MIME type and a provider
 11 |  * check before extracting text.
 12 |  */
 13 | abstract class AbstractExtractor
 14 | {
 15 |     /**
 16 |      * Path of the file handed to boot().
 17 |      * @var string $file_path
 18 |      */
 19 |     protected string $file_path = "";
 20 |     /**
 21 |      * Message of the exception thrown when checkHaveProviderPackage() fails.
 22 |      * @var string $error_message
 23 |      */
 24 |     protected string $error_message = "The extractor plugin is not installed in the system. Please install and try again.";
 25 |     /**
 26 |      * Extractor name used as the prefix of the MIME mismatch error.
 27 |      * @var string $extractor_name
 28 |      */
 29 |     protected string $extractor_name = 'The extractor';
 30 |     /**
 31 |      * File extensions this extractor supports.
 32 |      * @var array $extractor_supported_extension
 33 |      */
 34 |     protected array $extractor_supported_extension = [];
 35 |     /**
 36 |      * Accepted MIME types; an empty list accepts every MIME type.
 37 |      * @var array $mime_accepts
 38 |      */
 39 |     protected array $mime_accepts = [];
 40 |     /**
 41 |      * MIME type detected for the current file during boot().
 42 |      * @var string $current_mime_type
 43 |      */
 44 |     protected string $current_mime_type = "";
 45 |     /**
 46 |      * Extra data supplied through setData() and boot().
 47 |      * @var Collection $data
 48 |      */
 49 |     protected Collection $data;
 50 | 
 51 |     public function __construct()
 52 |     {
 53 |         $this->data = new Collection([]);
 54 |     }
 55 | 
 56 |     /**
 57 |      * Store a single data entry
 58 |      * @param $key
 59 |      * @param $value
 60 |      * @return $this
 61 |      */
 62 |     public function setData($key, $value): self
 63 |     {
 64 |         $this->data->put($key, $value);
 65 |         return $this;
 66 |     }
 67 | 
 68 |     /**
 69 |      * Get the accepted MIME types. A mimeAccepts() method defined on the
 70 |      * concrete class takes precedence over the $mime_accepts property.
 71 |      * @return array
 72 |      */
 73 |     public function getAcceptMimeTypes(): array
 74 |     {
 75 |         if (method_exists($this, 'mimeAccepts')) {
 76 |             return $this->mimeAccepts();
 77 |         }
 78 |         return $this->mime_accepts;
 79 |     }
 80 | 
 81 |     /**
 82 |      * Get acceptable extensions
 83 |      * @return array
 84 |      */
 85 |     public function getAcceptExtensions(): array
 86 |     {
 87 |         return $this->extractor_supported_extension;
 88 |     }
 89 | 
 90 |     /**
 91 |      * Check whether the MIME type is accepted, ignoring letter case
 92 |      * @param string $mime_type
 93 |      * @return bool
 94 |      */
 95 |     public function hasMatchMimeType(string $mime_type): bool
 96 |     {
 97 |         $acceptable_mime_type = $this->getAcceptMimeTypes();
 98 |         if (empty($acceptable_mime_type)) {
 99 |             return true;
100 |         }
101 |         // Normalise both sides so mixed-case entries in the accept list still match.
102 |         return in_array(
103 |             strtolower($mime_type),
104 |             array_map('strtolower', $acceptable_mime_type),
105 |             true
106 |         );
107 |     }
108 | 
109 |     /**
110 |      * Detect the file's MIME type, validate it and run the extraction
111 |      * @param string $file_path
112 |      * @param array $data extra entries merged into the stored data
113 |      * @return string|null
114 |      * @throws TextractException on a MIME mismatch or when the provider check fails
115 |      */
116 |     public function boot(string $file_path, array $data = []): ?string
117 |     {
118 |         $this->file_path = $file_path;
119 |         $this->data = $this->data->merge($data);
120 |         $utilsService = app(UtilsService::class)->setFilePath($file_path);
121 |         $this->current_mime_type = $utilsService->getFileMimeType();
122 |         if (!$this->hasMatchMimeType($this->current_mime_type)) {
123 |             throw new TextractException(
124 |                 $this->extractor_name .
125 |                 ' unable to process the file. Please ensure the content of file is a ' .
126 |                 implode('/', $this->extractor_supported_extension) . ' file.'
127 |             );
128 |         }
129 |         if ($this->checkHaveProviderPackage()) {
130 |             return $this->getTextFromFile();
131 |         }
132 |         throw new TextractException($this->error_message);
133 |     }
134 | 
135 |     /**
136 |      * Provider check; a truthy result lets boot() proceed to getTextFromFile().
137 |      */
138 |     abstract protected function checkHaveProviderPackage();
139 | 
140 |     /**
141 |      * Produce the extracted text that boot() returns.
142 |      */
143 |     abstract protected function getTextFromFile();
144 | }
145 | 


--------------------------------------------------------------------------------
/src/ExtractorService/Contracts/AbstractTextExtractor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Contracts;
 4 | 
 5 | use Illuminate\Support\Str;
 6 | use Nilgems\PhpTextract\Exceptions\TextractException;
 7 | use Nilgems\PhpTextract\Services\UtilsService;
 8 | 
 9 | /**
10 |  * Base class of the text extractors: boot() validates the declared extensions and the
11 |  * file's MIME type, then delegates to getExtractedText().
12 |  */
13 | abstract class AbstractTextExtractor
14 | {
15 |     /**
16 |      * Utils service for the file under extraction; assigned by boot()
17 |      * @var UtilsService $utilsService
18 |      */
19 |     protected UtilsService $utilsService;
20 |     /**
21 |      * File extensions the extractor declares support for
22 |      * @var array $supported_extension
23 |      */
24 |     public array $supported_extension = [];
25 |     /**
26 |      * MIME types the file content is allowed to have
27 |      * @var array $supported_mime_types
28 |      */
29 |     protected array $supported_mime_types = [];
30 | 
31 |     /**
32 |      * Validate the file and return the text extracted from it
33 |      * @param UtilsService $utilsService
34 |      * @return string the extracted text, or "" when the service reports no file path
35 |      * @throws TextractException
36 |      */
37 |     public function boot(UtilsService $utilsService): string
38 |     {
39 |         $this->utilsService = $utilsService;
40 |         // Evaluated left to right; the two has*() helpers throw rather than return false.
41 |         $is_extractable = $this->hasSupportedExtensionDefined()
42 |             && $this->utilsService->getFilePath()
43 |             && $this->hasMatchMimeType();
44 |         return $is_extractable ? $this->getExtractedText() : "";
45 |     }
46 | 
47 |     /**
48 |      * Ensure the extractor declares at least one supported extension
49 |      * @return bool always true when it returns
50 |      * @throws TextractException when $supported_extension is empty
51 |      */
52 |     private function hasSupportedExtensionDefined(): bool
53 |     {
54 |         if (empty($this->supported_extension)) {
55 |             throw new TextractException(trans('textract::extractor.error_supported_extension_not_defined'));
56 |         }
57 |         return true;
58 |     }
59 | 
60 |     /**
61 |      * Ensure the file's MIME type is one of $supported_mime_types, ignoring letter case
62 |      * @return bool always true when it returns
63 |      * @throws TextractException when the MIME type is not accepted
64 |      */
65 |     private function hasMatchMimeType(): bool
66 |     {
67 |         $detected_mime_type = strtolower($this->utilsService->getFileMimeType());
68 |         $accepted_mime_types = array_map('strtolower', $this->supported_mime_types);
69 |         if (in_array($detected_mime_type, $accepted_mime_types, true)) {
70 |             return true;
71 |         }
72 |         $replacements = [
73 |             'path' => $this->utilsService->getFilePath(),
74 |             'extension' => implode(', .', $this->supported_extension),
75 |             'mime_types' => implode(', ', $this->supported_mime_types)
76 |         ];
77 |         throw new TextractException(trans_choice(
78 |             'textract::extractor.error_mime_mismatch',
79 |             count($this->supported_extension),
80 |             $replacements
81 |         ));
82 |     }
83 | 
84 |     /**
85 |      * Produce the text of the file; implemented by each concrete extractor
86 |      */
87 |     abstract protected function getExtractedText(): string;
88 | }
89 | 


--------------------------------------------------------------------------------
/src/ExtractorService/Contracts/HasPhpWord.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Contracts;
 4 | 
 5 | use PhpOffice\PhpWord\Element\Text as PhpWordElementText;
 6 | use PhpOffice\PhpWord\Element\TextRun as PhpWordElementTextRun;
 7 | use PhpOffice\PhpWord\IOFactory;
 8 | 
 9 | /**
10 |  * Collects the plain text of a document loaded through PhpWord.
11 |  */
12 | trait HasPhpWord
13 | {
14 |     /**
15 |      * Load the file and join the non-empty text fragments of all sections with spaces
16 |      * @param string $file_path
17 |      * @param string $readerName reader name handed to IOFactory::load()
18 |      * @return string
19 |      */
20 |     protected function getSectionsText(string $file_path, string $readerName = 'Word2007'): string
21 |     {
22 |         $data = [];
23 |         $phpWord = IOFactory::load($file_path, $readerName);
24 |         foreach ($phpWord->getSections() as $section) {
25 |             // Append in place instead of rebuilding the whole array for every section.
26 |             foreach ($this->getElementText($section->getElements()) as $text) {
27 |                 $data[] = $text;
28 |             }
29 |         }
30 |         return implode(" ", array_filter($data));
31 |     }
32 | 
33 |     /**
34 |      * Gather the trimmed text of Text elements, descending into TextRun containers;
35 |      * other element types are ignored
36 |      * @param array $elements
37 |      * @return array text fragments in document order
38 |      */
39 |     protected function getElementText(array $elements): array
40 |     {
41 |         $docs = [];
42 |         foreach ($elements as $element) {
43 |             if ($element instanceof PhpWordElementText) {
44 |                 // Cast first: trim(null) is deprecated since PHP 8.1 and getText()
45 |                 // presumably can be null for an element without text (verify against PhpWord).
46 |                 $docs[] = trim((string) $element->getText());
47 |             } elseif ($element instanceof PhpWordElementTextRun) {
48 |                 foreach ($this->getElementText($element->getElements()) as $text) {
49 |                     $docs[] = $text;
50 |                 }
51 |             }
52 |         }
53 |         return $docs;
54 |     }
55 | }
56 | 


--------------------------------------------------------------------------------
/src/ExtractorService/Contracts/TextProcessorHaveFilter.php:
--------------------------------------------------------------------------------
1 | <?php
2 | 
3 | namespace Nilgems\PhpTextract\ExtractorService\Contracts;
4 | 
5 | interface TextProcessorHaveFilter // Contract for text processors that post-process the text they read.
6 | {
7 |     public function getFilteredText(string $output): string; // Takes a text and returns its filtered form.
8 | }


--------------------------------------------------------------------------------
/src/ExtractorService/ExtractorCommonProcessors/PhpPowerPointProcessor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\ExtractorCommonProcessors;
 4 | 
 5 | use Nilgems\PhpTextract\Exceptions\TextractException;
 6 | use Nilgems\PhpTextract\ExtractorService\Contracts\AbstractTextExtractor;
 7 | use PhpOffice\PhpPresentation\IOFactory;
 8 | use PhpOffice\PhpPresentation\Shape;
 9 | 
10 | /**
11 |  * PHP PowerPointProcessor
12 |  * Read the document: https://phpoffice.github.io/PHPPresentation/usage/readers.html
13 |  */
14 | class PhpPowerPointProcessor extends AbstractTextExtractor
15 | {
16 |     /**
17 |      * Reader name handed to IOFactory::createReader()
18 |      */
19 |     protected string $readerType = "PowerPoint2007";
20 | 
21 |     /**
22 |      * Load the presentation once, converting a reader failure into a TextractException
23 |      * @return \PhpOffice\PhpPresentation\PhpPresentation
24 |      * @throws TextractException when the file cannot be read
25 |      */
26 |     private function loadPresentation()
27 |     {
28 |         $reader = IOFactory::createReader($this->readerType);
29 |         try {
30 |             return $reader->load($this->utilsService->getFilePath());
31 |         } catch (\Exception $exception) {
32 |             // Report the low-level failure, then surface the package's own exception type
33 |             // while keeping the original exception as "previous" for debugging.
34 |             report($exception);
35 |             throw new TextractException(trans('textract::processor.error_unable_to_read', [
36 |                 'path' => $this->utilsService->getFilePath()
37 |             ]), 0, $exception);
38 |         }
39 |     }
40 | 
41 |     /**
42 |      * Collect the text of every rich-text shape on every slide, one fragment per line
43 |      * @return string "" when no text fragment is found
44 |      * @throws TextractException when the file cannot be read
45 |      */
46 |     protected function getExtractedText(): string
47 |     {
48 |         $data_iterable = [];
49 |         foreach ($this->loadPresentation()->getAllSlides() as $slide) {
50 |             foreach ($slide->getShapeCollection() as $shape) {
51 |                 if (!$shape instanceof Shape\RichText) {
52 |                     continue;
53 |                 }
54 |                 foreach ($shape->getParagraphs() as $paragraph) {
55 |                     foreach ($paragraph->getRichTextElements() as $text_element) {
56 |                         $data_iterable[] = $text_element->getText();
57 |                     }
58 |                 }
59 |             }
60 |         }
61 |         return implode("\n", $data_iterable);
62 |     }
63 | }
64 | 


--------------------------------------------------------------------------------
/src/ExtractorService/ExtractorCommonProcessors/PhpSheetProcessor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\ExtractorCommonProcessors;
 4 | 
 5 | use Nilgems\PhpTextract\Exceptions\TextractException;
 6 | use Nilgems\PhpTextract\ExtractorService\Contracts\AbstractTextExtractor;
 7 | use PhpOffice\PhpSpreadsheet\IOFactory;
 8 | 
 9 | /**
10 |  * Extracts spreadsheet content as text: one line per row, cells separated by commas.
11 |  */
12 | class PhpSheetProcessor extends AbstractTextExtractor
13 | {
14 |     /**
15 |      * Check that the reader PhpSpreadsheet picks for the file can read it
16 |      * @return bool always true when it returns
17 |      * @throws TextractException when the file is not readable
18 |      * @throws \PhpOffice\PhpSpreadsheet\Reader\Exception
19 |      */
20 |     private function hasReadable(): bool
21 |     {
22 |         $file_path = $this->utilsService->getFilePath();
23 |         $has_readable = IOFactory::createReaderForFile($file_path)->canRead($file_path);
24 |         if ($has_readable) {
25 |             return true;
26 |         }
27 |         throw new TextractException(trans('textract::processor.error_unable_to_read', [
28 |             'path' => $this->utilsService->getFilePath()
29 |         ]));
30 |     }
31 | 
32 |     /**
33 |      * Read every sheet and render each row as a comma-separated line
34 |      * @return string
35 |      * @throws TextractException
36 |      * @throws \PhpOffice\PhpSpreadsheet\Reader\Exception
37 |      */
38 |     protected function getExtractedText(): string
39 |     {
40 |         if ($this->hasReadable()) {
41 |             $data_iterable = [];
42 |             $spreadsheet = IOFactory::load($this->utilsService->getFilePath());
43 |             foreach ($spreadsheet->getAllSheets() as $sheet) {
44 |                 foreach ($sheet->toArray() as $item) {
45 |                     // Drop only empty cells. A bare array_filter() would also discard
46 |                     // legitimate falsy values such as 0 or "0".
47 |                     $cells = array_filter($item, static function ($cell): bool {
48 |                         return (string) $cell !== '';
49 |                     });
50 |                     $data_iterable[] = implode(',', $cells);
51 |                 }
52 |             }
53 |             return implode("\n", $data_iterable);
54 |         }
55 |         return "";
56 |     }
57 | }
58 | 


--------------------------------------------------------------------------------
/src/ExtractorService/ExtractorCommonProcessors/PhpWordProcessor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\ExtractorCommonProcessors;
 4 | 
 5 | use Nilgems\PhpTextract\Exceptions\TextractException;
 6 | use Nilgems\PhpTextract\ExtractorService\Contracts\AbstractTextExtractor;
 7 | use PhpOffice\PhpWord\Element\Text as PhpWordElementText;
 8 | use PhpOffice\PhpWord\Element\TextRun as PhpWordElementTextRun;
 9 | use PhpOffice\PhpWord\Exception\Exception as PhpWordException;
10 | use PhpOffice\PhpWord\IOFactory;
11 | 
12 | /**
13 |  * Text extractor for documents read through the PhpWord reader named in $reader_name.
14 |  */
15 | class PhpWordProcessor extends AbstractTextExtractor
16 | {
17 |     /**
18 |      * PhpWord reader name passed to IOFactory
19 |      */
20 |     protected string $reader_name = 'Word2007';
21 | 
22 |     /**
23 |      * Return the document text once the reader confirms it can read the file
24 |      * @return string
25 |      * @throws PhpWordException|TextractException
26 |      */
27 |     protected function getExtractedText(): string
28 |     {
29 |         return $this->hasReadable() ? $this->getSectionsText() : "";
30 |     }
31 | 
32 |     /**
33 |      * Ask the configured reader whether it can read the file
34 |      * @return bool always true when it returns
35 |      * @throws PhpWordException|TextractException
36 |      */
37 |     private function hasReadable(): bool
38 |     {
39 |         $reader = IOFactory::createReader($this->reader_name);
40 |         if (!$reader->canRead($this->utilsService->getFilePath())) {
41 |             throw new TextractException(trans('textract::processor.error_unable_to_read', [
42 |                 'path' => $this->utilsService->getFilePath()
43 |             ]));
44 |         }
45 |         return true;
46 |     }
47 | 
48 |     /**
49 |      * Join the text of every section of the document with single spaces,
50 |      * leaving out sections whose text is empty
51 |      * @return string
52 |      * @throws TextractException
53 |      */
54 |     protected function getSectionsText(): string
55 |     {
56 |         $document = IOFactory::load($this->utilsService->getFilePath(), $this->reader_name);
57 |         $section_texts = [];
58 |         foreach ($document->getSections() as $section) {
59 |             $section_texts[] = $this->getElementText($section->getElements());
60 |         }
61 |         $non_empty_texts = array_filter($section_texts);
62 |         return implode(" ", $non_empty_texts);
63 |     }
64 | 
65 |     /**
66 |      * Concatenate the text of Text elements and, recursively, of TextRun containers
67 |      * @param array $elements
68 |      * @return string
69 |      */
70 |     protected function getElementText(array $elements): string
71 |     {
72 |         $fragments = [];
73 |         foreach ($elements as $element) {
74 |             if ($element instanceof PhpWordElementText) {
75 |                 $fragments[] = trim($element->getText());
76 |             } elseif ($element instanceof PhpWordElementTextRun) {
77 |                 $fragments[] = $this->getElementText($element->getElements());
78 |             }
79 |         }
80 |         return implode(" ", $fragments);
81 |     }
82 | }
83 | 


--------------------------------------------------------------------------------
/src/ExtractorService/ExtractorCommonProcessors/TextProcessor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\ExtractorCommonProcessors;
 4 | 
 5 | use Nilgems\PhpTextract\Exceptions\TextractException;
 6 | use Nilgems\PhpTextract\ExtractorService\Contracts\AbstractTextExtractor;
 7 | use Nilgems\PhpTextract\ExtractorService\Contracts\TextProcessorHaveFilter;
 8 | 
 9 | /**
10 |  * Reads a file's raw content and, for processors implementing TextProcessorHaveFilter,
11 |  * passes it through getFilteredText().
12 |  */
13 | class TextProcessor extends AbstractTextExtractor
14 | {
15 |     /**
16 |      * Read the whole file and return its (optionally filtered) content
17 |      * @return string
18 |      * @throws TextractException when the file cannot be opened or read
19 |      */
20 |     protected function getExtractedText(): string
21 |     {
22 |         $file_resource = $this->hasReadable();
23 |         try {
24 |             // stream_get_contents() copes with empty files, whereas fread() rejects a
25 |             // zero length on PHP 8.
26 |             $read_data = stream_get_contents($file_resource);
27 |         } finally {
28 |             fclose($file_resource);
29 |         }
30 |         if ($read_data === false) {
31 |             throw new TextractException(trans('textract::processor.error_unable_to_read', [
32 |                 'path' => $this->utilsService->getFilePath()
33 |             ]));
34 |         }
35 |         if ($this instanceof TextProcessorHaveFilter) {
36 |             // Delegate to the filter; calling getExtractedText() here would recurse forever.
37 |             return $this->getFilteredText($read_data);
38 |         }
39 |         return $read_data;
40 |     }
41 | 
42 |     /**
43 |      * Open the file for binary reading
44 |      * @return resource
45 |      * @throws TextractException when the file cannot be opened
46 |      */
47 |     private function hasReadable()
48 |     {
49 |         $file_resource = fopen($this->utilsService->getFilePath(), 'rb');
50 |         if ($file_resource === false) {
51 |             throw new TextractException(trans('textract::processor.error_unable_to_read', [
52 |                 'path' => $this->utilsService->getFilePath()
53 |             ]));
54 |         }
55 |         return $file_resource;
56 |     }
57 | }
58 | 


--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/HtmlExtractor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Extractors;
 4 | 
 5 | use Html2Text\Html2Text;
 6 | use Nilgems\PhpTextract\ExtractorService\Contracts\TextProcessorHaveFilter;
 7 | use Nilgems\PhpTextract\ExtractorService\ExtractorCommonProcessors\TextProcessor;
 8 | 
 9 | /**
10 |  * Extractor for HTML files; getFilteredText() converts markup to plain text.
11 |  */
12 | class HtmlExtractor extends TextProcessor implements TextProcessorHaveFilter
13 | {
14 |     public array $supported_extension = ['html', 'htm'];
15 | 
16 |     protected array $supported_mime_types = ['text/html'];
17 | 
18 |     /**
19 |      * Convert HTML markup to plain text with Html2Text
20 |      * @param string $output HTML markup
21 |      * @return string plain text, or "" when the input is empty()
22 |      */
23 |     public function getFilteredText(string $output): string
24 |     {
25 |         if (empty($output)) {
26 |             return "";
27 |         }
28 |         $converter = new Html2Text($output);
29 |         return $converter->getText();
30 |     }
31 | }
32 | 


--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/ImageExtractor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Extractors;
 4 | 
 5 | use Nilgems\PhpTextract\ExtractorService\Contracts\AbstractTextExtractor;
 6 | use Nilgems\PhpTextract\ExtractorService\Ocr\Contracts\TesseractOcrOptions;
 7 | use Nilgems\PhpTextract\ExtractorService\Ocr\TesseractOcrRun;
 8 | 
 9 | /**
10 |  * Extractor for JPEG, GIF and PNG images; the text is obtained through TesseractOcrRun.
11 |  */
12 | class ImageExtractor extends AbstractTextExtractor
13 | {
14 |     public array $supported_extension = ['jpg', 'jpeg', 'png', 'gif'];
15 | 
16 |     protected array $supported_mime_types = ['image/jpeg', 'image/gif', 'image/png'];
17 | 
18 |     /**
19 |      * OCR options forwarded to TesseractOcrRun; null until setOcrOptions() is called
20 |      */
21 |     protected ?TesseractOcrOptions $ocrOptions = null;
22 | 
23 |     /**
24 |      * Set the OCR options forwarded to TesseractOcrRun
25 |      * @param TesseractOcrOptions $ocrOptions
26 |      * @return $this
27 |      */
28 |     public function setOcrOptions(TesseractOcrOptions $ocrOptions): self
29 |     {
30 |         $this->ocrOptions = $ocrOptions;
31 |         return $this;
32 |     }
33 | 
34 |     /**
35 |      * Resolve TesseractOcrRun from the container and boot it for the current file
36 |      * @return string
37 |      * @throws \Nilgems\PhpTextract\Exceptions\TextractException
38 |      * @throws \thiagoalessio\TesseractOCR\TesseractOcrException
39 |      */
40 |     protected function getExtractedText(): string
41 |     {
42 |         $ocr_runner = app(TesseractOcrRun::class);
43 |         return $ocr_runner->boot($this->utilsService, $this->ocrOptions);
44 |     }
45 | }
46 | 


--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/MsOfficeDocExtractor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Extractors;
 4 | 
 5 | use Nilgems\PhpTextract\ExtractorService\ExtractorCommonProcessors\PhpWordProcessor;
 6 | 
 7 | /**
 8 |  * Extractor for .doc files, read with the 'MsDoc' reader.
 9 |  */
10 | class MsOfficeDocExtractor extends PhpWordProcessor
11 | {
12 |     protected string $reader_name = 'MsDoc';
13 | 
14 |     public array $supported_extension = ['doc'];
15 | 
16 |     protected array $supported_mime_types = ['application/msword'];
17 | }


--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/MsOfficeDocxExtractor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Extractors;
 4 | 
 5 | use Nilgems\PhpTextract\ExtractorService\ExtractorCommonProcessors\PhpWordProcessor;
 6 | 
 7 | /**
 8 |  * Extractor for .docx files; uses the reader name inherited from PhpWordProcessor.
 9 |  */
10 | class MsOfficeDocxExtractor extends PhpWordProcessor
11 | {
12 |     public array $supported_extension = ['docx'];
13 | 
14 |     protected array $supported_mime_types = [
15 |         'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
16 |     ];
17 | }


--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/MsOfficeExcelExtractor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Extractors;
 4 | 
 5 | use Nilgems\PhpTextract\ExtractorService\ExtractorCommonProcessors\PhpSheetProcessor;
 6 | 
 7 | /**
 8 |  * Extractor for Excel workbooks and templates, processed by PhpSheetProcessor.
 9 |  */
10 | class MsOfficeExcelExtractor extends PhpSheetProcessor
11 | {
12 |     public array $supported_extension = ['xls', 'xlsb', 'xlsm', 'xltm', 'xlsx', 'xltx'];
13 | 
14 |     protected array $supported_mime_types = [
15 |         'application/vnd.ms-excel',
16 |         'application/vnd.ms-excel.sheet.binary.macroEnabled.12',
17 |         'application/vnd.ms-excel.sheet.macroEnabled.12',
18 |         'application/vnd.ms-excel.template.macroenabled.12',
19 |         'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
20 |         'application/vnd.openxmlformats-officedocument.spreadsheetml.template',
21 |     ];
22 | }
23 | 


--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/MsOfficePptxExtractor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Extractors;
 4 | 
 5 | use Nilgems\PhpTextract\ExtractorService\ExtractorCommonProcessors\PhpPowerPointProcessor;
 6 | 
 7 | /**
 8 |  * Extractor for PowerPoint Open XML presentations (".pptx").
 9 |  *
10 |  * Pure configuration: it defines no methods of its own and relies on
11 |  * PhpPowerPointProcessor for everything else.
12 |  */
13 | class MsOfficePptxExtractor extends PhpPowerPointProcessor
14 | {
15 |     /** @var string Reader type value; where it is consumed is not visible in this class. */
16 |     protected string $readerType = "PowerPoint2007";
17 | 
18 |     /** @var string[] MIME types claimed by this extractor. */
19 |     protected array $supported_mime_types = [
20 |         'application/vnd.openxmlformats-officedocument.presentationml.presentation',
21 |     ];
22 | 
23 |     /** @var string[] File extensions (without the dot) claimed by this extractor. */
24 |     public array $supported_extension = [
25 |         'pptx',
26 |     ];
27 | 
28 |     /** @var string Reader name value; where it is consumed is not visible in this class. */
29 |     protected string $reader_name = 'MsPresentation';
30 | }
31 | 


--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/OpenOfficeDocument.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Extractors;
 4 | 
 5 | use Nilgems\PhpTextract\ExtractorService\ExtractorCommonProcessors\PhpWordProcessor;
 6 | 
 7 | /**
 8 |  * Extractor for OpenDocument text files and text templates.
 9 |  *
10 |  * Pure configuration: it defines no methods of its own and relies on
11 |  * PhpWordProcessor for everything else.
12 |  */
13 | class OpenOfficeDocument extends PhpWordProcessor
14 | {
15 |     /** @var string Reader name value; where it is consumed is not visible in this class. */
16 |     protected string $reader_name = 'ODText';
17 | 
18 |     /** @var string[] MIME types claimed by this extractor. */
19 |     protected array $supported_mime_types = [
20 |         'application/vnd.oasis.opendocument.text',
21 |         'application/vnd.oasis.opendocument.text-template',
22 |     ];
23 | 
24 |     /** @var string[] File extensions (without the dot) claimed by this extractor. */
25 |     public array $supported_extension = [
26 |         'odt',
27 |         'ott',
28 |     ];
29 | }


--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/OpenOfficeSpreadSheet.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Extractors;
 4 | 
 5 | use Nilgems\PhpTextract\ExtractorService\ExtractorCommonProcessors\PhpSheetProcessor;
 6 | 
 7 | /**
 8 |  * Extractor for OpenDocument spreadsheets and spreadsheet templates.
 9 |  *
10 |  * Pure configuration: it defines no methods of its own and relies on
11 |  * PhpSheetProcessor for everything else.
12 |  */
13 | class OpenOfficeSpreadSheet extends PhpSheetProcessor
14 | {
15 |     /** @var string[] MIME types claimed by this extractor. */
16 |     protected array $supported_mime_types = [
17 |         'application/vnd.oasis.opendocument.spreadsheet',
18 |         'application/vnd.oasis.opendocument.spreadsheet-template',
19 |     ];
20 | 
21 |     /** @var string[] File extensions (without the dot) claimed by this extractor. */
22 |     public array $supported_extension = [
23 |         'ods',
24 |         'ots',
25 |     ];
26 | }


--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/PdfExtractor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Extractors;
 4 | 
 5 | use Nilgems\PhpTextract\Exceptions\TextractException;
 6 | use Nilgems\PhpTextract\ExtractorService\Contracts\AbstractTextExtractor;
 7 | use Symfony\Component\Process\Process;
 8 | 
 9 | /**
10 |  * Extracts text from PDF files by invoking the external `pdftotext` binary.
11 |  */
12 | class PdfExtractor extends AbstractTextExtractor
13 | {
14 |     /** @var string[] MIME types claimed by this extractor. */
15 |     protected array $supported_mime_types = [
16 |         'application/pdf'
17 |     ];
18 | 
19 |     /** @var string[] File extensions (without the dot) claimed by this extractor. */
20 |     public array $supported_extension = ['pdf'];
21 | 
22 |     /**
23 |      * Convert the current file with `pdftotext -layout` and return its stdout.
24 |      *
25 |      * @return string
26 |      * @throws TextractException If pdftotext is unavailable, the utils service
27 |      *                           rejects the file path, or the conversion
28 |      *                           exits with an error.
29 |      */
30 |     protected function getExtractedText(): string
31 |     {
32 |         // Throws when the binary cannot be found, so no return check is needed.
33 |         $this->hasOsExtensionInstalled();
34 | 
35 |         $file_path = $this->utilsService->getFilePath();
36 |         $process = new Process(['pdftotext', '-layout', $file_path, '-']);
37 |         $process->run();
38 | 
39 |         // Never hand pdftotext's diagnostics back as if they were document
40 |         // text: a failed conversion is reported as an exception instead.
41 |         if (!$process->isSuccessful()) {
42 |             $reason = trim($process->getErrorOutput());
43 |             throw new TextractException(
44 |                 $reason !== '' ? $reason : 'pdftotext exited with code ' . $process->getExitCode()
45 |             );
46 |         }
47 | 
48 |         return $process->getOutput();
49 |     }
50 | 
51 |     /**
52 |      * Check that the `pdftotext` binary can be executed on this machine.
53 |      *
54 |      * @return bool Always true; absence is signalled by the exception.
55 |      * @throws TextractException
56 |      */
57 |     private function hasOsExtensionInstalled(): bool
58 |     {
59 |         $process = new Process(['pdftotext', '-v']);
60 |         $process->run();
61 |         // The version banner may arrive on either stream, hence the helper.
62 |         $output = $this->getFilteredOutput($process);
63 |         if (preg_match('/pdftotext([\s]+)version/', $output) === 1) {
64 |             return true;
65 |         }
66 |         // Package translations are registered under the "textract" namespace.
67 |         throw new TextractException(trans('textract::extractor.error_pdf_of_extension_not_installed'));
68 |     }
69 | 
70 |     /**
71 |      * Return stdout of a finished process, or stderr when stdout is empty.
72 |      *
73 |      * @param Process $process
74 |      * @return string
75 |      */
76 |     private function getFilteredOutput(Process $process): string
77 |     {
78 |         $output = $process->getOutput();
79 |         if (!empty($output)) {
80 |             return $output;
81 |         }
82 |         return $process->getErrorOutput();
83 |     }
84 | }
85 | 


--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/RtfExtractor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Extractors;
 4 | 
 5 | use Nilgems\PhpTextract\ExtractorService\ExtractorCommonProcessors\PhpWordProcessor;
 6 | 
 7 | /**
 8 |  * Extractor for Rich Text Format (".rtf") documents.
 9 |  *
10 |  * Pure configuration: it defines no methods of its own and relies on
11 |  * PhpWordProcessor for everything else.
12 |  */
13 | class RtfExtractor extends PhpWordProcessor
14 | {
15 |     /** @var string Reader name value; where it is consumed is not visible in this class. */
16 |     protected string $reader_name = 'RTF';
17 | 
18 |     /** @var string[] MIME types claimed by this extractor. */
19 |     protected array $supported_mime_types = [
20 |         'application/rtf',
21 |         'text/rtf',
22 |     ];
23 | 
24 |     /** @var string[] File extensions (without the dot) claimed by this extractor. */
25 |     public array $supported_extension = [
26 |         'rtf',
27 |     ];
28 | }


--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/TxtExtractor.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Extractors;
 4 | 
 5 | use Nilgems\PhpTextract\ExtractorService\ExtractorCommonProcessors\TextProcessor;
 6 | 
 7 | /**
 8 |  * Extractor for plain-text (".txt") files.
 9 |  *
10 |  * Pure configuration: it defines no methods of its own and relies on
11 |  * TextProcessor for everything else.
12 |  */
13 | class TxtExtractor extends TextProcessor
14 | {
15 |     /** @var string[] MIME types claimed by this extractor. */
16 |     protected array $supported_mime_types = [
17 |         'text/plain',
18 |     ];
19 | 
20 |     /** @var string[] File extensions (without the dot) claimed by this extractor. */
21 |     public array $supported_extension = [
22 |         'txt',
23 |     ];
24 | }


--------------------------------------------------------------------------------
/src/ExtractorService/Ocr/Contracts/TesseractOcrOptions.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Ocr\Contracts;
 4 | 
 5 | use Illuminate\Contracts\Support\Arrayable;
 6 | 
 7 | /**
 8 |  * Holds the options that are replayed onto the Tesseract OCR client.
 9 |  */
10 | class TesseractOcrOptions implements Arrayable
11 | {
12 |     /** @var array Option name => option value. */
13 |     protected array $options;
14 | 
15 |     /**
16 |      * Seed the options from the "textract.ocr.*" configuration entries.
17 |      */
18 |     public function __construct()
19 |     {
20 |         $this->options = [
21 |             'executable' => config('textract.ocr.executable_path', null),
22 |             'tempDir' => config('textract.ocr.temp_dir', null),
23 |             'userWords' => config('textract.ocr.text_dictionary_path'),
24 |             'userPatterns' => config('textract.ocr.text_patterns_path'),
25 |             'lang' => [],
26 |             'allowlist' => [],
27 |             'configVar' => config('textract.ocr.config'),
28 |             'psm' => null,
29 |             'dpi' => null,
30 |             'threadLimit' => config('textract.ocr.thread_limit'),
31 |         ];
32 |     }
33 | 
34 |     /**
35 |      * Set the "tempDir" option.
36 |      *
37 |      * @param string $path
38 |      * @return $this
39 |      */
40 |     public function setTempDir(string $path): self
41 |     {
42 |         $this->options['tempDir'] = $path;
43 |         return $this;
44 |     }
45 | 
46 |     /**
47 |      * Replace the list of languages.
48 |      *
49 |      * @param array $language
50 |      * @return $this
51 |      */
52 |     public function setLanguage(array $language): self
53 |     {
54 |         $this->options['lang'] = $language;
55 |         return $this;
56 |     }
57 | 
58 |     /**
59 |      * Set the page segmentation mode.
60 |      *
61 |      * @param int $psm
62 |      * @return $this
63 |      */
64 |     public function setPsm(int $psm): self
65 |     {
66 |         $this->options['psm'] = $psm;
67 |         return $this;
68 |     }
69 | 
70 |     /**
71 |      * Replace the allow-list.
72 |      *
73 |      * @param array $list
74 |      * @return $this
75 |      */
76 |     public function setAllowList(array $list): self
77 |     {
78 |         $this->options['allowlist'] = $list;
79 |         return $this;
80 |     }
81 | 
82 |     /**
83 |      * Export every option that carries a value.
84 |      *
85 |      * @return array
86 |      */
87 |     public function toArray(): array
88 |     {
89 |         // 0 is a valid page segmentation mode, yet empty(0) is true; only a
90 |         // null psm means "not set". All other options keep the empty() rule.
91 |         return array_filter($this->options, static function ($value, $key) {
92 |             return $key === 'psm' ? $value !== null : !empty($value);
93 |         }, ARRAY_FILTER_USE_BOTH);
94 |     }
95 | }
96 | 


--------------------------------------------------------------------------------
/src/ExtractorService/Ocr/TesseractOcrRun.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\ExtractorService\Ocr;
 4 | 
 5 | use Nilgems\PhpTextract\Exceptions\TextractException;
 6 | use Nilgems\PhpTextract\ExtractorService\Ocr\Contracts\TesseractOcrOptions;
 7 | use Nilgems\PhpTextract\Services\UtilsService;
 8 | use Symfony\Component\Process\Process;
 9 | use thiagoalessio\TesseractOCR\TesseractOCR;
10 | 
11 | /**
12 |  * Runs Tesseract OCR on the file held by a UtilsService instance.
13 |  */
14 | class TesseractOcrRun
15 | {
16 |     /** @var UtilsService Gives access to the path of the file being processed. */
17 |     protected UtilsService $utilsService;
18 | 
19 |     /**
20 |      * Recognise the text of the current file.
21 |      *
22 |      * Returns an empty string when "textract.ocr.enabled" is not truthy.
23 |      *
24 |      * @param UtilsService $utilsService
25 |      * @param TesseractOcrOptions|null $ocrOptions A fresh options object is used when null.
26 |      * @return string
27 |      * @throws TextractException
28 |      * @throws \thiagoalessio\TesseractOCR\TesseractOcrException
29 |      */
30 |     public function boot(UtilsService $utilsService, ?TesseractOcrOptions $ocrOptions = null): string
31 |     {
32 |         $this->utilsService = $utilsService;
33 |         if (!config('textract.ocr.enabled', false)) {
34 |             return "";
35 |         }
36 |         if ($this->hasOsExtension() && $this->utilsService->getFilePath()) {
37 |             return $this->getOcr($ocrOptions)->run();
38 |         }
39 |         return "";
40 |     }
41 | 
42 |     /**
43 |      * Build the OCR client and replay every exported option onto it.
44 |      *
45 |      * @param TesseractOcrOptions|null $ocrOptions
46 |      * @return TesseractOCR
47 |      * @throws TextractException
48 |      */
49 |     protected function getOcr(?TesseractOcrOptions $ocrOptions = null): TesseractOCR
50 |     {
51 |         $ocrOptions = $ocrOptions ?? new TesseractOcrOptions();
52 |         $ocr = new TesseractOCR($this->utilsService->getFilePath());
53 |         $ocr->withoutTempFiles();
54 |         foreach ($ocrOptions->toArray() as $option_key => $option_value) {
55 |             // Each option key doubles as the name of the client method to call;
56 |             // iterable values are spread into separate arguments.
57 |             if (is_iterable($option_value)) {
58 |                 $ocr->{$option_key}(...$option_value);
59 |             } else {
60 |                 $ocr->{$option_key}($option_value);
61 |             }
62 |         }
63 |         return $ocr;
64 |     }
65 | 
66 |     /**
67 |      * Check that the Tesseract binary can be executed.
68 |      *
69 |      * @return bool Always true; absence is signalled by the exception.
70 |      * @throws TextractException
71 |      */
72 |     protected function hasOsExtension(): bool
73 |     {
74 |         // A config entry that exists but is null/empty bypasses config()'s
75 |         // default argument, so fall back to the binary on PATH explicitly.
76 |         $tesseractPath = config('textract.ocr.executable_path') ?: 'tesseract'; // e.g. C:\Program Files\Tesseract-OCR\tesseract.exe
77 |         $process = new Process([$tesseractPath, '-v']);
78 |         $process->run();
79 |         $output = $this->getConsoleOutput($process);
80 |         if (preg_match('/tesseract([\s]+)((v)?[0-9.]+)/', $output) === 1) {
81 |             return true;
82 |         }
83 |         throw new TextractException(trans('textract::tesseract.error_not_installed'));
84 |     }
85 | 
86 |     /**
87 |      * Return stdout of a finished process, or stderr when stdout is empty.
88 |      *
89 |      * @param Process $process
90 |      * @return string
91 |      */
92 |     protected function getConsoleOutput(Process $process): string
93 |     {
94 |         $output = $process->getOutput();
95 |         return $output ? $output : $process->getErrorOutput();
96 |     }
97 | }
98 | 


--------------------------------------------------------------------------------
/src/Providers/ServiceProvider.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\Providers;
 4 | 
 5 | use Illuminate\Support\ServiceProvider as IlluminateServiceProvider;
 6 | use Nilgems\PhpTextract\ExtractorService\Extractors\HtmlExtractor;
 7 | use Nilgems\PhpTextract\ExtractorService\Extractors\ImageExtractor;
 8 | use Nilgems\PhpTextract\ExtractorService\Extractors\MsOfficeDocExtractor;
 9 | use Nilgems\PhpTextract\ExtractorService\Extractors\MsOfficeDocxExtractor;
10 | use Nilgems\PhpTextract\ExtractorService\Extractors\MsOfficePptxExtractor;
11 | use Nilgems\PhpTextract\ExtractorService\Extractors\OpenOfficeDocument;
12 | use Nilgems\PhpTextract\ExtractorService\Extractors\OpenOfficeSpreadSheet;
13 | use Nilgems\PhpTextract\ExtractorService\Extractors\PdfExtractor;
14 | use Nilgems\PhpTextract\ExtractorService\Extractors\RtfExtractor;
15 | use Nilgems\PhpTextract\ExtractorService\Extractors\TxtExtractor;
16 | use Nilgems\PhpTextract\Services\ConsoleExtractionService;
17 | use Nilgems\PhpTextract\Services\ExtractService;
18 | use Nilgems\PhpTextract\Services\UtilsService;
19 | 
20 | /**
21 |  * Registers the package's config, translations, services and extractors.
22 |  */
23 | class ServiceProvider extends IlluminateServiceProvider
24 | {
25 |     /**
26 |      * Offer the package config file for publishing under the "textract" tag.
27 |      *
28 |      * @return void
29 |      */
30 |     public function boot(): void
31 |     {
32 |         $this->publishes([
33 |             __DIR__ . '/../../config/textract.php' => config_path('textract.php')
34 |         ], 'textract');
35 |     }
36 | 
37 |     /**
38 |      * Register services
39 |      * @return void
40 |      */
41 |     public function register(): void
42 |     {
43 |         $this->mergeConfigFrom(__DIR__ . '/../../config/textract.php', 'textract');
44 |         $this->loadTranslationsFrom(__DIR__ . '/../../lang', 'textract');
45 | 
46 |         $this->app->bind(UtilsService::class);
47 |         $this->app->bind('textract', ExtractService::class);
48 |         $this->app->bind(ConsoleExtractionService::class);
49 | 
50 |         $this->registerExtractors();
51 |     }
52 | 
53 |     /**
54 |      * Bind every extractor and tag the whole set as "extractors".
55 |      *
56 |      * @return void
57 |      */
58 |     protected function registerExtractors(): void
59 |     {
60 |         $extractors = [
61 |             HtmlExtractor::class,
62 |             ImageExtractor::class,
63 |             MsOfficeDocExtractor::class,
64 |             MsOfficeDocxExtractor::class,
65 |             // Fully qualified because this file has no `use` statement for it.
66 |             \Nilgems\PhpTextract\ExtractorService\Extractors\MsOfficeExcelExtractor::class,
67 |             MsOfficePptxExtractor::class,
68 |             OpenOfficeDocument::class,
69 |             OpenOfficeSpreadSheet::class,
70 |             PdfExtractor::class,
71 |             RtfExtractor::class,
72 |             TxtExtractor::class
73 |         ];
74 |         foreach ($extractors as $extractor) {
75 |             $this->app->bind($extractor);
76 |         }
77 | 
78 |         $this->app->tag($extractors, 'extractors');
79 |     }
80 | }
81 | 


--------------------------------------------------------------------------------
/src/Services/ConsoleExtractionService.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\Services;
 4 | 
 5 | use Illuminate\Support\Str;
 6 | use Nilgems\PhpTextract\Concerns\TextractOutput;
 7 | use Nilgems\PhpTextract\Exceptions\TextractException;
 8 | use Nilgems\PhpTextract\ExtractorService\Contracts\AbstractExtractor;
 9 | use Nilgems\PhpTextract\ExtractorService\Ocr\Contracts\TesseractOcrOptions;
10 | 
11 | /**
12 |  * Resolves the extractor for a file, runs it and wraps the result.
13 |  */
14 | class ConsoleExtractionService
15 | {
16 |     /**
17 |      * The file path
18 |      * @var string $file_path
19 |      */
20 |     protected string $file_path;
21 |     /**
22 |      * Job id
23 |      * @var string $job_id
24 |      */
25 |     protected string $job_id;
26 |     /**
27 |      * Utility service
28 |      * @var UtilsService $utilsService
29 |      */
30 |     protected UtilsService $utilsService;
31 | 
32 |     /**
33 |      * Run the extractor
34 |      *
35 |      * @param string $file_path
36 |      * @param string|null $job_id A UUID is generated when null.
37 |      * @param TesseractOcrOptions|null $ocrOptions Accepted but not read by this method.
38 |      * @return TextractOutput
39 |      * @throws TextractException
40 |      */
41 |     public function boot(string $file_path, ?string $job_id = null, ?TesseractOcrOptions $ocrOptions = null): TextractOutput
42 |     {
43 |         $this->file_path = $file_path;
44 |         $this->job_id = $job_id ?? (string) Str::uuid();
45 |         $this->utilsService = app(UtilsService::class);
46 |         $this->utilsService->setFilePath($this->file_path);
47 |         $output = $this->utilsService->getExtractor()->boot($this->utilsService);
48 |         return new TextractOutput($output);
49 |     }
50 | }
51 | 


--------------------------------------------------------------------------------
/src/Services/ExtractService.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract\Services;
 4 | 
 5 | use Illuminate\Support\Str;
 6 | use Nilgems\PhpTextract\Concerns\TextractOutput;
 7 | use Nilgems\PhpTextract\Exceptions\TextractException;
 8 | 
 9 | /**
10 |  * Public entry point for extraction; delegates to ConsoleExtractionService.
11 |  */
12 | class ExtractService
13 | {
14 |     /** @var string Path given to the most recent run() call. */
15 |     protected string $file_path;
16 |     /** @var string Job id of the most recent run() call. */
17 |     protected string $job_id;
18 | 
19 |     /**
20 |      * Extract the text of a file.
21 |      *
22 |      * @param string $file_path
23 |      * @param string|null $job_id A UUID is generated when null.
24 |      * @return TextractOutput
25 |      * @throws TextractException
26 |      */
27 |     public function run(string $file_path, ?string $job_id = null): TextractOutput
28 |     {
29 |         $this->file_path = $file_path;
30 |         $this->job_id = $job_id ?? (string) Str::uuid();
31 |         return app(ConsoleExtractionService::class)
32 |             ->boot($this->file_path, $this->job_id);
33 |     }
34 | 
35 |     /**
36 |      * Get file path
37 |      * @return string
38 |      */
39 |     public function getFilePath(): string
40 |     {
41 |         return $this->file_path;
42 |     }
43 | 
44 |     /**
45 |      * Get job id
46 |      * @return string
47 |      */
48 |     public function getJobId(): string
49 |     {
50 |         return $this->job_id;
51 |     }
52 | }
53 | 


--------------------------------------------------------------------------------
/src/Services/UtilsService.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | namespace Nilgems\PhpTextract\Services;
  4 | 
  5 | use Illuminate\Support\Collection;
  6 | use Nilgems\PhpTextract\Exceptions\TextractException;
  7 | use Nilgems\PhpTextract\ExtractorService\Contracts\AbstractExtractor;
  8 | use Nilgems\PhpTextract\ExtractorService\Contracts\AbstractTextExtractor;
  9 | use Nilgems\PhpTextract\Providers\ServiceProvider;
 10 | 
 11 | /**
 12 |  * Describes the file being processed and picks the extractor for it.
 13 |  */
 14 | class UtilsService
 15 | {
 16 |     /**
 17 |      * Extractable file path
 18 |      * @var string $file_path
 19 |      */
 20 |     protected string $file_path;
 21 |     /**
 22 |      * Extractable file name
 23 |      * @var string $file_name
 24 |      */
 25 |     protected string $file_name;
 26 |     /**
 27 |      * Extractable file extension
 28 |      * @var string $file_extension
 29 |      */
 30 |     protected string $file_extension;
 31 |     /**
 32 |      * Extractable file mime type (empty string when it could not be detected)
 33 |      * @var string $file_mime_type
 34 |      */
 35 |     protected string $file_mime_type;
 36 |     /**
 37 |      * ExtractorService collection
 38 |      * @var Collection $extractor_collection
 39 |      */
 40 |     protected Collection $extractor_collection;
 41 |     /**
 42 |      * Extraction supported file extension
 43 |      * @var array $supported_file_extensions
 44 |      */
 45 |     protected array $supported_file_extensions = [];
 46 | 
 47 |     /**
 48 |      * Set file path
 49 |      *
 50 |      * Also derives the name, extension and MIME type and loads the tagged
 51 |      * extractors. A missing file is tolerated here; getFilePath() reports it.
 52 |      *
 53 |      * @param string $file_path
 54 |      * @return $this
 55 |      */
 56 |     public function setFilePath(string $file_path): self
 57 |     {
 58 |         $this->file_path = $file_path;
 59 |         $this->file_name = $this->getFileName();
 60 |         $this->file_extension = $this->getFileExtension();
 61 |         // An undetectable MIME type is stored as '' to satisfy the string property.
 62 |         $this->file_mime_type = $this->getFileMimeType() ?? '';
 63 |         $this->extractor_collection = collect(app()->tagged('extractors'));
 64 |         // map() returns a new collection, leaving the extractor list untouched.
 65 |         $this->supported_file_extensions = $this->extractor_collection
 66 |             ->map(function (AbstractTextExtractor $extractor) {
 67 |                 return $extractor->supported_extension;
 68 |             })
 69 |             ->flatten()
 70 |             ->toArray();
 71 |         return $this;
 72 |     }
 73 | 
 74 |     /**
 75 |      * Get the file path
 76 |      * @return string
 77 |      * @throws TextractException
 78 |      */
 79 |     public function getFilePath(): string
 80 |     {
 81 |         if ($this->fileIsExists()) {
 82 |             return $this->file_path;
 83 |         }
 84 |         throw new TextractException(trans('textract::file.error_not_exists', ['path' => $this->file_path]));
 85 |     }
 86 | 
 87 |     /**
 88 |      * Is the extractable file exists/file path is valid or not
 89 |      * @return bool
 90 |      */
 91 |     protected function fileIsExists(): bool
 92 |     {
 93 |         if (isset($this->file_path)) {
 94 |             return file_exists($this->file_path);
 95 |         }
 96 |         return false;
 97 |     }
 98 | 
 99 |     /**
100 |      * Get the extractor
101 |      *
102 |      * The extractor is chosen by file extension only; the first match wins.
103 |      *
104 |      * @return AbstractTextExtractor
105 |      * @throws TextractException
106 |      */
107 |     public function getExtractor(): AbstractTextExtractor
108 |     {
109 |         if (!isset($this->file_mime_type)) {
110 |             throw new TextractException("Please provide a file to extract text from that.");
111 |         }
112 |         $selected_extractor = $this->extractor_collection
113 |             ->first(function (AbstractTextExtractor $extractor) {
114 |                 return in_array($this->file_extension, $extractor->supported_extension, true);
115 |             });
116 |         if ($selected_extractor !== null) {
117 |             return $selected_extractor;
118 |         }
119 |         throw new TextractException(
120 |             "Invalid file format. Only support ".
121 |             implode('/', $this->supported_file_extensions).
122 |             " files"
123 |         );
124 |     }
125 | 
126 |     /**
127 |      * Get the file name from the file path
128 |      * @return string|null
129 |      */
130 |     public function getFileName(): ?string
131 |     {
132 |         if (isset($this->file_path)) {
133 |             return basename($this->file_path);
134 |         }
135 |         return null;
136 |     }
137 | 
138 |     /**
139 |      * Get file extension from the file path
140 |      * @return string|null
141 |      */
142 |     public function getFileExtension(): ?string
143 |     {
144 |         if (isset($this->file_path)) {
145 |             return strtolower(pathinfo($this->file_path, PATHINFO_EXTENSION));
146 |         }
147 |         return null;
148 |     }
149 | 
150 |     /**
151 |      * Get file mime type from the file
152 |      * @return string|null Null when no existing file is set or detection fails.
153 |      */
154 |     public function getFileMimeType(): ?string
155 |     {
156 |         // mime_content_type() raises a warning and returns false for a path
157 |         // it cannot open, so only probe paths that actually exist.
158 |         if (!$this->fileIsExists()) {
159 |             return null;
160 |         }
161 |         $mime_type = mime_content_type($this->file_path);
162 |         return $mime_type === false ? null : $mime_type;
163 |     }
164 | }
165 | 


--------------------------------------------------------------------------------
/src/Textract.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | namespace Nilgems\PhpTextract;
 4 | 
 5 | use Illuminate\Support\Facades\Facade;
 6 | 
 7 | /** Facade for the service bound in the container as "textract".
 8 |  * @method static \Nilgems\PhpTextract\Concerns\TextractOutput run(string $file_path, ?string $job_id = null)
 9 |  */
10 | class Textract extends Facade
11 | {
12 |     protected static function getFacadeAccessor(): string
13 |     {
14 |         return 'textract';
15 |     }
16 | }
17 | 


--------------------------------------------------------------------------------
/storage/example-multi-languages.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/storage/example-multi-languages.png


--------------------------------------------------------------------------------
/storage/example.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/storage/example.doc


--------------------------------------------------------------------------------
/storage/example.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/storage/example.docx


--------------------------------------------------------------------------------
/storage/example.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/storage/example.epub


--------------------------------------------------------------------------------
/storage/example.ods:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/storage/example.ods


--------------------------------------------------------------------------------
/storage/example.odt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/storage/example.odt


--------------------------------------------------------------------------------
/storage/example.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/storage/example.pdf


--------------------------------------------------------------------------------
/storage/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/storage/example.png


--------------------------------------------------------------------------------
/storage/example.txt:
--------------------------------------------------------------------------------
 1 | NAME
 2 | ADDRESS
 3 | EMAIL
 4 | support@codedoctoe.co.in CONTACT
 5 | SL. NO QTY AMOUNT
 6 | 1 8,000.00
 7 | 2 1 10,000.00
 8 | 3 1 2,000.00
 9 | 4 2,000.00
10 | 5 35 hr 3,500.00
11 | 6 1 9,551.62
12 | BANK & PAYMENT DETAILS
13 | THANK YOU!
14 | 35,051.62
15 | 15,000.00
16 | 20,051.62
17 | SUB TOTAL
18 | 8981506503
19 | FOR UPI PAYMENT
20 | BHIM
21 | GPAY
22 | FOR APP PAYMENT
23 | 8981506503@UPI
24 | PAID
25 | DUE
26 | Reference No.: IMPS211111335169
27 | Date: 21 Apr 2022
28 | New feature : Specific time period for exam link
29 | Maintenance (100/hr)
30 | Hostgator server (21/04/2022)
31 | DESCRIPTION
32 | E-mail and sms configuration
33 | Nayak sir's Website Design and Development
34 | Nayak sir's Server (for 1 year | 01/04/22 - 31/03/23 )
35 | WhatsApp : (+91)8777618481
36 | www.codedoctor.co.in
37 | 22/05/2022
38 | UNIT PRICE
39 | BANK NAME STATE BANK OF INDIA
40 | SBIN0001490
41 | 32386775028
42 | Nabyendu Kuiti
43 | ** N.B: This is a computer-generated document. No signature is required.
44 | A/C HOLDER NAME :
45 | A/C NUMBER
46 | IFSC CODE
47 | FOR BANK PAYMENT
48 | Kolkata
49 | karticksaho@gmail.com
50 | (+91)9143067223
51 | INVOICE
52 | INVOICE DETAILS
53 | INVOICE NO.:
54 | INVOICE DATE :
55 | INVOICE DUE DATE :
56 | BARUIPUR,KOLKATA 700144
57 | CODE DOCTOR
58 | WEST BENGAL
59 | Kartick Saho
60 | BILLED BY BILLED TO
61 | 


--------------------------------------------------------------------------------
/storage/example.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/storage/example.xls


--------------------------------------------------------------------------------
/storage/example.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/storage/example.xlsx


--------------------------------------------------------------------------------
/storage/exmple-mix-ben.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/storage/exmple-mix-ben.pdf


--------------------------------------------------------------------------------
/tests/ExtractionTest.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | use Nilgems\PhpTextract\Services\ExtractService;
 4 | use Nilgems\PhpTextract\Textract;
 5 | use PHPUnit\Framework\TestCase;
 6 | 
 7 | /**
 8 |  * Smoke test for text extraction on the sample files under /storage.
 9 |  */
10 | class ExtractionTest extends TestCase
11 | {
12 |     /**
13 |      * The result must expose an integer word count and non-empty text.
14 |      *
15 |      * @dataProvider addExtractionData
16 |      */
17 |     public function testExtraction(string $path): void
18 |     {
19 |         $output = (new ExtractService())->run($path);
20 |         $this->assertIsInt($output->word_count);
21 |         $this->assertIsString($output->text);
22 |         $this->assertNotEmpty($output->text);
23 |     }
24 | 
25 |     /**
26 |      * Sample files to run through the extractor.
27 |      *
28 |      * Static because PHPUnit 10 deprecates non-static data providers.
29 |      *
30 |      * @return array Dataset name => [file path].
31 |      */
32 |     public static function addExtractionData(): array
33 |     {
34 |         $storage = dirname(__DIR__) . DIRECTORY_SEPARATOR . 'storage';
35 |         return [
36 |             'extracting xlsx' => [$storage . DIRECTORY_SEPARATOR . 'example.xlsx'],
37 |         ];
38 |     }
39 | }
40 | 


--------------------------------------------------------------------------------