├── .gitattributes
├── .gitignore
├── .php-cs-fixer.cache
├── LICENSE.md
├── README.md
├── _config.yml
├── blobs
├── danger.png
├── sticky-notes.png
├── ubuntu.png
├── warning.png
└── windows.png
├── composer.json
├── composer.lock
├── config
└── textract.php
├── index.md
├── lang
└── en
│ ├── extractor.php
│ ├── file.php
│ ├── processor.php
│ └── tesseract.php
├── src
├── Concerns
│ └── TextractOutput.php
├── Exceptions
│ └── TextractException.php
├── ExtractorService
│ ├── Contracts
│ │ ├── AbstractExtractor.php
│ │ ├── AbstractTextExtractor.php
│ │ ├── HasPhpWord.php
│ │ └── TextProcessorHaveFilter.php
│ ├── ExtractorCommonProcessors
│ │ ├── PhpPowerPointProcessor.php
│ │ ├── PhpSheetProcessor.php
│ │ ├── PhpWordProcessor.php
│ │ └── TextProcessor.php
│ ├── Extractors
│ │ ├── HtmlExtractor.php
│ │ ├── ImageExtractor.php
│ │ ├── MsOfficeDocExtractor.php
│ │ ├── MsOfficeDocxExtractor.php
│ │ ├── MsOfficeExcelExtractor.php
│ │ ├── MsOfficePptxExtractor.php
│ │ ├── OpenOfficeDocument.php
│ │ ├── OpenOfficeSpreadSheet.php
│ │ ├── PdfExtractor.php
│ │ ├── RtfExtractor.php
│ │ └── TxtExtractor.php
│ └── Ocr
│ │ ├── Contracts
│ │ └── TesseractOcrOptions.php
│ │ └── TesseractOcrRun.php
├── Providers
│ └── ServiceProvider.php
├── Services
│ ├── ConsoleExtractionService.php
│ ├── ExtractService.php
│ └── UtilsService.php
└── Textract.php
├── storage
├── example-multi-languages.png
├── example.doc
├── example.docx
├── example.epub
├── example.ods
├── example.odt
├── example.pdf
├── example.png
├── example.rtf
├── example.txt
├── example.xls
├── example.xlsx
└── exmple-mix-ben.pdf
└── tests
└── ExtractionTest.php
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.doc linguist-detectable=false
2 | *.docx linguist-detectable=false
3 | *.ods linguist-detectable=false
4 | *.odt linguist-detectable=false
5 | *.pdf linguist-detectable=false
6 | *.png linguist-detectable=false
7 | *.rtf linguist-detectable=false
8 | *.txt linguist-detectable=false
9 | *.xls linguist-detectable=false
10 | *.xlsx linguist-detectable=false
11 | *.php linguist-detectable=true
12 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ### Composer template
2 | composer.phar
3 | .idea/*
4 | /vendor/
5 |
6 | # Commit your application's lock file https://getcomposer.org/doc/01-basic-usage.md#commit-your-composer-lock-file-to-version-control
7 | # You may choose to ignore a library lock file http://getcomposer.org/doc/02-libraries.md#lock-file
8 | # composer.lock
9 |
10 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright 2022 | Niladri Shekhar Mondal
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 |
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://packagist.org/packages/nilgems/laravel-textract)
2 | # Laravel Textract
3 | A [Laravel](https://laravel.com) package to extract text from files like DOC, Excel, Image, Pdf and more.
4 |
5 | # Versions and compatibility
6 |
7 | - [Laravel 10](https://laravel.com) or higher is required.
8 | - [Php 8.2]() or higher is required
9 |
10 | ### Supported file formats
11 | The following file formats are currently supported. You need to install the proper extensions
12 | on your server to work with files of all the following types. The package will
13 | check the file content's MIME type before extracting.
14 | - **HTML**
15 | - **TEXT**
16 | - **DOC**
17 | - **DOCX**
18 | - **XLS**, **XLSX**, **XLSM**, **XLTX**, **XLTM**, **XLT**
19 | - **CSV**
20 | - **PDF**
21 | - **Image**
22 | - _jpeg_
23 | - _png_
24 | - _gif_
25 | - **ODT**
26 | - **ODS**
27 | - **RTF**
28 | - **PPTX** (NEW)
29 |
30 | **We are working hard to make this laravel plugin useful. If you found any issue please add a post on discussion.**
31 |
32 | ### Installation
33 |
34 | ```
35 | composer require nilgems/laravel-textract
36 | ```
37 | Once installed you can do stuff like this:
38 | ```
39 | # Run the extractor
40 | $output = Textract::run('/path/to/file.extension');
41 |
42 | # Display the extracted text
43 | echo $output->text;
44 |
45 | # Display the extracted text word count
46 | echo $output->word_count;
47 |
48 | # Display the extracted text with direct string conversion
49 | echo (string) $output;
50 | ```
51 | Run the extractor to any supported file:
52 | ```
53 | Textract::run(string $file_path, [string $job_id],[TesseractOcrOptions $extra_data]);
54 | ```
55 | | Option | Type | Default value | Required | Description |
56 | |:-----------:|:-------------------------:|:------------------:|:--------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
57 | | $file_path | ```String``` | _No default value_ | Yes | Text extractable file absolute path. |
58 | | $job_id | ```String``` | ```NULL``` | No | It's an optional parameter. Extraction **job id**. If this option is blank, the plugin will automatically create the **ID**. |
59 | | $extra_data | ```TesseractOcrOptions``` | ```NULL``` | No | It's an optional parameter used to pass extra options. If you are extracting an image file, you can specify languages and more through this ```Nilgems\PhpTextract\ExtractorService\Ocr\Contracts\TesseractOcrOptions``` **parameter**. |
60 |
61 | ### Configuration
62 |
63 | - You can add **provider** in ```app.php``` under the ```config``` folder of your
64 | [Laravel](https://laravel.com) project. It's optional, the package automatically load the service provider in your application.
65 | ```
66 | 'providers' => [
67 | ...
68 | Nilgems\PhpTextract\Providers\ServiceProvider::class,
69 | ...
70 | ]
71 | ```
72 | - Add **alias** in ```app.php``` under the ```config``` folder of your
73 | [Laravel](https://laravel.com) project. It's optional, the package automatically load the ```facade``` in your application.
74 | ```
75 | 'aliases' => [
76 | ...
77 | 'Textract' => Nilgems\PhpTextract\Textract::class,
78 | ...
79 | ]
80 | ```
81 | - To publish the ```config``` file, run:
82 | ```
83 | php artisan vendor:publish --tag=textract
84 | ```
85 | ### Example
86 |
87 | ##### Example 1:
88 | You can extract text from supported file format.
89 |
90 | It is recommended to use the extractor with a [Laravel Queue Job](https://laravel.com/docs/9.x/queues#creating-jobs) for better performance.
91 | In ```php``` there is a restriction on execution time and memory, defined in the ```php.ini``` file by the options ```max_execution_time``` and ```memory_limit```. If the file is large, the process may be killed forcefully when it exceeds these limits. You can use a ```queue - database/redis``` or ```Laravel horizon``` to run the process in the background.
92 | ```
93 | ........
94 | Route::get('/textract', function(){
95 | return Textract::run('/path/to/image/example.png');
96 | });
97 | ........
98 | ```
99 |
100 | ##### Example 2:
101 | You can specify the languages used in an image file to get better extraction output from it.
102 | ```
103 | ........
104 | Route::get('/textract', function(){
105 | return Textract::run('/path/to/image/example.png', null, [
106 | 'lang' => ['eng', 'jpn', 'spa']
107 | ]);
108 | });
109 | ........
110 | ```
111 | ### Dependencies
112 | - To enable the image extraction feature you need to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract)
113 | - To enable the PDF extraction feature you need to install [pdftotext](http://www.xpdfreader.com/download.html)
114 | - To work properly, your server must have following php extensions installed -
115 | - **ext-fileinfo**
116 | - **ext-zip**
117 | - **ext-gd** or **ext-imagick**
118 | - **ext-xml**
119 | ### Tesseract OCR Installation
120 | #### Ubuntu
121 | - Update the system: ```sudo apt update```
122 | - Add Tesseract OCR 5 PPA to your system: ```sudo add-apt-repository ppa:alex-p/tesseract-ocr-devel```
123 | - Install Tesseract on Ubuntu 20.04 | 18.04: ```sudo apt install -y tesseract-ocr```
124 | - Once installation is complete update your system: ```sudo apt update```
125 | - Verify the installation: ```tesseract --version```
126 | #### Windows
127 | - There are many [ways](https://github.com/tesseract-ocr/tesseract/wiki#windows) to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) on your system, but if you just want something quick to get up and running, I recommend installing the [Capture2Text](https://chocolatey.org/packages/capture2text) package with [Chocolatey](https://chocolatey.org/).
128 | - Choco installation: ```choco install capture2text --version 5.0```
129 |
130 | **Note: Recent versions of [Capture2Text](https://chocolatey.org/packages/capture2text) stopped shipping the ```tesseract``` binary**
131 |
132 | ### PdfToText Installation
133 | #### Ubuntu
134 | - Update the system: ```sudo apt update```
135 | - Install PdfToText on Ubuntu 20.04 | 18.04: ```sudo apt-get install poppler-utils```
136 | - Verify the installation: ```pdftotext -v```
137 | #### Windows
138 | - Sorry, but ```pdftotext``` is available via [poppler](https://poppler.freedesktop.org/), and [poppler](https://poppler.freedesktop.org/) is not yet available for Windows. However, you can install and [use the library through the Windows Subsystem for Linux (WSL)](https://towardsdatascience.com/poppler-on-windows-179af0e50150). Alternatively, you can install [Laravel Homestead](https://laravel.com/docs/9.x/homestead) in your project and, using Vagrant virtualization, run the project in an Ubuntu virtual server.
139 |
140 | ## License
141 |
142 | [MIT](https://choosealicense.com/licenses/mit/)
143 |
144 | ---
145 | ## 💻 Tech Stack
146 |                     
147 |
148 | ---
149 | [](https://visitcount.itsvg.in)
150 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-merlot
--------------------------------------------------------------------------------
/blobs/danger.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/danger.png
--------------------------------------------------------------------------------
/blobs/sticky-notes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/sticky-notes.png
--------------------------------------------------------------------------------
/blobs/ubuntu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/ubuntu.png
--------------------------------------------------------------------------------
/blobs/warning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/warning.png
--------------------------------------------------------------------------------
/blobs/windows.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NilGems/laravel-textract/13f6db84e57eea61a32314408a99a2ce6b359cc5/blobs/windows.png
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "nilgems/laravel-textract",
3 | "description": "A Laravel package to extract text from files like DOC, XL, Image, Pdf and more. I've developed this package by inspiring \"npm textract\".",
4 | "type": "library",
5 | "keywords": [
6 | "laravel",
7 | "plugin",
8 | "package",
9 | "text",
10 | "textract",
11 | "extract",
12 | "html",
13 | "csv",
14 | "text",
15 | "pdf",
16 | "docx",
17 | "doc",
18 | "xls",
19 | "xlsx",
20 | "png",
21 | "jpg",
22 | "rtf",
23 | "xml",
24 | "odt",
25 | "ott",
26 | "xlsb",
27 | "xlsm",
28 | "xltx",
29 | "ods"
30 | ],
31 | "require": {
32 | "php": "^8.2",
33 | "ext-fileinfo": "*",
34 | "ext-zip": "*",
35 | "ext-xml": "*",
36 | "ext-gd": "*",
37 | "symfony/process": "^6.4.3",
38 | "phpoffice/phpspreadsheet": "^1.23",
39 | "phpoffice/phpword": "^0.18",
40 | "laravel/framework": "^12.0",
41 | "thiagoalessio/tesseract_ocr": "^2.12",
42 | "html2text/html2text": "^4.3",
43 | "phpoffice/phppresentation": "^1.0"
44 | },
45 | "require-dev": {
46 | "phpunit/phpunit": "^9.5"
47 | },
48 | "license": "MIT",
49 | "autoload": {
50 | "psr-4": {
51 | "Nilgems\\PhpTextract\\": "src/"
52 | }
53 | },
54 | "extra": {
55 | "laravel": {
56 | "providers": [
57 | "Nilgems\\PhpTextract\\Providers\\ServiceProvider"
58 | ],
59 | "aliases": {
60 | "Textract":"Nilgems\\PhpTextract\\Textract"
61 | }
62 | }
63 | },
64 | "authors": [
65 | {
66 | "name": "Niladri Shekhar Mondal",
67 | "email": "nldrmondal35@gmail.com"
68 | }
69 | ],
70 | "minimum-stability": "stable"
71 | }
72 |
--------------------------------------------------------------------------------
/config/textract.php:
--------------------------------------------------------------------------------
1 | [
17 | /*
18 | | -------------------------------------------------------------------------------------------------------------
19 | | OCR enabled or disabled:
20 | | -------------------------------------------------------------------------------------------------------------
21 | |
22 | |
23 | | Enable or disable the OCR functionality here. By default, OCR is enabled and the code will check whether the
24 | | plugin is installed on your server before doing any operation. If the plugin is not installed or is disabled,
25 | | image file extraction will not work.
26 | |
27 | */
28 | 'enabled' => env('TEXTRACT_OCR_ENABLED', true),
29 | /*
30 | | -------------------------------------------------------------------------------------------------------------
31 | | OCR custom executable path
32 | | -------------------------------------------------------------------------------------------------------------
33 | |
34 | | For more details please visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#executable
35 | |
36 | */
37 | 'executable_path' => env('TEXTRACT_OCR_EXEC_PATH', null),
38 |
39 | /*
40 | | -------------------------------------------------------------------------------------------------------------
41 | | OCR inducing recognition
42 | | -------------------------------------------------------------------------------------------------------------
43 | |
44 | |
45 | | By default, the value is 'null' and OCR will automatically recognise the text and try to extract whole text.
46 | | If you defined the path, the OCR will be able to extract those text that will match with the patterns inside
47 | | the text file.
48 | |
49 | |
50 | | Pattern example you can write inside the text file:
51 | | 1-\d\d\d-GOOG-441
52 | | www.\n\\\*.com
53 | |
54 | | For more details please visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#inducing-recognition
55 | |
56 | */
57 | 'text_patterns_path' => env('TEXTRACT_OCR_TEXT_PATTERNS_PATH', null),
58 |
59 | /*
60 | | -------------------------------------------------------------------------------------------------------------
61 | | OCR thread limit
62 | | -------------------------------------------------------------------------------------------------------------
63 | |
64 | |
65 | | The limit must be an integer value. 0 means all available threads.
66 | | For more details please visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#thread-limit
67 | |
68 | */
69 | 'thread_limit' => env('TEXTRACT_OCR_THREAD_LIMIT', 0),
70 |
71 | /*
72 | | -------------------------------------------------------------------------------------------------------------
73 | | OCR custom dictionary text file path.
74 | | -------------------------------------------------------------------------------------------------------------
75 | |
76 | |
77 | | By default, the value is 'null'
78 | | For more details please visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#userpatterns
79 | |
80 | */
81 | 'text_dictionary_path' => env('TEXTRACT_OCR_TEXT_DICTIONARY_PATH', null),
82 |
83 | /*
84 | |-------------------------------------------------------------------------------------------------------------
85 | | OCR other custom configurations
86 | |-------------------------------------------------------------------------------------------------------------
87 | |
88 | |
89 | | For more details please visit - https://github.com/thiagoalessio/tesseract-ocr-for-php#other-options
90 | |
91 | */
92 | 'config' => [],
93 |
94 | /*
95 | |-------------------------------------------------------------------------------------------------------------
96 | | OCR Temporary file storage directory
97 | |-------------------------------------------------------------------------------------------------------------
98 | |
99 | | Custom storage path for the OCR temporary folder. Make sure PHP has proper permissions to access the path.
100 | */
101 | 'temp_dir' => null
102 | ]
103 |
104 | ];
105 |
--------------------------------------------------------------------------------
/index.md:
--------------------------------------------------------------------------------
1 | [](https://packagist.org/packages/nilgems/laravel-textract)
2 | # Laravel Textract
3 | A [Laravel](https://laravel.com) package to extract text from files like DOC, Excel, Image, Pdf and more.
4 |
5 | # Versions and compatibility
6 |
7 | - [Laravel 8](https://laravel.com) or higher is required.
8 | - [Php 7.4]() or higher is required
9 |
10 | ###
[Laravel 9](https://laravel.com) support is added.
11 |
12 | ### Supported file formats
13 | The following file formats are currently supported. You need to install the proper extensions
14 | on your server to work with files of all the following types. The package will
15 | check the file content's MIME type before extracting.
16 | - **HTML**
17 | - **TEXT**
18 | - **DOC**
19 | - **DOCX**
20 | - **XLS**, **XLSX**, **XLSM**, **XLTX**, **XLTM**, **XLT**
21 | - **CSV**
22 | - **PDF**
23 | - **Image**
24 | - _jpeg_
25 | - _png_
26 | - **ODT**
27 | - **ODS**
28 | - **RTF**
29 |
30 |
***GIF*** and ***PPT*** support is under development.
31 |
32 | **We are working hard to make this laravel plugin useful. If you found any issue please add a post on discussion.**
33 |
34 | ### Installation
35 |
36 | ```
37 | composer require nilgems/laravel-textract
38 | ```
39 | Once installed you can do stuff like this:
40 | ```
41 | # Run the extractor
42 | $output = Textract::run('/path/to/file.extension');
43 |
44 | # Display the extracted text
45 | echo $output->text;
46 |
47 | # Display the extracted text word count
48 | echo $output->word_count;
49 |
50 | # Display the extracted text with direct string conversion
51 | echo (string) $output;
52 | ```
53 | Run the extractor to any supported file:
54 | ```
55 | Textract::run(string $file_path, [string $job_id],[array $extra_data]);
56 | ```
57 | | Option | Type | Default value | Required | Description |
58 | |:-----------:|:------:|:------------------:|:--------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|
59 | | $file_path | String | _No default value_ | Yes | Text extractable file absolute path. |
60 | | $job_id | String | ```NULL``` | No | It's a optional parameter. Extraction **job id**. If this option is blank the plugin will auto create the **ID** |
61 | | $extra_data | array | [] | No | It's a optional parameter. To pass extra parameter. If you are extracting a image file, you can mention languages by this **parameter**. ``` ['lang' => ['eng', 'jpn', 'spa']] ``` |
62 |
63 | ### Configuration
64 |
65 | - You can add **provider** in ```app.php``` under the ```config``` folder of your
66 | [Laravel](https://laravel.com) project. It's optional, the package automatically load the service provider in your application.
67 | ```
68 | 'providers' => [
69 | ...
70 | Nilgems\PhpTextract\Providers\ServiceProvider,
71 | ...
72 | ]
73 | ```
74 | - Add **alias** in ```app.php``` under the ```config``` folder of your
75 | [Laravel](https://laravel.com) project. It's optional, the package automatically load the ```facade``` in your application.
76 | ```
77 | 'aliases' => [
78 | ...
79 | 'Textract' => Nilgems\PhpTextract\Textract::class,
80 | ...
81 | ]
82 | ```
83 | ### Example
84 |
85 | ##### Example 1:
86 | You can extract text from supported file format.
87 |
88 | It is recommended to use the extractor with a [Laravel Queue Job](https://laravel.com/docs/9.x/queues#creating-jobs) for better performance.
89 | In ```php``` there is a restriction on execution time and memory, defined in the ```php.ini``` file by the options ```max_execution_time``` and ```memory_limit```. If the file is large, the process may be killed forcefully when it exceeds these limits. You can use a ```queue - database/redis``` or ```Laravel horizon``` to run the process in the background.
90 | ```
91 | ........
92 | Route::get('/textract', function(){
93 | return Textract::run('/path/to/image/example.png');
94 | });
95 | ........
96 | ```
97 |
98 | ##### Example 2:
99 | You can specify the languages used in an image file to get better extraction output from it.
100 | ```
101 | ........
102 | Route::get('/textract', function(){
103 | return Textract::run('/path/to/image/example.png', null, [
104 | 'lang' => ['eng', 'jpn', 'spa']
105 | ]);
106 | });
107 | ........
108 | ```
109 | ### Dependencies
110 | - To enable the image extraction feature you need to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract)
111 | - To enable the PDF extraction feature you need to install [pdftotext](http://www.xpdfreader.com/download.html)
112 | - To work properly, your server must have following php extensions installed -
113 | - **ext-fileinfo**
114 | - **ext-zip**
115 | - **ext-gd** or **ext-imagick**
116 | - **ext-xml**
117 | ### Tesseract OCR Installation
118 | ####
Ubuntu
119 | - Update the system: ```sudo apt update```
120 | - Add Tesseract OCR 5 PPA to your system: ```sudo add-apt-repository ppa:alex-p/tesseract-ocr-devel```
121 | - Install Tesseract on Ubuntu 20.04 | 18.04: ```sudo apt install -y tesseract-ocr```
122 | - Once installation is complete update your system: ```sudo apt update```
123 | - Verify the installation: ```tesseract --version```
124 | ####
Windows
125 | - There are many [ways](https://github.com/tesseract-ocr/tesseract/wiki#windows) to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) on your system, but if you just want something quick to get up and running, I recommend installing the [Capture2Text](https://chocolatey.org/packages/capture2text) package with [Chocolatey](https://chocolatey.org/).
126 | - Choco installation: ```choco install capture2text --version 5.0```
127 |
128 | **Note: Recent versions of [Capture2Text](https://chocolatey.org/packages/capture2text) stopped shipping the ```tesseract``` binary**
129 |
130 | ### PdfToText Installation
131 | ####
Ubuntu
132 | - Update the system: ```sudo apt update```
133 | - Install PdfToText on Ubuntu 20.04 | 18.04: ```sudo apt-get install poppler-utils```
134 | - Verify the installation: ```pdftotext -v```
135 | ####
Windows
136 | - Sorry, but ```pdftotext``` is available via [poppler](https://poppler.freedesktop.org/), and [poppler](https://poppler.freedesktop.org/) is not yet available for Windows. However, you can install and [use the library through the Windows Subsystem for Linux (WSL)](https://towardsdatascience.com/poppler-on-windows-179af0e50150). Alternatively, you can install [Laravel Homestead](https://laravel.com/docs/9.x/homestead) in your project and, using Vagrant virtualization, run the project in an Ubuntu virtual server.
137 |
138 | ## License
139 |
140 | [MIT](https://choosealicense.com/licenses/mit/)
141 |
142 | ---
143 | ## 💻 Tech Stack
144 |                     
145 |
146 | ---
147 | [](https://visitcount.itsvg.in)
148 |
--------------------------------------------------------------------------------
/lang/en/extractor.php:
--------------------------------------------------------------------------------
1 | 'The content of file in {path} path with extension .{extension} is|are match with MIME type {mime_types}',
4 | 'error_supported_extension_not_defined' => 'Supported extension is not defined in extractor.',
5 | 'error_pdf_of_extension_not_installed' => '\'pdftotext\' does not appear to be installed. Please check the documentation - https://github.com/NilGems/laravel-textract#pdftotext-installation'
6 | ];
7 |
--------------------------------------------------------------------------------
/lang/en/file.php:
--------------------------------------------------------------------------------
1 | 'The provided file path is invalid, any file is not available in {path} path. Please can check the file have proper permission or exists.'
4 | ];
5 |
--------------------------------------------------------------------------------
/lang/en/processor.php:
--------------------------------------------------------------------------------
1 | 'The provided file of {path} path is not readable.'
4 | ];
5 |
--------------------------------------------------------------------------------
/lang/en/tesseract.php:
--------------------------------------------------------------------------------
1 | '\'tesseract\' does not appear to be installed. Please check the document - https://github.com/NilGems/laravel-textract#tesseract-ocr-installation',
4 | 'error_file_extension_txt_required' => 'Only \'text\' file is supported. Please provide file with .txt extension.',
5 | 'error_input_invalid' => 'Invalid value is provided'
6 | ];
--------------------------------------------------------------------------------
/src/Concerns/TextractOutput.php:
--------------------------------------------------------------------------------
1 | collection = new Collection([
22 | 'text' => htmlspecialchars($raw_output, ENT_NOQUOTES, "UTF-8"),
23 | 'word_count' => str_word_count(utf8_decode($raw_output), 0)
24 | ]);
25 | }
26 |
/**
 * Export the extraction result as a plain array.
 *
 * @return array Whatever the underlying collection's toArray() yields.
 */
public function toArray(): array
{
    $result = $this->collection->toArray();

    return $result;
}
35 |
/**
 * Magic read access: look the requested key up in the underlying collection.
 *
 * @param string $key Name of the entry to read.
 * @return mixed Result of the collection's get() for that key.
 */
public function __get(string $key)
{
    $value = $this->collection->get($key);

    return $value;
}
40 |
/**
 * Magic write access: store a string value in the underlying collection.
 *
 * @param string $key Name of the entry to write.
 * @param string $value Value stored under that name.
 */
public function __set(string $key, string $value)
{
    $this->collection->put($key, $value);
}
45 |
/**
 * Magic isset()/empty() support: report whether the underlying collection has the key.
 *
 * @param string $key Name of the entry to probe.
 * @return bool Result of the collection's has() for that key.
 */
public function __isset(string $key)
{
    $exists = $this->collection->has($key);

    return $exists;
}
50 |
/**
 * String conversion: yields the value stored under the 'text' key.
 *
 * @return string
 */
public function __toString(): string
{
    $text = $this->collection->get('text');

    return $text;
}
59 | }
60 |
--------------------------------------------------------------------------------
/src/Exceptions/TextractException.php:
--------------------------------------------------------------------------------
1 | data = new Collection([]);
42 | }
43 |
/**
 * Store one entry in the extractor's data collection.
 *
 * @param mixed $key Key under which the value is stored.
 * @param mixed $value Value to store.
 * @return $this The same instance, so calls can be chained.
 */
public function setData($key, $value): self
{
    $this->data->put($key, $value);

    return $this;
}
/**
 * Resolve the list of MIME types this extractor accepts.
 *
 * A mimeAccepts() method, when the concrete class defines one, takes precedence
 * over the $mime_accepts property.
 *
 * @return array
 */
public function getAcceptMimeTypes(): array
{
    return method_exists($this, 'mimeAccepts')
        ? $this->mimeAccepts()
        : $this->mime_accepts;
}
66 |
/**
 * Expose the file extensions configured on this extractor.
 *
 * @return array The value of $extractor_supported_extension.
 */
public function getAcceptExtensions(): array
{
    $extensions = $this->extractor_supported_extension;

    return $extensions;
}
/**
 * Check whether a MIME type is accepted by this extractor.
 *
 * An empty accept list means "accept everything". Otherwise the comparison is
 * case-insensitive: both the given type and every accepted type are lower-cased
 * before the strict membership test.
 *
 * @param string $mime_type MIME type detected for the file.
 * @return bool True when the type is accepted or no restriction is configured.
 */
public function hasMatchMimeType(string $mime_type): bool
{
    $accepted = $this->getAcceptMimeTypes();
    if (empty($accepted)) {
        return true;
    }

    // Normalise the accepted list as well; lower-casing only the needle would
    // make any mixed-case entry in the list impossible to match.
    $normalised = array_map('strtolower', $accepted);

    return in_array(strtolower($mime_type), $normalised, true);
}
88 |
/**
 * Run the extractor against a file and return the extracted text.
 *
 * Stores the path, merges the extra data into the extractor's data collection,
 * detects the file's MIME type through UtilsService and rejects the file when
 * hasMatchMimeType() fails. Extraction is attempted only when
 * checkHaveProviderPackage() returns a truthy value.
 *
 * @param string $file_path Path of the file to extract text from.
 * @param array $data Extra values merged into $this->data.
 * @return string|null Whatever getTextFromFile() produces.
 * @throws TextractException When the MIME type is not accepted, or when
 *                           checkHaveProviderPackage() returns a falsy value.
 */
public function boot(string $file_path, array $data = []): ?string
{
    $this->file_path = $file_path;
    $this->data = $this->data->merge($data);

    // A single setFilePath() call is enough; its return value is the service used below.
    $utilsService = app(UtilsService::class)->setFilePath($file_path);
    $this->current_mime_type = $utilsService->getFileMimeType();

    if (!$this->hasMatchMimeType($this->current_mime_type)) {
        // The closing " file." carries a leading space so the extension list
        // does not run straight into the word (e.g. "doc/docx file.").
        throw new TextractException(
            $this->extractor_name .
            ' unable to process the file. Please ensure the content of file is a ' .
            implode('/', $this->extractor_supported_extension) . ' file.'
        );
    }

    if (!$this->checkHaveProviderPackage()) {
        throw new TextractException($this->error_message);
    }

    return $this->getTextFromFile();
}
115 |
/**
 * Hook consulted by boot() after the MIME type check.
 *
 * A truthy return lets boot() go on to call getTextFromFile(); a falsy return
 * makes boot() throw a TextractException carrying $this->error_message.
 * NOTE(review): presumably verifies that the backing package/binary is
 * available — confirm in the concrete extractors.
 */
116 | abstract protected function checkHaveProviderPackage();
117 |
/**
 * Hook that performs the actual extraction.
 *
 * boot() returns this method's result directly to its caller, and boot() is
 * declared to return ?string.
 */
118 | abstract protected function getTextFromFile();
119 | }
120 |
--------------------------------------------------------------------------------
/src/ExtractorService/Contracts/AbstractTextExtractor.php:
--------------------------------------------------------------------------------
1 | utilsService = $utilsService;
36 | if ($this->hasSupportedExtensionDefined() && $this->utilsService->getFilePath() && $this->hasMatchMimeType()) {
37 | return $this->getExtractedText();
38 | }
39 | return "";
40 | }
41 |
/**
 * Ensure the extractor declares at least one supported extension.
 *
 * @return bool Always true when it returns at all.
 * @throws TextractException When $supported_extension is empty.
 */
private function hasSupportedExtensionDefined(): bool
{
    if (empty($this->supported_extension)) {
        throw new TextractException(trans('textract::extractor.error_supported_extension_not_defined'));
    }

    return true;
}
54 |
/**
 * Verify that the file's detected MIME type is one of the supported types.
 *
 * Both the detected type and the supported types are lower-cased before an
 * exact string comparison.
 *
 * @return bool Always true when it returns at all.
 * @throws TextractException When no supported MIME type equals the detected one.
 */
private function hasMatchMimeType(): bool
{
    $detected = strtolower($this->utilsService->getFileMimeType());

    $matches = collect($this->supported_mime_types)
        ->map(fn ($mime_type) => strtolower($mime_type))
        ->filter(fn ($mime_type) => Str::of($mime_type)->exactly($detected));

    if ($matches->count() > 0) {
        return true;
    }

    $replacements = [
        'path' => $this->utilsService->getFilePath(),
        'extension' => implode(', .', $this->supported_extension),
        'mime_types' => implode(', ', $this->supported_mime_types)
    ];

    throw new TextractException(trans_choice(
        'textract::extractor.error_mime_mismatch',
        count($this->supported_extension),
        $replacements
    ));
}
80 |
81 | abstract protected function getExtractedText(): string;
82 | }
83 |
--------------------------------------------------------------------------------
/src/ExtractorService/Contracts/HasPhpWord.php:
--------------------------------------------------------------------------------
1 | getSections() as $section) {
21 | $elements = $section->getElements();
22 | $data = [...$data, ...$this->getElementText($elements)];
23 | }
24 | return implode(" ", array_filter($data));
25 | }
26 |
27 |     /**
28 |      * Recursively gather the trimmed text of Text elements, descending into TextRun containers.
29 |      * @param array $elements Elements to scan; any other element type is ignored.
30 |      * @return array Flat list of text fragments in encounter order.
31 |      */
32 |     protected function getElementText(array $elements): array
33 |     {
34 |         $fragments = [];
35 |         foreach ($elements as $element) {
36 |             if ($element instanceof PhpWordElementText) {
37 |                 $fragments[] = trim($element->getText());
38 |             }
39 |             if ($element instanceof PhpWordElementTextRun) {
40 |                 $fragments = array_merge($fragments, $this->getElementText($element->getElements()));
41 |             }
42 |         }
43 |         return $fragments;
44 |     }
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/src/ExtractorService/Contracts/TextProcessorHaveFilter.php:
--------------------------------------------------------------------------------
1 | utilsService->getFilePath();
20 | $reader = IOFactory::createReader($this->readerType);
21 | try {
22 | $presentation = $reader->load($file_path);
23 | return count($presentation->getAllSlides()) > 0;
24 | } catch (\Exception $exception) {
25 | throw $exception;
26 | report($exception);
27 | throw new TextractException(trans('textract::processor.error_unable_to_read', [
28 | 'path' => $this->utilsService->getFilePath()
29 | ]));
30 | }
31 |
32 | }
33 |
34 |     /**
35 |      * Collect the text of every rich-text element on every slide.
36 |      * Slides, shapes, paragraphs and elements are visited in order and joined with "\n".
37 |      * @return string The joined text, or "" when hasReadable() returns false.
38 |      * @throws TextractException
39 |      * @throws \PhpOffice\PhpSpreadsheet\Reader\Exception NOTE(review): carried over from the previous docblock; confirm the type.
40 |      */
41 |     protected function getExtractedText(): string
42 |     {
43 |         if (!$this->hasReadable()) {
44 |             return "";
45 |         }
46 |         $lines = [];
47 |         $presentation = IOFactory::createReader($this->readerType)->load($this->utilsService->getFilePath());
48 |         foreach ($presentation->getAllSlides() as $slide) {
49 |             foreach ($slide->getShapeCollection() as $shape) {
50 |                 // Every shape that is not a RichText is skipped.
51 |                 if (!($shape instanceof Shape\RichText)) {
52 |                     continue;
53 |                 }
54 |                 foreach ($shape->getParagraphs() as $paragraph) {
55 |                     foreach ($paragraph->getRichTextElements() as $text_element) {
56 |                         $lines[] = $text_element->getText();
57 |                     }
58 |                 }
59 |             }
60 |         }
61 |         return implode("\n", $lines);
62 |     }
63 | }
64 |
--------------------------------------------------------------------------------
/src/ExtractorService/ExtractorCommonProcessors/PhpSheetProcessor.php:
--------------------------------------------------------------------------------
1 | utilsService->getFilePath();
19 | $has_readable = IOFactory::createReaderForFile($file_path)->canRead($file_path);
20 | if ($has_readable) {
21 | return true;
22 | }
23 | throw new TextractException(trans('textract::processor.error_unable_to_read', [
24 | 'path' => $this->utilsService->getFilePath()
25 | ]));
26 | }
27 |
28 |     /**
29 |      * Flatten every sheet into text, one line per row; only null, '' and false cells are dropped, so a 0 cell survives.
30 |      * @return string Newline-joined rows (kept cells joined by ","), or "" when hasReadable() returns false.
31 |      * @throws TextractException|\PhpOffice\PhpSpreadsheet\Reader\Exception
32 |      */
33 |     protected function getExtractedText(): string
34 |     {
35 |         if (!$this->hasReadable()) {
36 |             return "";
37 |         }
38 |         $data_iterable = [];
39 |         foreach (IOFactory::load($this->utilsService->getFilePath())->getAllSheets() as $sheet) {
40 |             foreach ($sheet->toArray() as $item) {
41 |                 $data_iterable[] = implode(',', array_filter($item, static fn ($cell) => $cell !== null && $cell !== '' && $cell !== false));
42 |             }
43 |         }
44 |         return implode("\n", $data_iterable);
45 |     }
46 | }
47 |
--------------------------------------------------------------------------------
/src/ExtractorService/ExtractorCommonProcessors/PhpWordProcessor.php:
--------------------------------------------------------------------------------
1 | hasReadable()) {
23 | return $this->getSectionsText();
24 | }
25 | return "";
26 | }
27 |
28 |     /**
29 |      * Confirm that the configured PhpWord reader accepts the file at the current path.
30 |      * @return bool Always true; an unreadable file raises instead of returning false.
31 |      * @throws TextractException When the reader's canRead() check fails.
32 |      * @throws PhpWordException NOTE(review): presumably raised by IOFactory::createReader(); confirm.
33 |      */
34 |     private function hasReadable(): bool
35 |     {
36 |         $reader = IOFactory::createReader($this->reader_name);
37 |         // Any falsy canRead() result is treated as unreadable.
38 |         if (!$reader->canRead($this->utilsService->getFilePath())) {
39 |             throw new TextractException(trans('textract::processor.error_unable_to_read', [
40 |                 'path' => $this->utilsService->getFilePath()
41 |             ]));
42 |         }
43 |         return true;
44 |     }
45 |
46 |     /**
47 |      * Load the document with the configured reader and join the text of its sections with spaces.
48 |      * @return string Section texts joined by " "; sections whose text is empty (falsy) are left out.
49 |      * @throws TextractException
50 |      */
51 |     protected function getSectionsText(): string
52 |     {
53 |         $document = IOFactory::load($this->utilsService->getFilePath(), $this->reader_name);
54 |         $section_texts = [];
55 |         foreach ($document->getSections() as $section) {
56 |             $section_texts[] = $this->getElementText($section->getElements());
57 |         }
58 |         // array_filter() without a callback removes the falsy entries.
59 |         return implode(" ", array_filter($section_texts));
60 |     }
61 |
62 |     /**
63 |      * @param array $elements Elements to scan; only Text and TextRun instances contribute.
64 |      * @return string Trimmed Text contents and recursively flattened TextRun contents, joined by " ".
65 |      */
66 |     protected function getElementText(array $elements): string
67 |     {
68 |         $pieces = [];
69 |         foreach ($elements as $element) {
70 |             if ($element instanceof PhpWordElementText) {
71 |                 $pieces[] = trim($element->getText());
72 |             }
73 |             if ($element instanceof PhpWordElementTextRun) {
74 |                 $pieces[] = $this->getElementText($element->getElements());
75 |             }
76 |         }
77 |         return implode(" ", $pieces);
78 |     }
79 | }
80 |
--------------------------------------------------------------------------------
/src/ExtractorService/ExtractorCommonProcessors/TextProcessor.php:
--------------------------------------------------------------------------------
1 | hasReadable()) {
18 | $file_size = filesize($this->utilsService->getFilePath());
19 | $read_data = fread($file_resource, $file_size);
20 | fclose($file_resource);
21 | if ($this instanceof TextProcessorHaveFilter) {
22 | return $this->getExtractedText($read_data);
23 | }
24 | return $read_data;
25 | }
26 | return "";
27 | }
28 |
29 |     /**
30 |      * Open the current file for binary reading.
31 |      * @return resource The open handle; this method does not close it.
32 |      * @throws TextractException When fopen() yields a falsy result.
33 |      */
34 |     private function hasReadable()
35 |     {
36 |         $handle = fopen($this->utilsService->getFilePath(), 'rb');
37 |         if (!$handle) {
38 |             throw new TextractException(trans('textract::processor.error_unable_to_read', ['path' => $this->utilsService->getFilePath()]));
39 |         }
40 |         return $handle;
41 |     }
42 | }
43 |
--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/HtmlExtractor.php:
--------------------------------------------------------------------------------
1 | getText();
26 | }
27 | return "";
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/ImageExtractor.php:
--------------------------------------------------------------------------------
1 | ocrOptions = $ocrOptions;
33 | return $this;
34 | }
35 |
36 |     /**
37 |      * Run OCR on the current file through a container-resolved TesseractOcrRun.
38 |      * @return string Whatever TesseractOcrRun::boot() returns for the utils service and the stored OCR options.
39 |      * @throws \Nilgems\PhpTextract\Exceptions\TextractException|\thiagoalessio\TesseractOCR\TesseractOcrException
40 |      */
41 |     protected function getExtractedText(): string
42 |     {
43 |         $ocr_runner = app(TesseractOcrRun::class);
44 |         return $ocr_runner->boot($this->utilsService, $this->ocrOptions);
45 |     }
46 | }
47 |
--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/MsOfficeDocExtractor.php:
--------------------------------------------------------------------------------
1 | hasOsExtensionInstalled()) {
24 | $file_path = $this->utilsService->getFilePath();
25 | $process = new Process(['pdftotext', '-layout', $file_path , '-']);
26 | $process->start();
27 | $process->wait();
28 | return $this->getFilteredOutput($process);
29 | }
30 | return "";
31 | }
32 |
33 |     /**
34 |      * Check that the 'pdftotext' binary is available by running "pdftotext -v".
35 |      * The banner is read through getFilteredOutput(), i.e. stdout with a fallback to stderr.
36 |      * @return bool Always true; a missing binary raises instead of returning false.
37 |      * @throws TextractException When the output does not contain the version banner.
38 |      */
39 |     private function hasOsExtensionInstalled(): bool
40 |     {
41 |         $process = new Process(['pdftotext', '-v']);
42 |         $process->run();
43 |         if (preg_match('/pdftotext([\s]+)version/', $this->getFilteredOutput($process))) {
44 |             return true;
45 |         }
46 |         // Package translations live under the "textract::" namespace; a bare key is not resolved
47 |         // against them, so the exception would carry the raw key instead of a message.
48 |         throw new TextractException(trans('textract::extractor.error_pdf_of_extension_not_installed'));
49 |     }
50 |
51 |     /**
52 |      * Pick the stream that carries the process result.
53 |      * Both streams are read; standard output wins whenever it is non-empty.
54 |      * @param Process $process Process whose output is inspected.
55 |      * @return string Standard output, or the error output when standard output is empty() (which includes "0").
56 |      */
57 |     private function getFilteredOutput(Process $process): string
58 |     {
59 |         $stdout = $process->getOutput();
60 |         $stderr = $process->getErrorOutput();
61 |         // empty() keeps the original test: "" and "0" both count as no output.
62 |         return empty($stdout) ? $stderr : $stdout;
63 |     }
64 | }
65 |
--------------------------------------------------------------------------------
/src/ExtractorService/Extractors/RtfExtractor.php:
--------------------------------------------------------------------------------
1 | options = [
16 | 'executable' => config('textract.ocr.executable_path', null),
17 | 'tempDir' => config('textract.ocr.temp_dir', null),
18 | 'userWords' => config('textract.ocr.text_dictionary_path'),
19 | 'userPatterns' => config('textract.ocr.text_patterns_path'),
20 | 'lang' => [],
21 | 'allowlist' => [],
22 | 'configVar' => config('textract.ocr.config'),
23 | 'psm' => null,
24 | 'dpi' => null,
25 | 'threadLimit' => config('textract.ocr.thread_limit'),
26 |
27 | ];
28 | }
29 |
30 |     /**
31 |      * @param string $path Value stored under the 'tempDir' option key.
32 |      * @return $this Same instance, for chaining.
33 |      */
34 |     public function setTempDir(string $path): self
35 |     {
36 |         $this->options['tempDir'] = $path;
37 |         return $this;
38 |     }
39 |
40 |     /**
41 |      * Replace the language list (the previous 'lang' value is overwritten, not merged).
42 |      * @param array $language Value stored under the 'lang' option key.
43 |      * @return $this Same instance, for chaining.
44 |      */
45 |     public function setLanguage(array $language): self
46 |     {
47 |         $this->options['lang'] = $language;
48 |         return $this;
49 |     }
50 |
51 |     /**
52 |      * @param int $psm Value stored under the 'psm' option key.
53 |      * @return $this Same instance, for chaining.
54 |      */
55 |     public function setPsm(int $psm): self
56 |     {
57 |         $this->options['psm'] = $psm;
58 |         return $this;
59 |     }
60 |
61 |     /**
62 |      * @param array $list Value stored under the 'allowlist' option key (overwrites the previous list).
63 |      * @return $this Same instance, for chaining.
64 |      */
65 |     public function setAllowList(array $list): self
66 |     {
67 |         $this->options['allowlist'] = $list;
68 |         return $this;
69 |     }
70 |
71 |     /** Export the options that are set: empty() values are dropped, except psm 0, which setPsm() accepts and must not be lost. */
72 |     public function toArray(): array
73 |     {
74 |         $is_set = static fn ($value, $key) => !empty($value) || ($key === 'psm' && $value === 0);
75 |         return array_filter($this->options, $is_set, ARRAY_FILTER_USE_BOTH);
76 |     }
77 | }
78 |
--------------------------------------------------------------------------------
/src/ExtractorService/Ocr/TesseractOcrRun.php:
--------------------------------------------------------------------------------
1 | utilsService = $utilsService;
25 | $is_enabled = config('textract.ocr.enabled', false);
26 | if ($is_enabled && $this->hasOsExtension() && $this->utilsService->getFilePath()) {
27 | return $this->getOcr($ocrOptions)->run();
28 | }
29 | return "";
30 | }
31 |
32 |     /**
33 |      * Build the TesseractOCR command for the current file and apply every configured option.
34 |      * Each key of $ocrOptions->toArray() is invoked as a method of the same name on the command.
35 |      * @param TesseractOcrOptions|null $ocrOptions Options to apply; a fresh TesseractOcrOptions is used when null.
36 |      * @return TesseractOCR The configured command, not yet run.
37 |      * @throws TextractException Raised by getFilePath() when the file does not exist.
38 |      */
39 |     protected function getOcr(?TesseractOcrOptions $ocrOptions = null): TesseractOCR
40 |     {
41 |         $ocrOptions ??= new TesseractOcrOptions();
42 |         $ocr = new TesseractOCR($this->utilsService->getFilePath());
43 |         $ocr->withoutTempFiles();
44 |         foreach ($ocrOptions->toArray() as $option_key => $option_value) {
45 |             // Iterable values are spread into separate arguments; anything else is passed as one argument.
46 |             $ocr->{$option_key}(...(is_iterable($option_value) ? $option_value : [$option_value]));
47 |         }
48 |         return $ocr;
49 |     }
50 |
51 |     /**
52 |      * Check that the Tesseract binary can be executed by running it with "-v".
53 |      * The command is the 'textract.ocr.executable_path' config value, or "tesseract" when that value is empty.
54 |      * @return bool Always true; a missing binary raises instead of returning false.
55 |      * @throws TextractException When the output does not contain a "tesseract <version>" banner.
56 |      */
57 |     protected function hasOsExtension(): bool
58 |     {
59 |         // config() only applies its default to a missing key; a key stored as null/'' needs this explicit fallback.
60 |         $tesseractPath = config('textract.ocr.executable_path') ?: 'tesseract'; // e.g. C:\Program Files\Tesseract-OCR\tesseract.exe
61 |         $process = new Process([$tesseractPath, '-v']);
62 |         $process->run();
63 |         if (preg_match('/tesseract([\s]+)((v)?[0-9.]+)/', $this->getConsoleOutput($process))) {
64 |             return true;
65 |         }
66 |         throw new TextractException(trans('textract::tesseract.error_not_installed'));
67 |     }
68 |
69 |     /**
70 |      * Return the text a process printed, preferring standard output.
71 |      * The error output is only read when standard output is falsy ("" or "0").
72 |      * @param Process $process Process whose output is inspected.
73 |      * @return string
74 |      */
75 |     protected function getConsoleOutput(Process $process): string
76 |     {
77 |         $stdout = $process->getOutput();
78 |         return $stdout ? $stdout : $process->getErrorOutput();
79 |     }
80 | }
81 |
--------------------------------------------------------------------------------
/src/Providers/ServiceProvider.php:
--------------------------------------------------------------------------------
1 | publishes([
25 | __DIR__ . '/../../config/textract.php' => config_path('textract.php')
26 | ], 'textract');
27 | }
28 |
29 |     /**
30 |      * Merge the package config, load the "textract" translations, then bind the services and extractors.
31 |      * @return void
32 |      */
33 |     public function register(): void
34 |     {
35 |         $package_root = __DIR__ . '/../..';
36 |         $this->mergeConfigFrom($package_root . '/config/textract.php', 'textract');
37 |         $this->loadTranslationsFrom($package_root . '/lang', 'textract');
38 |         // Services are registered with bind(); the 'textract' key points at ExtractService.
39 |         $this->app->bind(UtilsService::class);
40 |         $this->app->bind('textract', ExtractService::class);
41 |         $this->app->bind(ConsoleExtractionService::class);
42 |         $this->registerExtractors();
43 |     }
44 |
45 |     /**
46 |      * Bind every bundled extractor class and tag the whole set as 'extractors'.
47 |      * NOTE(review): the repository also ships an MsOfficeExcelExtractor that is not listed here; confirm this is intentional.
48 |      */
49 |     protected function registerExtractors(): void
50 |     {
51 |         $extractors = [
52 |             HtmlExtractor::class,
53 |             ImageExtractor::class,
54 |             MsOfficeDocExtractor::class,
55 |             MsOfficeDocxExtractor::class,
56 |             MsOfficePptxExtractor::class,
57 |             OpenOfficeDocument::class,
58 |             OpenOfficeSpreadSheet::class,
59 |             PdfExtractor::class,
60 |             RtfExtractor::class,
61 |             TxtExtractor::class,
62 |         ];
63 |         foreach ($extractors as $extractor_class) {
64 |             $this->app->bind($extractor_class);
65 |         }
66 |         // The tag is what lets the complete set be fetched with tagged('extractors').
67 |         $this->app->tag($extractors, 'extractors');
68 |     }
69 | }
70 |
--------------------------------------------------------------------------------
/src/Services/ConsoleExtractionService.php:
--------------------------------------------------------------------------------
1 | file_path = $file_path;
40 | $this->job_id = (string) ($job_id ?? Str::uuid());
41 | $this->utilsService = app(UtilsService::class);
42 | $this->utilsService->setFilePath($this->file_path);
43 | $output = $this->utilsService->getExtractor()->boot($this->utilsService);
44 | return new TextractOutput($output);
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/Services/ExtractService.php:
--------------------------------------------------------------------------------
1 | file_path = $file_path;
20 | $this->job_id = (string) ($job_id ?? Str::uuid());
21 | return app(ConsoleExtractionService::class)
22 | ->boot($this->file_path, $this->job_id);
23 | }
24 |
25 |     /**
26 |      * Get file path
27 |      * @return string The value currently held in $file_path.
28 |      */
29 |     public function getFilePath(): string
30 |     {
31 |         return $this->file_path;
32 |     }
33 |
34 |     /**
35 |      * Get job id
36 |      * @return string The value currently held in $job_id.
37 |      */
38 |     public function getJobId(): string
39 |     {
40 |         return $this->job_id;
41 |     }
42 | }
43 |
--------------------------------------------------------------------------------
/src/Services/UtilsService.php:
--------------------------------------------------------------------------------
1 | file_path = $file_path;
52 | $this->file_name = $this->getFileName();
53 | $this->file_extension = $this->getFileExtension();
54 | $this->file_mime_type = $this->getFileMimeType();
55 | $this->extractor_collection = collect(app()->tagged('extractors'));
56 | $this->supported_file_extensions = (clone $this->extractor_collection)
57 | ->transform(function (AbstractTextExtractor $extractor) {
58 | return $extractor->supported_extension;
59 | })
60 | ->flatten()
61 | ->toArray();
62 | return $this;
63 | }
64 |
65 |     /**
66 |      * Return the configured file path after confirming the file exists.
67 |      * @return string
68 |      * @throws TextractException When no path is set or nothing exists at that path.
69 |      */
70 |     public function getFilePath(): string
71 |     {
72 |         if (!$this->fileIsExists()) {
73 |             throw new TextractException(trans('textract::file.error_not_exists', ['path' => $this->file_path]));
74 |         }
75 |         return $this->file_path;
76 |     }
77 |
78 |     /**
79 |      * Tell whether a file path has been set and something exists at that path.
80 |      * @return bool False when $file_path is unset or file_exists() rejects it.
81 |      */
82 |     protected function fileIsExists(): bool
83 |     {
84 |         $has_path = isset($this->file_path);
85 |
86 |         // && short-circuits, so file_exists() is only consulted once a path is set.
87 |         return $has_path && file_exists($this->file_path);
88 |     }
89 |
90 |     /**
91 |      * Pick the first registered extractor whose supported extensions include this file's extension.
92 |      * The lookup goes by extension only; it is gated on $file_mime_type having been set.
93 |      * @return AbstractTextExtractor
94 |      * @throws TextractException When no file has been provided or no extractor supports the extension.
95 |      */
96 |     public function getExtractor(): AbstractTextExtractor
97 |     {
98 |         if (!isset($this->file_mime_type)) {
99 |             throw new TextractException("Please provide a file to extract text from that.");
100 |         }
101 |         $matched = $this->extractor_collection->first(function (AbstractTextExtractor $candidate) {
102 |             return in_array($this->file_extension, $candidate->supported_extension, true);
103 |         });
104 |         if ($matched === null) {
105 |             throw new TextractException(
106 |                 "Invalid file format. Only support ".
107 |                 implode('/', $this->supported_file_extensions).
108 |                 " files"
109 |             );
110 |         }
111 |         return $matched;
112 |     }
113 |
114 |     /**
115 |      * Get the base name (last path component) of the file path.
116 |      * @return string|null Null when no file path has been set.
117 |      */
118 |     public function getFileName(): ?string
119 |     {
120 |         if (!isset($this->file_path)) {
121 |             return null;
122 |         }
123 |         return basename($this->file_path);
124 |     }
125 |
126 |     /**
127 |      * Get the lower-cased extension of the file path.
128 |      * @return string|null Null when no file path has been set.
129 |      */
130 |     public function getFileExtension(): ?string
131 |     {
132 |         if (!isset($this->file_path)) {
133 |             return null;
134 |         }
135 |         return strtolower(pathinfo($this->file_path, PATHINFO_EXTENSION));
136 |     }
137 |
138 |     /**
139 |      * Detect the MIME type of the file with mime_content_type().
140 |      * @return string|null Null when no file path has been set.
141 |      */
142 |     public function getFileMimeType(): ?string
143 |     {
144 |         if (!isset($this->file_path)) {
145 |             return null;
146 |         }
147 |         return mime_content_type($this->file_path);
148 |     }
149 | }
150 |
--------------------------------------------------------------------------------
/src/Textract.php:
--------------------------------------------------------------------------------
1 | run($path);
15 | $this->assertIsInt($output->word_count);
16 | $this->assertIsString($output->text);
17 | $this->assertNotEmpty($output->text);
18 | }
19 |
20 |     /** Named datasets, each holding one sample file path; presumably consumed as a PHPUnit data provider (confirm). */
21 |     public function addExtractionData(): array
22 |     {
23 |         $sample_path = __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'storage/example.xlsx';
24 |         return ['extracting doc' => [$sample_path]];
25 |     }
26 | }
27 |
--------------------------------------------------------------------------------