├── .flake8 ├── .github ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── documentation.yml │ ├── lambda_layers.yml │ ├── release-caller.yml │ ├── release.yml │ ├── test-pr-caller.yml │ ├── test-pr-geofinder.yml │ ├── test-pr-prettyprinter.yml │ └── tests.yml ├── .gitignore ├── .style.yapf ├── .yapfignore ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── NOTICE ├── README.md ├── caller ├── LICENSE ├── Manifest.in ├── README.md ├── setup.cfg ├── setup.py ├── tests │ ├── data │ │ ├── driverlicense.png │ │ ├── employmentapp.png │ │ ├── employmentapp.tiff │ │ ├── json_from_python_repl.json │ │ ├── multi_page_tiff.tiff │ │ └── verification-of-employment.png │ └── test_caller.py └── textractcaller │ ├── __init__.py │ ├── _version.py │ └── t_call.py ├── docs ├── Makefile ├── make.bat └── source │ ├── commandline.rst │ ├── conf.py │ ├── examples.rst │ ├── favicon.ico │ ├── images │ └── lambda_tutorial │ │ ├── 1b.png │ │ ├── 1c.png │ │ ├── 2.png │ │ ├── 2a.png │ │ ├── 2b.png │ │ ├── 3a.png │ │ └── 3c.png │ ├── index.rst │ ├── installation.rst │ ├── notebooks │ ├── document_linearization_to_markdown_or_html.ipynb │ ├── exporting_form_data.ipynb │ ├── finding_words_within_a_document.ipynb │ ├── going_further.ipynb │ ├── imgs │ │ └── excel.png │ ├── interfacing_with_trp2.ipynb │ ├── introduction_to_searching.ipynb │ ├── layout_analysis.ipynb │ ├── layout_analysis_for_text_linearization.ipynb │ ├── parsing_an_existing_response.ipynb │ ├── signature_detection.ipynb │ ├── simple_ocr.ipynb │ ├── table_data_to_various_formats.ipynb │ ├── tabular_data_linearization.ipynb │ ├── tabular_data_linearization_continued.ipynb │ ├── textractor_for_large_language_models.ipynb │ ├── using_analyze_expense.ipynb │ ├── using_analyze_id.ipynb │ ├── using_queries.ipynb │ └── visualizing_results.ipynb │ ├── overlayer.png │ ├── overlayer_bigger.png │ ├── textractor.data.constants.rst │ ├── textractor.data.text_linearization_config.rst │ ├── 
textractor.entities.rst │ ├── textractor.parsers.rst │ ├── textractor.rst │ ├── textractor.visualizers.rst │ ├── textractor_cropped.png │ └── using_in_lambda.rst ├── extras ├── dev.txt ├── docs.txt ├── pandas.txt ├── pdf.txt ├── pdfium.txt └── torch.txt ├── helper ├── LICENSE ├── Manifest.in ├── README.md ├── bin │ └── amazon-textract ├── docs │ ├── employmentapp_boxed_FORM_CELL_.png │ ├── employmentapp_boxed_LINE_TEXT_OVERLAY.png │ ├── employmentapp_boxed_WORD_.png │ └── employmentapp_boxed_WORD_TEXT_OVERLAY.png ├── fonts │ └── Roboto-Regular.ttf ├── nice_textract.json ├── setup.cfg ├── setup.py ├── textract.json └── textracthelper │ ├── .gitignore │ ├── __init__.py │ ├── _version.py │ └── examples │ └── employmentapp.png ├── idp_cdk_manifest ├── .gitignore ├── LICENSE ├── Manifest.in ├── README.md ├── output.tar.gz ├── setup.cfg ├── setup.py ├── tests │ ├── data │ │ ├── analyze_id.json │ │ ├── manifest_all_features.json │ │ ├── manifest_default.json │ │ ├── manifest_minimal.json │ │ ├── manifest_queries_no_alias.json │ │ ├── manifest_queries_no_pages.json │ │ ├── manifest_with_classification.json │ │ ├── manifest_with_classification_and_metadata.json │ │ ├── queries_forms.json │ │ └── simple_feature_manifest.json │ └── test_manifest.py └── textractmanifest │ ├── __init__.py │ └── manifest.py ├── images └── amzn.png ├── overlayer ├── LICENSE ├── Manifest.in ├── README.md ├── setup.cfg ├── setup.py ├── tests │ ├── data │ │ └── Amazon-Textract-Pdf.pdf │ └── test_overlayer.py └── textractoverlayer │ ├── __init__.py │ ├── _version.py │ ├── image_tools.py │ └── t_overlay.py ├── prettyprinter ├── LICENSE ├── Manifest.in ├── README.md ├── setup.cfg ├── setup.py ├── tests │ ├── data │ │ ├── analyzeDocResponse.json │ │ ├── bounding_box_issue.json │ │ ├── employmentapp.json │ │ ├── layout_csv_example.json │ │ ├── lending-doc-output_from_output_config.json │ │ ├── multi_page_example_file.json │ │ ├── queries_one_no_answer.json │ │ ├── 
request_for_verification_of_employment.json │ │ └── w2-example.json │ └── test_pretty_print.py └── textractprettyprinter │ ├── __init__.py │ ├── _version.py │ ├── t_pretty_print.py │ ├── t_pretty_print_expense.py │ └── t_pretty_print_layout.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── fixtures │ ├── amzn_q2.png │ ├── fake_id.png │ ├── form.png │ ├── form_1005.png │ ├── in-table-title.png │ ├── invalid.pdf │ ├── invoice.png │ ├── matrix.png │ ├── multiline_cells.jpeg │ ├── patient_intake_form_sample.png │ ├── paystub.jpg │ ├── paystub_header.png │ ├── paystub_single_table.png │ ├── paystub_tables.png │ ├── reading_order.pdf │ ├── receipt.jpg │ ├── receipt_no_summary.png │ ├── resume.png │ ├── sample-invoice.pdf │ ├── saved_api_responses │ │ ├── test_analyze_expense_from_image.json │ │ ├── test_analyze_expense_from_path.json │ │ ├── test_analyze_expense_no_summary_fields.json │ │ ├── test_analyze_id_from_image.json │ │ ├── test_analyze_id_from_path.json │ │ ├── test_bad_queries_as_strings.json │ │ ├── test_detect_document_text.json │ │ ├── test_detect_document_text_list_PIL_images.json │ │ ├── test_detect_document_text_single_page_pdf_input.json │ │ ├── test_detect_no_duplicate_words_amzn_q2.png.json │ │ ├── test_detect_no_duplicate_words_fake_id.png.json │ │ ├── test_detect_no_duplicate_words_form.png.json │ │ ├── test_detect_no_duplicate_words_form_1005.png.json │ │ ├── test_detect_no_duplicate_words_in-table-title.png.json │ │ ├── test_detect_no_duplicate_words_matrix.png.json │ │ ├── test_detect_no_duplicate_words_patient_intake_form_sample.png.json │ │ ├── test_detect_no_duplicate_words_paystub.jpg.json │ │ ├── test_detect_no_duplicate_words_paystub_header.png.json │ │ ├── test_detect_no_duplicate_words_paystub_single_table.png.json │ │ ├── test_detect_no_duplicate_words_paystub_tables.png.json │ │ ├── test_detect_no_duplicate_words_reading_order.pdf.json │ │ ├── test_detect_no_duplicate_words_receipt.jpg.json │ │ ├── 
test_detect_no_duplicate_words_sample-invoice.pdf.json │ │ ├── test_detect_no_duplicate_words_screenshot.png.json │ │ ├── test_detect_no_duplicate_words_single-page-1.png.json │ │ ├── test_detect_no_duplicate_words_single-page-2.png.json │ │ ├── test_detect_no_duplicate_words_test.png.json │ │ ├── test_detect_no_duplicate_words_textractor-singlepage-doc.pdf.json │ │ ├── test_detect_no_duplicate_words_tutorial.pdf.json │ │ ├── test_document_smoke_test.json │ │ ├── test_document_to_html_amzn_q2.png.json │ │ ├── test_document_to_html_fake_id.png.json │ │ ├── test_document_to_html_form.png.json │ │ ├── test_document_to_html_form_1005.png.json │ │ ├── test_document_to_html_in-table-title.png.json │ │ ├── test_document_to_html_matrix.png.json │ │ ├── test_document_to_html_patient_intake_form_sample.png.json │ │ ├── test_document_to_html_paystub.jpg.json │ │ ├── test_document_to_html_paystub_header.png.json │ │ ├── test_document_to_html_paystub_single_table.png.json │ │ ├── test_document_to_html_paystub_tables.png.json │ │ ├── test_document_to_html_reading_order.pdf.json │ │ ├── test_document_to_html_receipt.jpg.json │ │ ├── test_document_to_html_sample-invoice.pdf.json │ │ ├── test_document_to_html_screenshot.png.json │ │ ├── test_document_to_html_single-page-1.png.json │ │ ├── test_document_to_html_single-page-2.png.json │ │ ├── test_document_to_html_test.png.json │ │ ├── test_document_to_html_textractor-singlepage-doc.pdf.json │ │ ├── test_document_to_html_tutorial.pdf.json │ │ ├── test_document_to_markdown_amzn_q2.png.json │ │ ├── test_document_to_markdown_fake_id.png.json │ │ ├── test_document_to_markdown_form.png.json │ │ ├── test_document_to_markdown_form_1005.png.json │ │ ├── test_document_to_markdown_in-table-title.png.json │ │ ├── test_document_to_markdown_matrix.png.json │ │ ├── test_document_to_markdown_patient_intake_form_sample.png.json │ │ ├── test_document_to_markdown_paystub_header.png.json │ │ ├── test_document_to_markdown_paystub_single_table.png.json │ 
│ ├── test_document_to_markdown_paystub_tables.png.json │ │ ├── test_document_to_markdown_reading_order.pdf.json │ │ ├── test_document_to_markdown_receipt.jpg.json │ │ ├── test_document_to_markdown_sample-invoice.pdf.json │ │ ├── test_document_to_markdown_screenshot.png.json │ │ ├── test_document_to_markdown_single-page-1.png.json │ │ ├── test_document_to_markdown_single-page-2.png.json │ │ ├── test_document_to_markdown_test.png.json │ │ ├── test_document_to_markdown_textractor-singlepage-doc.pdf.json │ │ ├── test_figure_layout_prefixes_and_suffixes_in_text_words.json │ │ ├── test_layout.json │ │ ├── test_page.json │ │ ├── test_queries_as_strings.json │ │ ├── test_signature.json │ │ ├── test_table.json │ │ ├── test_table_prefixes_and_suffixes_in_text.json │ │ ├── test_table_prefixes_and_suffixes_in_words.json │ │ ├── test_table_with_title_and_footers.json │ │ ├── test_textractor_analyze_document.json │ │ ├── test_textractor_analyze_document_local_pillow_image.json │ │ ├── test_textractor_analyze_document_multipage_pdf.json │ │ ├── test_textractor_analyze_document_pillow_image_list.json │ │ ├── test_textractor_s3_image_input.json │ │ ├── test_textractor_start_document_analysis.json │ │ ├── test_textractor_start_document_analysis_multipage_pdf_s3.json │ │ ├── test_textractor_start_document_text_detection.json │ │ ├── test_textractor_start_document_text_detection_multipage_pdf_s3.json │ │ └── test_word_ordering_in_cell.json │ ├── screenshot.png │ ├── signature.jpg │ ├── single-page-1.png │ ├── single-page-2.png │ ├── test.png │ ├── textractor-multipage-doc.pdf │ ├── textractor-singlepage-doc.pdf │ ├── titanic.webp │ ├── tutorial.pdf │ ├── vbat.png │ └── vbat2.png ├── invoice_sample.pdf ├── test_analyze_expense.py ├── test_analyze_id.py ├── test_bbox.py ├── test_document.py ├── test_get_text_and_words.py ├── test_key_value.py ├── test_layout.py ├── test_line.py ├── test_page.py ├── test_parse_no_fail.py ├── test_queries.py ├── test_selection_element.py ├── 
test_signature.py ├── test_table.py ├── test_textractor.py ├── test_textractor_cli.py ├── test_value.py ├── test_visualizer.py ├── test_word.py ├── test_word_ordering.py └── utils.py ├── textractor ├── __init__.py ├── cli │ ├── __init__.py │ └── cli.py ├── data │ ├── __init__.py │ ├── constants.py │ ├── html_linearization_config.py │ ├── markdown_linearization_config.py │ └── text_linearization_config.py ├── entities │ ├── __init__.py │ ├── bbox.py │ ├── document.py │ ├── document_entity.py │ ├── expense_document.py │ ├── expense_field.py │ ├── identity_document.py │ ├── identity_field.py │ ├── key_value.py │ ├── layout.py │ ├── lazy_document.py │ ├── line.py │ ├── linearizable.py │ ├── page.py │ ├── page_layout.py │ ├── query.py │ ├── query_result.py │ ├── selection_element.py │ ├── signature.py │ ├── table.py │ ├── table_cell.py │ ├── table_footer.py │ ├── table_title.py │ ├── value.py │ └── word.py ├── exceptions.py ├── parsers │ ├── __init__.py │ └── response_parser.py ├── textractor.py ├── utils │ ├── __init__.py │ ├── geometry_util.py │ ├── html_utils.py │ ├── legacy_utils.py │ ├── pdf_utils.py │ ├── results_utils.py │ ├── s3_utils.py │ ├── search_utils.py │ └── text_utils.py └── visualizers │ ├── __init__.py │ ├── arial.ttf │ └── entitylist.py ├── tpipelinegeofinder ├── LICENSE ├── Manifest.in ├── README.md ├── geofinder-sample-notebook.ipynb ├── setup.cfg ├── setup.py ├── tests │ ├── data │ │ ├── multi_page_example_file.json │ │ ├── multi_page_example_file.pdf │ │ ├── patient_intake_form_sample.jpg │ │ ├── patient_intake_form_sample.json │ │ ├── test_sample.json │ │ └── tquery_samples.json │ ├── test_ocrdb.py │ ├── test_tgeofinder.py │ └── test_tword.py └── textractgeofinder │ ├── __init__.py │ ├── _version.py │ ├── ocrdb.py │ ├── sample_patient_intake_form_parser.py │ ├── tgeofinder.py │ ├── tinterface.py │ └── tword.py └── tpipelinepagedimensions ├── LICENSE ├── Manifest.in ├── README.md ├── setup.cfg ├── setup.py ├── tests ├── data │ └── 
Textract-orginal-2021-05-10.png └── test_pagedimensions.py └── textractpagedimensions ├── __init__.py ├── _version.py └── t_pagedimensions.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | 6 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. 7 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | 3 | on: 4 | push: 5 | branches: [ 'master' ] 6 | pull_request: 7 | 8 | workflow_dispatch: 9 | 10 | # Compile the docs and deploy to GitHub pages 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | # Checks out the repository 17 | - uses: actions/checkout@v3 18 | with: 19 | ref: 'master' 20 | 21 | - name: Install pandoc 22 | run: sudo apt-get install -y pandoc 23 | 24 | # Upgrade pip 25 | - name: Upgrade pip 26 | run: | 27 | # install pip=>20.1 to use "pip cache dir" 28 | python3 -m pip install --upgrade pip 29 | 30 | # Cache dependencies 31 | - name: Get pip cache dir 32 | id: pip-cache 33 | run: echo "dir=$(pip cache dir)" >> "$GITHUB_OUTPUT" 34 | 35 | - name: Cache dependencies 36 | uses: actions/cache@v4 37 | with: 38 | path: ${{ steps.pip-cache.outputs.dir }} 39 | key: ${{ runner.os }}-pip-${{ hashFiles('**/extras/docs.txt') }} 40 | restore-keys: | 41 | ${{ runner.os }}-pip- 42 | 43 | # Install base dependencies 44 | - name: Install dependencies 45 | run: python3 -m pip install -r requirements.txt 46 | 47 | # Install sphinx 48 | - 
name: Install dependencies 49 | run: python3 -m pip install -r ./extras/docs.txt 50 | 51 | # Make docs 52 | - name: Build docs 53 | run: cd docs && make html 54 | 55 | # Deploy 56 | - name: Deploy 57 | uses: peaceiris/actions-gh-pages@v3 58 | with: 59 | github_token: ${{ secrets.GITHUB_TOKEN }} 60 | publish_dir: ./docs/build/html/ 61 | -------------------------------------------------------------------------------- /.github/workflows/release-caller.yml: -------------------------------------------------------------------------------- 1 | name: Release Caller 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Main release"] 6 | types: [completed] 7 | 8 | workflow_dispatch: 9 | 10 | # Package and upload the Python package 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | # Checks out the repository 17 | - uses: actions/checkout@v3 18 | with: 19 | ref: "master" 20 | 21 | # Upgrade pip 22 | - name: Upgrade pip 23 | run: | 24 | # install pip=>20.1 to use "pip cache dir" 25 | python3 -m pip install --upgrade pip 26 | 27 | # Cache dependencies 28 | - name: Get pip cache dir 29 | id: pip-cache 30 | run: echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT 31 | 32 | - name: Cache dependencies 33 | uses: actions/cache@v3 34 | with: 35 | path: ${{ steps.pip-cache.outputs.dir }} 36 | key: ${{ runner.os }}-pip-${{ hashFiles('**/extras/docs.txt') }} 37 | restore-keys: | 38 | ${{ runner.os }}-pip- 39 | 40 | # Install twine 41 | - name: Install dependencies 42 | run: python3 -m pip install twine 43 | 44 | # Make docs 45 | - name: Build sdist and wheels 46 | run: | 47 | cd caller 48 | python3 setup.py bdist_wheel 49 | python3 setup.py sdist 50 | mv dist .. 
51 | 52 | # Upload to PyPI 53 | - name: Publish distribution to PyPI 54 | uses: pypa/gh-action-pypi-publish@release/v1 55 | with: 56 | password: ${{ secrets.PYPI_API_KEY_CALLER }} 57 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Main release"] 6 | types: [completed] 7 | 8 | workflow_dispatch: 9 | 10 | # Package and upload the Python package 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | # Checks out the repository 17 | - uses: actions/checkout@v3 18 | with: 19 | ref: "master" 20 | 21 | # Upgrade pip 22 | - name: Upgrade pip 23 | run: | 24 | # install pip>=20.1 to use "pip cache dir" 25 | python3 -m pip install --upgrade pip 26 | 27 | # Cache dependencies 28 | - name: Get pip cache dir 29 | id: pip-cache 30 | run: echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT 31 | 32 | - name: Cache dependencies 33 | uses: actions/cache@v3 34 | with: 35 | path: ${{ steps.pip-cache.outputs.dir }} 36 | key: ${{ runner.os }}-pip-${{ hashFiles('**/extras/docs.txt') }} 37 | restore-keys: | 38 | ${{ runner.os }}-pip- 39 | 40 | # Install twine 41 | - name: Install dependencies 42 | run: python3 -m pip install twine 43 | 44 | # Build the source distribution and wheel 45 | - name: Build sdist and wheels 46 | run: | 47 | python3 setup.py bdist_wheel 48 | python3 setup.py sdist 49 | 50 | # Upload to PyPI 51 | - name: Publish distribution to PyPI 52 | uses: pypa/gh-action-pypi-publish@release/v1 53 | with: 54 | password: ${{ secrets.PYPI_API_KEY_TEXTRACTOR }} 55 | -------------------------------------------------------------------------------- /.github/workflows/test-pr-caller.yml: -------------------------------------------------------------------------------- 1 | # Controls when the action will run. 
Triggers the workflow on pull requests 2 | # that match the `caller` path filter, or on manual dispatch 3 | name: Test-Pull-Request-Caller 4 | on: 5 | pull_request: 6 | paths: 7 | - caller 8 | workflow_dispatch: {} 9 | 10 | # Run the tests 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 17 | defaults: 18 | run: 19 | working-directory: ./caller 20 | permissions: 21 | id-token: write 22 | contents: write 23 | 24 | steps: 25 | # Checks out the repository 26 | - uses: actions/checkout@v3 27 | - name: configure aws credentials 28 | uses: aws-actions/configure-aws-credentials@v1-node16 29 | with: 30 | role-to-assume: arn:aws:iam::913165245630:role/GithubActionsOIDC-Role-1U7IPQFU9Q8RS 31 | role-duration-seconds: 900 # the ttl of the session, in seconds. 32 | aws-region: us-east-1 # use your region here. 33 | - name: Set up Python ${{ matrix.python-version }} 34 | uses: actions/setup-python@v4 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | 38 | # Install package locally 39 | - name: Install package 40 | run: python -m pip install -e . 41 | 42 | # Install dev dependencies 43 | - name: Install dependencies 44 | run: | 45 | python -m pip install --upgrade pip 46 | python -m pip install pytest 47 | # Run tests 48 | - name: Test 49 | run: pytest 50 | -------------------------------------------------------------------------------- /.github/workflows/test-pr-geofinder.yml: -------------------------------------------------------------------------------- 1 | # Controls when the action will run. 
Triggers the workflow on push or pull request 2 | # events but only for the main branch and changes in folder src-python 3 | name: Test-Pull-Request-Geofinder 4 | on: 5 | pull_request: 6 | paths: 7 | - tpipelinegeofinder 8 | workflow_dispatch: 9 | 10 | # Run the tests 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 17 | defaults: 18 | run: 19 | working-directory: ./tpipelinegeofinder 20 | 21 | steps: 22 | # Checks out the repository 23 | - uses: actions/checkout@v3 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | 29 | # Install package locally 30 | - name: Install package 31 | run: python -m pip install -e . 32 | 33 | # Install dev dependencies 34 | - name: Install dependencies 35 | run: | 36 | python -m pip install --upgrade pip 37 | pip install pytest 38 | # Run tests 39 | - name: Test 40 | run: pytest 41 | -------------------------------------------------------------------------------- /.github/workflows/test-pr-prettyprinter.yml: -------------------------------------------------------------------------------- 1 | # Controls when the action will run. 
Triggers the workflow on push or pull request 2 | # events but only for the main branch and changes in folder src-python 3 | name: Test-Pull-Request-PrettyPrinter 4 | on: 5 | pull_request: 6 | paths: 7 | - prettyprinter 8 | 9 | workflow_dispatch: {} 10 | 11 | # Run the tests 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 18 | defaults: 19 | run: 20 | working-directory: ./prettyprinter 21 | steps: 22 | # Checks out the repository 23 | - uses: actions/checkout@v3 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | ref: ${{ github.event.pull_request.head.ref }} 29 | repository: ${{ github.event.pull_request.head.repo.full_name }} 30 | # Install package locally 31 | - name: Install package 32 | run: python -m pip install -e . 33 | # Install dev dependencies 34 | - name: Install dependencies 35 | run: | 36 | python -m pip install --upgrade pip 37 | python -m pip install pytest 38 | # Run tests 39 | - name: Test 40 | run: pytest 41 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | # TODO: Change the branch to master once merged. 
5 | push: 6 | branches: [ 'master' ] 7 | pull_request: 8 | 9 | workflow_dispatch: 10 | 11 | # Run the tests 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | # Checks out the repository 18 | - uses: actions/checkout@v3 19 | with: 20 | ref: 'master' 21 | 22 | # Upgrade pip 23 | - name: Upgrade pip 24 | run: | 25 | # install pip=>20.1 to use "pip cache dir" 26 | python3 -m pip install --upgrade pip 27 | 28 | # Cache dependencies 29 | - name: Get pip cache dir 30 | id: pip-cache 31 | run: echo "dir=$(pip cache dir)" >> "$GITHUB_OUTPUT" 32 | 33 | - name: Cache dependencies 34 | uses: actions/cache@v4 35 | with: 36 | path: ${{ steps.pip-cache.outputs.dir }} 37 | key: ${{ runner.os }}-pip-${{ hashFiles('**/extras/docs.txt') }} 38 | restore-keys: | 39 | ${{ runner.os }}-pip- 40 | 41 | # Install base dependencies 42 | - name: Install dependencies 43 | run: python3 -m pip install -r requirements.txt 44 | 45 | # Install dev dependencies 46 | - name: Install dependencies 47 | run: python3 -m pip install -r ./extras/dev.txt 48 | 49 | # Run tests 50 | - name: Test 51 | run: pytest tests/ 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | .idea/* 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # dotenv 88 | .env 89 | 90 | # virtualenv 91 | .venv 92 | venv/ 93 | ENV/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # remove test with fixed input-document locations on S3 till we add a Textract and S3 mock 109 | test_local* 110 | 111 | share/python-wheels/ 112 | *.egg-info/ 113 | .installed.cfg 114 | *.egg 115 | MANIFEST 116 | .# 117 | 118 | .dir-locals.el 119 | 120 | .vscode 121 | 122 | .envrc 123 | env/ 124 | env2/ 125 | env3/ 126 | env4/ 127 | env5/ 128 | *.csv 129 | lambda_layer/* 130 | textractor.zip 131 | python/* 132 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style = pep8 3 | spaces_before_comment = 4 4 | split_before_logical_operator = true 5 | column_limit: 120 6 | -------------------------------------------------------------------------------- /.yapfignore: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/.yapfignore -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: Amazon Textractor 6 | message: >- 7 | If you use this software, please cite it using the 8 | metadata from this file. 9 | type: software 10 | authors: 11 | - given-names: Edouard 12 | family-names: Belval 13 | affiliation: AWS AI 14 | - given-names: Thomas 15 | family-names: Delteil 16 | affiliation: AWS AI 17 | - given-names: Martin 18 | family-names: Schade 19 | affiliation: AWS AI 20 | - given-names: Srividhya 21 | family-names: Radhakrishna 22 | affiliation: AWS AI 23 | repository-code: 'https://github.com/aws-samples/amazon-textract-textractor' 24 | url: 'https://aws-samples.github.io/amazon-textract-textractor/' 25 | license: Apache-2.0 -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. 
Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/aws-samples/amazon-textract-textractor/issues), or [recently closed](https://github.com/aws-samples/amazon-textract-textractor/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. 
Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws-samples/amazon-textract-textractor/labels/help%20wanted) issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](https://github.com/aws-samples/amazon-textract-textractor/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 
62 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | recursive-include extras *.txt 3 | recursive-include textractor * -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Amazon Textract Textractor 2 | Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /caller/Manifest.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | 3 | recursive-exclude * __pycache__ 4 | recursive-exclude * *.py[co] test_*.py 5 | 6 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif -------------------------------------------------------------------------------- /caller/setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.2.2 3 | commit = False 4 | tag = False 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:textractcaller/_version.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | 20 | [aliases] 21 | -------------------------------------------------------------------------------- /caller/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from setuptools import setup, find_packages 4 | 5 | 6 | def read(fname): 7 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 8 | 9 | 10 | requirements = ['boto3>=1.26.35', 'botocore', 
'amazon-textract-response-parser>=0.1.39'] 11 | 12 | if sys.argv[-1] == 'publish-test': 13 | os.system(f"cd {os.path.dirname(__file__)}") 14 | os.system('rm -rf dist/ build/ amazon_textract_caller.egg-info/') 15 | os.system('python setup.py sdist bdist_wheel') 16 | os.system('twine check dist/*') 17 | os.system('twine upload --repository pypitest dist/*') 18 | sys.exit() 19 | 20 | if sys.argv[-1] == 'publish': 21 | os.system(f"cd {os.path.dirname(__file__)}") 22 | os.system('rm -rf dist/ build/ amazon_textract_caller.egg-info/') 23 | os.system('python setup.py sdist bdist_wheel') 24 | os.system('twine check dist/*') 25 | os.system('twine upload --repository pypi dist/*') 26 | sys.exit() 27 | 28 | setup(name='amazon-textract-caller', 29 | packages=find_packages(exclude=['tests']), 30 | include_package_data=True, 31 | exclude_package_data={"": ["test_*.py", "__pycache__"]}, 32 | version='0.2.4', 33 | description='Amazon Textract Caller tools', 34 | install_requires=requirements, 35 | extras_require={'testing': ['amazon-textract-response-parser', 'pytest']}, 36 | long_description_content_type='text/markdown', 37 | long_description=read('README.md'), 38 | author='Amazon Rekognition Textract Demoes', 39 | author_email='rekognition-textract-demos@amazon.com', 40 | url='https://github.com/aws-samples/amazon-textract-textractor/tree/master/caller', 41 | keywords='amazon-textract-textractor amazon textract textractor helper caller', 42 | license="Apache License Version 2.0", 43 | classifiers=[ 44 | "Development Status :: 4 - Beta", 45 | "Topic :: Utilities", 46 | 'License :: OSI Approved :: Apache Software License', 47 | 'Programming Language :: Python :: 3.8', 48 | 'Programming Language :: Python :: 3.9', 49 | 'Programming Language :: Python :: 3.10', 50 | 'Programming Language :: Python :: 3.11', 51 | 'Programming Language :: Python :: 3.12', 52 | ], 53 | python_requires='>=3.6') 54 | -------------------------------------------------------------------------------- 
/caller/tests/data/driverlicense.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/caller/tests/data/driverlicense.png -------------------------------------------------------------------------------- /caller/tests/data/employmentapp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/caller/tests/data/employmentapp.png -------------------------------------------------------------------------------- /caller/tests/data/employmentapp.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/caller/tests/data/employmentapp.tiff -------------------------------------------------------------------------------- /caller/tests/data/multi_page_tiff.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/caller/tests/data/multi_page_tiff.tiff -------------------------------------------------------------------------------- /caller/tests/data/verification-of-employment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/caller/tests/data/verification-of-employment.png -------------------------------------------------------------------------------- /caller/textractcaller/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import __version__ 2 | from .t_call import NotificationChannel, OutputConfig, 
DocumentLocation, Document, get_job_response, get_full_json_from_output_config, get_full_json, call_textract, Textract_Features, call_textract_analyzeid, DocumentPage, QueriesConfig, Query, AdaptersConfig, Adapter, call_textract_expense, Textract_Call_Mode, Textract_API, Textract_Types, call_textract_lending, get_full_json_lending, get_full_json_lending_from_output_config, get_s3_output_config_keys 3 | 4 | import logging 5 | from logging import NullHandler 6 | 7 | logging.getLogger(__name__).addHandler(NullHandler()) 8 | -------------------------------------------------------------------------------- /caller/textractcaller/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.2.2' 2 | 3 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/commandline.rst: -------------------------------------------------------------------------------- 1 | CLI 2 | === 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | Textractor comes with its very own command line interface that aims to be easier to use than the default `boto3` interface by adding several quality of life improvements. 8 | 9 | First install the package using :code:`pip install amazon-textract-textractor` and make sure that your Python bin directory is added to PATH, otherwise it will not find the executable. If you are not using a virtual environment this will probably be the case.
10 | 11 | Available APIs 12 | ______________ 13 | 14 | :code:`Textractor` supports all Textract APIs and follows their official names as described here: https://docs.aws.amazon.com/textract/latest/dg/API_Operations.html. We use a single subcommand to fetch the results named :code:`GetResult`. 15 | 16 | Synchronous APIs: 17 | 18 | - DetectDocumentText/detect-document-text (Returns words and lines) 19 | - AnalyzeDocument/analyze-document (Returns Forms, Tables and Query results) 20 | - AnalyzeExpense/analyze-expense (Returns standardized fields for invoices) 21 | - AnalyzeID/analyze-id (Returns standardized fields for driver's license and passports) 22 | 23 | Asynchronous APIs: 24 | 25 | - StartDocumentTextDetection/start-document-text-detection 26 | - StartDocumentAnalysis/start-document-analysis 27 | - StartExpenseAnalysis/start-expense-analysis 28 | 29 | Getting document text 30 | _____________________ 31 | 32 | Now let's say you have a file and you wish to run OCR on it: 33 | 34 | :code:`textractor detect-document-text your_file.png output.json` 35 | 36 | This will call the Textract API and save the output to :code:`output.json`. You could use the Textractor python module to post-process those responses afterwards. 37 | 38 | Processing a directory of files 39 | _______________________________ 40 | 41 | Now suppose that, instead of a single file, you wish to process an entire directory of files. You could call the above on every file in the directory, but this would prove to be a very long process. Instead you can leverage Textract's ability to scale to your workload using the asynchronous API. 42 | 43 | :code:`ls your_dir/ | xargs -I{} textractor start-document-text-detection {} --s3-upload-path s3://your-bucket/your-prefix/{}` 44 | 45 | You can also parallelize it simply by adding -P8 (for 8 concurrent processes).
46 | 47 | :code:`ls your_dir/ | xargs -P8 -I{} textractor start-document-text-detection {} --s3-upload-path s3://your-bucket/your-prefix/{} > output.txt` 48 | 49 | You will notice that all you have in output.txt are UUID like this: :code:`628e39089ffa1b52d62d980ec1cf4f62cb7f785c83a708b2e17ebaaf21ad0d61`. Those are JobIDs and can be used to fetch the output of asynchronous operations. 50 | 51 | Wait a few minutes (depending on the number of files you processed) and then fetch the result with :code:`GetResult`. 52 | 53 | :code:`cat output.txt | xargs -I{} textractor get-result {} DETECT_TEXT {}.json` 54 | 55 | Using :code:`-P8` would make the above faster, but be careful not to increase the concurrent process count too much as you might run into rate limiting issues (See https://docs.aws.amazon.com/textract/latest/dg/limits.html for more details). 56 | 57 | Visualizing the output 58 | ______________________ 59 | 60 | The :code:`textractor` CLI allows you to overlay the output of Amazon Textract on top of an image for troubleshooting. It is only available for synchronous APIs (DetectDocumentText, AnalyzeDocument) and allows you to visualize words, lines, key and values, and tables. 61 | 62 | In this example we will overlay words and tables on top of the :code:`tests/fixtures/amzn_q2.png` file. The image will be created in the same directory as the :code:`output.json` file under the name :code:`output.json.png`. 63 | 64 | :code:`textractor analyze-document tests/fixtures/amzn_q2.png output.json --features TABLES --overlay WORDS TABLES` 65 | 66 | This will yield the following (click to enlarge): 67 | 68 | .. image:: overlayer.png 69 | :width: 600 70 | :alt: Overlayer output 71 | 72 | This document has a lot of small words, making it difficult to read. You can add :code:`--font-size-ratio` to the command to increase the font size.
73 | 74 | :code:`textractor analyze-document tests/fixtures/amzn_q2.png output.json --features TABLES --overlay WORDS TABLES --font-size-ratio 1.0` (default is 0.75) 75 | 76 | .. image:: overlayer_bigger.png 77 | :width: 600 78 | :alt: Overlayer output bigger 79 | 80 | Reference 81 | _________ 82 | 83 | .. argparse:: 84 | :ref: textractor.cli.cli._build_parser 85 | :prog: textractor -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("../../")) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = "amazon-textract-textractor" 22 | copyright = "2022, Amazon" 23 | author = "Edouard Belval" 24 | 25 | # The full version, including alpha/beta/rc tags 26 | release = "1.0.0" 27 | html_favicon = "favicon.ico" 28 | 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones.
34 | extensions = [ 35 | "sphinx.ext.autodoc", 36 | "nbsphinx", 37 | "sphinxarg.ext", 38 | ] 39 | 40 | # Add any paths that contain templates here, relative to this directory. 41 | templates_path = ["_templates"] 42 | 43 | # List of patterns, relative to source directory, that match files and 44 | # directories to ignore when looking for source files. 45 | # This pattern also affects html_static_path and html_extra_path. 46 | exclude_patterns = [] 47 | 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | 51 | # The theme to use for HTML and HTML Help pages. See the documentation for 52 | # a list of builtin themes. 53 | # 54 | html_theme = "sphinx_rtd_theme" 55 | 56 | # Add any paths that contain custom static files (such as style sheets) here, 57 | # relative to this directory. They are copied after the builtin static files, 58 | # so a file named "default.css" will overwrite the builtin "default.css". 59 | html_static_path = ["_static"] 60 | -------------------------------------------------------------------------------- /docs/source/examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ======== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | 7 | notebooks/simple_ocr 8 | notebooks/parsing_an_existing_response 9 | notebooks/introduction_to_searching 10 | notebooks/visualizing_results 11 | notebooks/finding_words_within_a_document 12 | notebooks/exporting_form_data 13 | notebooks/table_data_to_various_formats 14 | notebooks/using_analyze_expense 15 | notebooks/using_analyze_id 16 | notebooks/using_queries 17 | notebooks/layout_analysis 18 | notebooks/tabular_data_linearization 19 | notebooks/tabular_data_linearization_continued 20 | notebooks/layout_analysis_for_text_linearization 21 | notebooks/document_linearization_to_markdown_or_html 22 | notebooks/textractor_for_large_language_models 23 | notebooks/interfacing_with_trp2 24 | notebooks/signature_detection 25 | notebooks/going_further 26 | -------------------------------------------------------------------------------- /docs/source/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/favicon.ico -------------------------------------------------------------------------------- /docs/source/images/lambda_tutorial/1b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/images/lambda_tutorial/1b.png -------------------------------------------------------------------------------- /docs/source/images/lambda_tutorial/1c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/images/lambda_tutorial/1c.png -------------------------------------------------------------------------------- /docs/source/images/lambda_tutorial/2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/images/lambda_tutorial/2.png -------------------------------------------------------------------------------- /docs/source/images/lambda_tutorial/2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/images/lambda_tutorial/2a.png -------------------------------------------------------------------------------- /docs/source/images/lambda_tutorial/2b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/images/lambda_tutorial/2b.png -------------------------------------------------------------------------------- /docs/source/images/lambda_tutorial/3a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/images/lambda_tutorial/3a.png -------------------------------------------------------------------------------- /docs/source/images/lambda_tutorial/3c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/images/lambda_tutorial/3c.png -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Textractor Documentation 2 | ======================== 3 | 4 | .. 
image:: textractor_cropped.png 5 | :alt: Textractor 6 | 7 | **Textractor** is a python package created to seamlessly work with 4 popular `Amazon Textract `_ 8 | APIs. These are the DetectDocumentText, StartDocumentTextDetection, AnalyzeDocument and StartDocumentAnalysis endpoints. The package contains utilities to call Textract services, 9 | convert JSON responses from API calls to programmable objects, visualize entities on the document and export document data in compatible formats. 10 | It is intended to aid Textract customers in setting up their post-processing pipelines. 11 | 12 | Previous work in this space has been made available in the following packages: 13 | 14 | 1. `amazon-textract-caller `_ (to call textract without the explicit use of boto3) 15 | 16 | 2. `amazon-textract-response-parser `_ (to parse the JSON response returned by Textract APIs) 17 | 18 | 3. `amazon-textract-overlayer `_ (to draw bounding boxes around the document entities on the document image) 19 | 20 | 4. `amazon-textract-prettyprinter `_ (to string represent document entities) 21 | 22 | 5. `amazon-textract-directional_finder `_ (to perform geometric search on the document) 23 | 24 | 25 | The `amazon-textract-caller `_ has been used as a dependency within this package 26 | with a wrapper around it to reduce the number of parameters the customer needs to pass. Additionally, newer input formats for the 27 | document have been provisioned with this package. 28 | 29 | The remaining packages have been refactored within this new package but the prominent functionalities are all made available to not disrupt 30 | the requirements of the customer. 31 | 32 | This package also hosts newer features that haven't previously been implemented in existing packages. These include: 33 | 34 | a. Semantic Document Search 35 | 36 | b. Query for key-values using keys 37 | 38 | c. Table access with numpy indexing 39 | 40 | d. New export formats with excel, csv and txt 41 | 42 | e.
Indication of duplicated document entities 43 | 44 | f. Availability of all the above at :class:`Document` and :class:`Page` level. 45 | 46 | 47 | .. toctree:: 48 | :maxdepth: 4 49 | 50 | Usage 51 | ===== 52 | .. toctree:: 53 | :maxdepth: 2 54 | 55 | installation 56 | using_in_lambda 57 | examples 58 | commandline 59 | 60 | API Reference 61 | ============= 62 | 63 | .. toctree:: 64 | :maxdepth: 4 65 | 66 | textractor 67 | textractor.parsers 68 | textractor.entities 69 | textractor.visualizers 70 | textractor.data.constants 71 | textractor.data.text_linearization_config 72 | 73 | -------------------------------------------------------------------------------- /docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | 7 | Official package 8 | ____________________________________ 9 | 10 | Textractor is available on PyPI and can be installed with :code:`pip install amazon-textract-textractor`. By default this will install the minimal version of textractor. The following extras can be used to add features: 11 | 12 | - :code:`pdfium` (:code:`pip install amazon-textract-textractor[pdfium]`) includes :code:`pypdfium2` and is the recommended way to enable PDF rasterization in Textractor. Note that this is **not** necessary to call Textract with a PDF file. 13 | - :code:`pdf` (:code:`pip install amazon-textract-textractor[pdf]`) includes :code:`pdf2image` and is an additional way to enable PDF rasterization in Textractor. Note that this is **not** necessary to call Textract with a PDF file. 14 | - :code:`torch` (:code:`pip install amazon-textract-textractor[torch]`) includes :code:`sentence_transformers` for better word search and matching. This will work on CPU but be noticeably slower than non-machine learning based approaches. 
15 | - :code:`dev` (:code:`pip install amazon-textract-textractor[dev]`) includes all the dependencies above and everything else needed to test the code. 16 | 17 | You can pick several extras by separating the labels with commas like this :code:`pip install amazon-textract-textractor[pdf,torch]`. 18 | 19 | From Source 20 | ___________ 21 | 22 | To install the package, clone the repository with the following command - 23 | 24 | :code:`git clone git@github.com:aws-samples/amazon-textract-textractor.git` 25 | 26 | Navigate into the amazon-textract-textractor directory on the terminal and run these commands. 27 | 28 | To install requirements :code:`pip install -r requirements.txt` 29 | 30 | Then install the package with :code:`pip install -e .` 31 | 32 | Try it out 33 | ___________ 34 | 35 | The :file:`Demo.ipynb` can be used as a reference to understand some functionalities hosted by the package. 36 | Additionally, `docs/tests/notebooks/` have some tutorials you can try out. -------------------------------------------------------------------------------- /docs/source/notebooks/imgs/excel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/notebooks/imgs/excel.png -------------------------------------------------------------------------------- /docs/source/notebooks/interfacing_with_trp2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f3801162", 6 | "metadata": {}, 7 | "source": [ 8 | "# Interfacing with trp2\n", 9 | "\n", 10 | "The Textract response parser was the preferred way of handling Textract API output before the release of Textractor. 
If your current workflow uses the older library, you can easily reuse their functions through the compatibility API.\n", 11 | "\n", 12 | "## Installation\n", 13 | "\n", 14 | "To begin, install the `amazon-textract-textractor` package using pip.\n", 15 | "\n", 16 | "`pip install amazon-textract-textractor`\n", 17 | "\n", 18 | "There are various sets of dependencies available to tailor your installation to your use case. The base package will have sensible default, but you may want to install the PDF extra dependencies if your workflow uses PDFs with `pip install amazon-textract-textractor[pdfium]`. You can read more on extra dependencies [in the documentation](https://aws-samples.github.io/amazon-textract-textractor/installation.html)\n", 19 | "\n", 20 | "## Calling Textract" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "id": "47ea794e", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from textractor import Textractor\n", 31 | "\n", 32 | "extractor = Textractor(profile_name=\"default\")\n", 33 | "# This path assumes that you are running the notebook from docs/source/notebooks\n", 34 | "document = extractor.detect_document_text(\"../../../tests/fixtures/form.png\")" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "id": "7231472c", 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "This document holds the following data:\n", 47 | "Pages - 1\n", 48 | "Words - 259\n", 49 | "Lines - 74\n", 50 | "Key-values - 0\n", 51 | "Checkboxes - 0\n", 52 | "Tables - 0\n", 53 | "Identity Documents - 0" 54 | ] 55 | }, 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "document" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "14b4052c", 68 | "metadata": {}, 69 | "source": [ 70 | "## Getting the trp2 document\n", 71 | "\n", 72 | "All `Document` objects have a convenience function 
`to_trp2()` that is a shorthand for `TDocumentSchema().load(document.response)` and creates a matching trp2 document. Note that this behaves as a converter, not as a proxy so any changes done on the `TDocument` will not be passed to the `Document` object." 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "id": "a9b36794", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "trp2_document = document.to_trp2()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "57e69a22", 88 | "metadata": {}, 89 | "source": [ 90 | "## Conclusion\n", 91 | "\n", 92 | "Textractor comes with everything you need to reuse components from your current workflow with the newer caller, pretty printer, or directional finder." 93 | ] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": "Python 3 (ipykernel)", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | "nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.10.6" 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 5 117 | } 118 | -------------------------------------------------------------------------------- /docs/source/overlayer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/overlayer.png -------------------------------------------------------------------------------- /docs/source/overlayer_bigger.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/overlayer_bigger.png 
-------------------------------------------------------------------------------- /docs/source/textractor.data.constants.rst: -------------------------------------------------------------------------------- 1 | Constants 2 | ======================= 3 | 4 | 5 | .. automodule:: textractor.data.constants 6 | :members: 7 | :undoc-members: 8 | :show-inheritance: 9 | -------------------------------------------------------------------------------- /docs/source/textractor.data.text_linearization_config.rst: -------------------------------------------------------------------------------- 1 | TextLinearizationConfig 2 | ======================= 3 | 4 | .. automodule:: textractor.data.text_linearization_config 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/textractor.parsers.rst: -------------------------------------------------------------------------------- 1 | Entity Parser 2 | ========================== 3 | 4 | The library is intended to support multiple formats for parsing with a unified underlying object representation. For the Textract customer, 5 | the response_parser function has been created to handle API response parsing for `DetectDocumentText `_, 6 | `AnalyzeDocument `_, `StartDocumentTextDetection `_ and 7 | `StartDocumentAnalysis `_. 8 | 9 | response_parser 10 | --------------- 11 | 12 | .. automodule:: textractor.parsers.response_parser 13 | :members: 14 | :undoc-members: 15 | :show-inheritance: 16 | -------------------------------------------------------------------------------- /docs/source/textractor.rst: -------------------------------------------------------------------------------- 1 | Textract Caller 2 | =============== 3 | 4 | .. 
automodule:: textractor.textractor 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/textractor.visualizers.rst: -------------------------------------------------------------------------------- 1 | Entity Visualization 2 | ===================== 3 | 4 | Most features that return :class:`DocumentEntity` objects are of :class:`EntityList` type. It is an extension of the :code:`list` data type 5 | with the intention of providing visualization features to these entities. 6 | 7 | EntityList 8 | ---------- 9 | 10 | .. automodule:: textractor.visualizers.entitylist 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | -------------------------------------------------------------------------------- /docs/source/textractor_cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/textractor_cropped.png -------------------------------------------------------------------------------- /docs/source/using_in_lambda.rst: -------------------------------------------------------------------------------- 1 | Using Textractor in AWS Lambda 2 | ============================== 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | 7 | Textractor uses Pillow for image manipulation which is a compiled dependency (i.e. not pure Python). 8 | While we encourage you to build your own lambda layers, we received several requests mentioning that the process is tedious, 9 | which is why we also offer precompiled layers as zip files that you can directly upload to lambda. 10 | 11 | The precompiled layers are rebuilt on release and can be downloaded here https://github.com/aws-samples/amazon-textract-textractor/actions/workflows/lambda_layers.yml.
12 | 13 | Step-by-step 14 | ------------ 15 | 16 | We provide a step by step through the AWS Console, but note that proceeding with the AWS CLI would also work. For brevity we assume that you already have an existing lambda. 17 | You can find an excellent guide on how to create a lambda function here: 18 | https://docs.aws.amazon.com/lambda/latest/dg/getting-started.html. Note that your lambda function will need 19 | to have Textract access. Since we are targeting a wide range of use cases we will use the AmazonTextractFullAccess 20 | policy. We recommend that you review your lambda function and tailor the permission to your specific use case. 21 | 22 | 1. Download the precompiled layers from the GitHub Actions workflow. https://github.com/aws-samples/amazon-textract-textractor/actions/workflows/lambda_layers.yml 23 | 24 | a. Navigate to the page 25 | 26 | b. Click on "Lambda Layers" 27 | 28 | .. image:: images/lambda_tutorial/1b.png 29 | 30 | c. Scroll to the bottom of the page and download the package that matches your Python installation. Packages with the `-pdfium` suffix contain `pypdfium2` and allow you to process PDF documents. Packages with the `-pdf` suffix contain `pdf2image` and also allow you to process PDF documents, however we recommend using `pypdfium2` as it does not require any OS-level dependencies. 31 | 32 | .. image:: images/lambda_tutorial/1c.png 33 | 34 | 2. In your AWS Console, navigate to "Lambda" and click "Layers" in the sidebar to the left. 35 | 36 | .. image:: images/lambda_tutorial/2.png 37 | 38 | a. Click "Create layer" 39 | 40 | .. image:: images/lambda_tutorial/2a.png 41 | 42 | b. Fill-in the form and upload the .zip file you downloaded in step 1. 43 | 44 | .. image:: images/lambda_tutorial/2b.png 45 | 46 | c. Click "Create" 47 | 48 | 3. Navigate to your lambda 49 | 50 | a. Scroll down and click "Add a layer" 51 | 52 | .. image:: images/lambda_tutorial/3a.png 53 | 54 | b. 
Choose "Custom layers" and pick your amazon-textract-textractor layer 55 | 56 | c. Click "Add" 57 | 58 | .. image:: images/lambda_tutorial/3c.png 59 | 60 | 4. Update your code to use Textractor 61 | 62 | a. If using the `pdf2image` PDF version you have to update the `PATH` and `LD_LIBRARY_PATH` environment variables through the lambda function configuration interface or directly in code with the `os` module: 63 | 64 | .. code-block:: python 65 | 66 | os.environ["LD_LIBRARY_PATH"] = f"/opt/python/bin/:{os.environ['LD_LIBRARY_PATH']}" 67 | os.environ["PATH"] = f"/opt/python/bin/:{os.environ['PATH']}" -------------------------------------------------------------------------------- /extras/dev.txt: -------------------------------------------------------------------------------- 1 | jupyterlab 2 | pandas 3 | pdf2image>=1.16,<1.17 4 | pytest 5 | lxml 6 | sentence-transformers>=2.2,<2.3 7 | sphinx-rtd-theme>=1.0,<1.1 -------------------------------------------------------------------------------- /extras/docs.txt: -------------------------------------------------------------------------------- 1 | jupyterlab 2 | pandas 3 | pdf2image>=1.16,<1.17 4 | pytest 5 | Sphinx>=5.1,<5.2 6 | nbsphinx>=0.8,<0.9 7 | sphinx-rtd-theme>=2.0,<3.0 8 | sphinx-argparse>=0.5.1 9 | sphinxcontrib-applehelp>=1.0,<1.1 10 | sphinxcontrib-devhelp>=1.0,<1.1 11 | sphinxcontrib-htmlhelp>=2.0,<2.1 12 | sphinxcontrib-jsmath>=1.0,<1.1 13 | sphinxcontrib-qthelp>=1.0,<1.1 14 | sphinxcontrib-serializinghtml>=1.1,<1.2 15 | -------------------------------------------------------------------------------- /extras/pandas.txt: -------------------------------------------------------------------------------- 1 | pandas -------------------------------------------------------------------------------- /extras/pdf.txt: -------------------------------------------------------------------------------- 1 | pdf2image>=1.16,<1.17 -------------------------------------------------------------------------------- 
/extras/pdfium.txt: -------------------------------------------------------------------------------- 1 | pypdfium2 -------------------------------------------------------------------------------- /extras/torch.txt: -------------------------------------------------------------------------------- 1 | sentence-transformers>=2.2,<2.3 -------------------------------------------------------------------------------- /helper/Manifest.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | 3 | recursive-exclude * __pycache__ 4 | recursive-exclude * *.py[co] test_*.py 5 | recursive-include textracthelper/examples * 6 | 7 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif -------------------------------------------------------------------------------- /helper/docs/employmentapp_boxed_FORM_CELL_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/helper/docs/employmentapp_boxed_FORM_CELL_.png -------------------------------------------------------------------------------- /helper/docs/employmentapp_boxed_LINE_TEXT_OVERLAY.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/helper/docs/employmentapp_boxed_LINE_TEXT_OVERLAY.png -------------------------------------------------------------------------------- /helper/docs/employmentapp_boxed_WORD_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/helper/docs/employmentapp_boxed_WORD_.png -------------------------------------------------------------------------------- 
class FontInstaller(install):
    """``install`` command that also copies the bundled Roboto font.

    The font copy is best-effort: every failure is reported as a warning on
    stdout and the regular installation still runs.
    """

    def run(self):
        """Copy the font, then run the standard ``install`` command."""
        self._copy_fonts()
        install.run(self)

    def _copy_fonts(self):
        """Copy ``Roboto-Regular.ttf`` into the platform font directory.

        The target directory is derived from ``WINDIR`` on Windows, from the
        first entry of ``XDG_DATA_DIRS`` (default ``/usr/share``) on Linux and
        is ``~/Library/Fonts`` on macOS. Nothing is copied when the directory
        cannot be determined, does not exist, or already contains the font.
        """
        try:
            import shutil

            # Start unbound-safe: on an unsupported platform, or on Windows
            # without WINDIR, no branch below assigns a directory. Previously
            # that raised NameError, which the bare ``except`` turned into a
            # misleading "issue occurred" warning.
            tgt_dir = None

            if sys.platform == "win32":
                # check the windows font repository
                # NOTE: must use uppercase WINDIR, to work around bugs in
                # 1.5.2's os.environ.get()
                windir = os.environ.get("WINDIR")
                if windir:
                    tgt_dir = os.path.join(windir, "fonts")
            elif sys.platform in ("linux", "linux2"):
                lindirs = os.environ.get("XDG_DATA_DIRS", "")
                if not lindirs:
                    # According to the freedesktop spec, XDG_DATA_DIRS should
                    # default to /usr/share
                    tgt_dir = "/usr/share/fonts"
                else:
                    lindir = lindirs.split(":")[0]
                    tgt_dir = os.path.join(lindir, "fonts")
            elif sys.platform == "darwin":
                tgt_dir = os.path.expanduser("~/Library/Fonts")

            if not tgt_dir or not os.path.isdir(tgt_dir):
                print('WARNING: Could not locate fonts directory. Default font will be used')
            else:
                _src_dir = 'fonts/'
                _font_file = 'Roboto-Regular.ttf'

                # Never overwrite a font that is already installed.
                if _font_file not in os.listdir(tgt_dir):
                    shutil.copyfile(os.path.join(_src_dir, _font_file), os.path.join(tgt_dir, _font_file))

        except Exception:
            # Best-effort by design (e.g. no write permission on the system
            # font directory), but ``Exception`` rather than a bare ``except``
            # so Ctrl-C / SystemExit still abort the installation.
            print('WARNING: An issue occurred while installing the custom fonts. Default font will be used')
packages=find_packages(exclude=['tests']), 76 | include_package_data=True, 77 | exclude_package_data={"": ["test_*.py", "__pycache__"]}, 78 | version='0.0.35', 79 | description='Amazon Textract Helper tools', 80 | install_requires=requirements, 81 | scripts=['bin/amazon-textract'], 82 | long_description_content_type='text/markdown', 83 | long_description=read('README.md'), 84 | author='Amazon Rekognition Textract Demoes', 85 | author_email='rekognition-textract-demos@amazon.com', 86 | url='https://github.com/aws-samples/amazon-textract-textractor/tree/master/helper', 87 | keywords='amazon-textract-textractor amazon textract textractor helper', 88 | license="Apache License Version 2.0", 89 | classifiers=[ 90 | "Development Status :: 4 - Beta", 91 | "Topic :: Utilities", 92 | 'License :: OSI Approved :: Apache Software License', 93 | 'Programming Language :: Python :: 3.6', 94 | 'Programming Language :: Python :: 3.7', 95 | 'Programming Language :: Python :: 3.8', 96 | 'Programming Language :: Python :: 3.9', 97 | 'Programming Language :: Python :: 3.10', 98 | ], 99 | cmdclass={'install': FontInstaller}, 100 | python_requires='>=3.6') 101 | -------------------------------------------------------------------------------- /helper/textracthelper/.gitignore: -------------------------------------------------------------------------------- 1 | test* -------------------------------------------------------------------------------- /helper/textracthelper/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import __version__ 2 | 3 | import logging 4 | from logging import NullHandler 5 | 6 | logging.getLogger(__name__).addHandler(NullHandler()) 7 | -------------------------------------------------------------------------------- /helper/textracthelper/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.35' 2 | 
-------------------------------------------------------------------------------- /helper/textracthelper/examples/employmentapp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/helper/textracthelper/examples/employmentapp.png -------------------------------------------------------------------------------- /idp_cdk_manifest/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | .idea/* 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # dotenv 88 | .env 89 | 90 | # virtualenv 91 | .venv 92 | venv/ 93 | ENV/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # remove test with fixed input-document locations on S3 till we add a Textract and S3 mock 109 | test_local* 110 | 111 | share/python-wheels/ 112 | *.egg-info/ 113 | .installed.cfg 114 | *.egg 115 | MANIFEST 116 | .# 117 | 118 | .dir-locals.el 119 | 120 | .vscode 121 | 122 | .envrc 123 | -------------------------------------------------------------------------------- /idp_cdk_manifest/Manifest.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | 3 | recursive-exclude * __pycache__ 4 | recursive-exclude * *.py[co] test_*.py 5 | 6 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif -------------------------------------------------------------------------------- /idp_cdk_manifest/README.md: -------------------------------------------------------------------------------- 1 | just bla 
def read(fname):
    """Return the text content of *fname*.

    Args:
        fname: Path of the file to read. A relative path is resolved against
            the directory that contains this ``setup.py``.

    Returns:
        The complete file content as a string.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    # The context manager closes the handle deterministically (the previous
    # one-liner left it to the garbage collector), and the explicit encoding
    # keeps the result independent of the build machine's locale.
    with open(path, encoding='utf-8') as handle:
        return handle.read()
upload --repository pypi dist/*') 26 | sys.exit() 27 | 28 | setup(name='amazon-textract-idp-cdk-manifest', 29 | packages=find_packages(exclude=['tests']), 30 | include_package_data=True, 31 | exclude_package_data={"": ["test_*.py", "__pycache__"]}, 32 | version='0.0.2', 33 | description='Amazon Textract IDP CDK Manifest', 34 | install_requires=requirements, 35 | extras_require={'testing': ['pytest']}, 36 | long_description_content_type='text/markdown', 37 | long_description=read('README.md'), 38 | author='Amazon Rekognition Textract Demoes', 39 | author_email='rekognition-textract-demos@amazon.com', 40 | url='https://github.com/aws-samples/amazon-textract-textractor/tree/master/idp_cdk_manifest', 41 | keywords='textract manifest', 42 | license="Apache License Version 2.0", 43 | classifiers=[ 44 | "Development Status :: 4 - Beta", 45 | "Topic :: Utilities", 46 | 'License :: OSI Approved :: Apache Software License', 47 | 'Programming Language :: Python :: 3.7', 48 | 'Programming Language :: Python :: 3.8', 49 | 'Programming Language :: Python :: 3.9', 50 | 'Programming Language :: Python :: 3.10', 51 | 'Programming Language :: Python :: 3.11', 52 | 'Programming Language :: Python :: 3.12', 53 | ], 54 | python_requires='>=3.7') 55 | -------------------------------------------------------------------------------- /idp_cdk_manifest/tests/data/analyze_id.json: -------------------------------------------------------------------------------- 1 | { 2 | "documentPages":[ 3 | "s3://amazon-textract-public-content/blogs/employeeapp20210510.png", 4 | "s3://amazon-textract-public-content/blogs/employeeapp20210510.png" 5 | ], 6 | "classification": "ID_DOCUMENT" 7 | } 8 | -------------------------------------------------------------------------------- /idp_cdk_manifest/tests/data/manifest_all_features.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png", 3 | 
"textractFeatures": [ 4 | "FORMS", 5 | "TABLES", 6 | "QUERIES", 7 | "SIGNATURE" 8 | ], 9 | "queriesConfig": [{ 10 | "text": "What is the applicant full name?", 11 | "alias": "FULL_NAME", 12 | "pages": ["*"] 13 | }], 14 | "classification": "EMPLOYMENT_APPLICATION", 15 | "metaData": [{ 16 | "key": "meta_data_key_1", 17 | "value": "meta_data_value_1" 18 | },{ 19 | "key": "meta_data_key_2", 20 | "value": "meta_data_value_2" 21 | }] 22 | } 23 | -------------------------------------------------------------------------------- /idp_cdk_manifest/tests/data/manifest_default.json: -------------------------------------------------------------------------------- 1 | { 2 | "textractFeatures": [ 3 | "QUERIES" 4 | ], 5 | "queriesConfig": [{ 6 | "text": "What is the applicant full name?", 7 | "alias": "FULL_NAME", 8 | "pages": ["*"] 9 | }] 10 | } 11 | -------------------------------------------------------------------------------- /idp_cdk_manifest/tests/data/manifest_minimal.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png" 3 | } 4 | -------------------------------------------------------------------------------- /idp_cdk_manifest/tests/data/manifest_queries_no_alias.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png", 3 | "textractFeatures": [ 4 | "QUERIES" 5 | ], 6 | "queriesConfig": [{ 7 | "text": "What is the applicant full name?" 8 | }, 9 | { 10 | "text": "What is the applicant last name?" 
11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /idp_cdk_manifest/tests/data/manifest_queries_no_pages.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png", 3 | "textractFeatures": [ 4 | "QUERIES" 5 | ], 6 | "queriesConfig": [{ 7 | "text": "What is the applicant full name?", 8 | "alias": "FULL_NAME" 9 | }, 10 | { 11 | "text": "What is the applicant last name?", 12 | "alias": "LAST_NAME" 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /idp_cdk_manifest/tests/data/manifest_with_classification.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png", 3 | "textractFeatures": [ 4 | "FORMS", 5 | "TABLES", 6 | "QUERIES" 7 | ], 8 | "queriesConfig": [{ 9 | "text": "What is the applicant full name?", 10 | "alias": "FULL_NAME", 11 | "pages": ["*"] 12 | }], 13 | "classification":"EMPLOYMENT_APPLICATION" 14 | } 15 | -------------------------------------------------------------------------------- /idp_cdk_manifest/tests/data/manifest_with_classification_and_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png", 3 | "textractFeatures": [ 4 | "FORMS", 5 | "TABLES", 6 | "QUERIES" 7 | ], 8 | "queriesConfig": [{ 9 | "text": "What is the applicant full name?", 10 | "alias": "FULL_NAME", 11 | "pages": ["*"] 12 | }], 13 | "classification":"EMPLOYMENT_APPLICATION", 14 | "metaData":[ 15 | {"key": "key1", "value": "value1"}, 16 | {"key": "key2", "value": "value2"} 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /idp_cdk_manifest/tests/data/queries_forms.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png", 3 | "textractFeatures": [ 4 | "FORMS", "QUERIES" 5 | ], 6 | "queriesConfig": [{ 7 | "text": "What is the applicant full name?", 8 | "alias": "FULL_NAME" 9 | }, 10 | { 11 | "text": "What is the applicant last name?", 12 | "alias": "LAST_NAME" 13 | } 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /idp_cdk_manifest/tests/data/simple_feature_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png", 3 | "textractFeatures": [ 4 | "FORMS", 5 | "TABLES", 6 | "QUERIES" 7 | ], 8 | "queriesConfig": [{ 9 | "text": "What is the applicant full name?", 10 | "alias": "FULL_NAME", 11 | "pages": ["*"] 12 | }] 13 | } 14 | -------------------------------------------------------------------------------- /idp_cdk_manifest/textractmanifest/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging import NullHandler 3 | from .manifest import IDPManifest as IDPManifest, IDPManifestSchema as IDPManifestSchema, Query as Query, QuerySchema as QuerySchema, MetaData as MetaData, MetaDataSchema as MetaDataSchema 4 | 5 | logging.getLogger('tidpmanifest').addHandler(NullHandler()) 6 | 7 | __version__ = '0.0.1' 8 | -------------------------------------------------------------------------------- /idp_cdk_manifest/textractmanifest/manifest.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | import marshmallow as m 3 | import logging 4 | from typing import List 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class BaseSchema(m.Schema): 10 | """ 11 | skip null values when generating JSON 12 | 
@dataclass
class MetaData():
    """A single key/value metadata entry of a manifest."""
    key: str
    value: str


@dataclass
class Query():
    """A query definition: the question text plus an optional alias and page list."""
    text: str
    alias: str = field(default=None)  #type: ignore
    pages: List[str] = field(default=None)  #type: ignore


@dataclass
class IDPManifest():
    """Manifest describing a document and how it should be processed.

    Every field is optional and defaults to ``None``.
    """
    s3_path: str = field(default=None)  #type: ignore
    document_pages: List[str] = field(default=None)  #type: ignore
    queries_config: List[Query] = field(default=None)  #type: ignore
    textract_features: List[str] = field(default=None)  #type: ignore
    classification: str = field(default=None)  #type: ignore
    meta_data: List[MetaData] = field(default=None)  #type: ignore

    def merge(self, manifest: 'IDPManifest'):
        '''Fill unset top-level values from the passed in manifest.

        A field is copied from ``manifest`` only when it is truthy there and
        falsy on ``self``; values already present on ``self`` always win.
        The manifest is updated in place and nothing is returned.

        TODO: implement proper merging with joining arrays for example'''
        # ``classification`` was previously missing from this list, so a
        # default classification was silently dropped while every other
        # top-level field was merged.
        for name in ("s3_path", "document_pages", "queries_config",
                     "textract_features", "classification", "meta_data"):
            incoming = getattr(manifest, name)
            if incoming and not getattr(self, name):
                setattr(self, name, incoming)
-------------------------------------------------------------------------------- /images/amzn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/images/amzn.png -------------------------------------------------------------------------------- /overlayer/Manifest.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | 3 | recursive-exclude * __pycache__ 4 | recursive-exclude * *.py[co] test_*.py 5 | 6 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif -------------------------------------------------------------------------------- /overlayer/README.md: -------------------------------------------------------------------------------- 1 | # Textract-Overlayer 2 | 3 | amazon-textract-overlayer provides functions to help overlay bounding boxes on documents. 4 | 5 | # Install 6 | 7 | ```bash 8 | > python -m pip install amazon-textract-overlayer 9 | ``` 10 | 11 | Make sure your environment is setup with AWS credentials through configuration files or environment variables or an attached role. (https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) 12 | 13 | # Samples 14 | 15 | Primary method provided is get_bounding_boxes which returns bounding boxes based on the Textract_Type passed in. 16 | Mostly taken from the ```amazon-textract``` command from the package ```amazon-textract-helper```. 17 | 18 | This will return the bounding boxes for WORD and CELL data types. 
19 | 20 | ```python 21 | from textractoverlayer.t_overlay import DocumentDimensions, get_bounding_boxes 22 | from textractcaller.t_call import Textract_Features, Textract_Types, call_textract 23 | 24 | doc = call_textract(input_document=input_document, features=features) 25 | # image is a PIL.Image.Image in this case 26 | document_dimension:DocumentDimensions = DocumentDimensions(doc_width=image.size[0], doc_height=image.size[1]) 27 | overlay=[Textract_Types.WORD, Textract_Types.CELL] 28 | 29 | bounding_box_list = get_bounding_boxes(textract_json=doc, document_dimensions=document_dimension, overlay_features=overlay) 30 | ``` 31 | 32 | The actual overlay drawing of bounding boxes for images is in the ```amazon-textract``` command from the package ```amazon-textract-helper``` and looks like this: 33 | 34 | ```python 35 | from PIL import Image, ImageDraw 36 | 37 | image = Image.open(input_document) 38 | rgb_im = image.convert('RGB') 39 | draw = ImageDraw.Draw(rgb_im) 40 | 41 | # check the impl in amazon-textract-helper for ways to associate different colors to types 42 | for bbox in bounding_box_list: 43 | draw.rectangle(xy=[bbox.xmin, bbox.ymin, bbox.xmax, bbox.ymax], outline=(128, 128, 0), width=2) 44 | 45 | rgb_im.show() 46 | ``` 47 | 48 | To draw bounding boxes within PDF documents, the following code can be used: 49 | 50 | ```python 51 | import fitz 52 | 53 | # for local stored files 54 | file_path = "<>" 55 | doc = fitz.open(file_path) 56 | # for files stored in S3 the streaming object can be used 57 | # doc = fitz.open(stream="<>", filetype="pdf") 58 | 59 | # draw boxes 60 | for p, page in enumerate(doc): 61 | p += 1 62 | for bbox in bounding_box_list: 63 | if bbox.page_number == p: 64 | page.draw_rect( 65 | [bbox.xmin, bbox.ymin, bbox.xmax, bbox.ymax], color=(0, 1, 0), width=2 66 | ) 67 | 68 | # save file locally 69 | doc.save("<>") 70 | 71 | ``` -------------------------------------------------------------------------------- /overlayer/setup.cfg:
def read(fname):
    """Return the text content of *fname*.

    Args:
        fname: Path of the file to read. A relative path is resolved against
            the directory that contains this ``setup.py``.

    Returns:
        The complete file content as a string.
    """
    target = os.path.join(os.path.dirname(__file__), fname)
    # Close the handle deterministically instead of leaking it until garbage
    # collection, and decode as UTF-8 regardless of the machine's locale.
    with open(target, encoding='utf-8') as stream:
        return stream.read()
def test_overlayer_pdf_dimensions(caplog):
    """The bundled sample PDF must be reported as 612 wide and 792 high."""
    caplog.set_level(logging.DEBUG)
    # Resolve the fixture relative to this test module, not the working directory.
    here = os.path.dirname(os.path.abspath(__file__))
    pdf_path = os.path.join(here, "data/Amazon-Textract-Pdf.pdf")
    size = it.get_width_height_from_file(pdf_path)
    assert size.doc_height == 792
    assert size.doc_width == 612
"""Helpers to read the pixel/point dimensions of an image or PDF.

Documents can live on the local file system or in Amazon S3 (``s3://bucket/key``).
All public functions return a ``DocumentDimensions`` built with ``doc_width``
and ``doc_height``.
"""
import io
import os
import sys
from textractoverlayer.t_overlay import DocumentDimensions
import boto3

# Conditionally add /opt to the PYTHON PATH
# (must happen before the PIL / pypdf imports below so they can be resolved from there)
if os.getenv('AWS_EXECUTION_ENV') is not None:
    sys.path.append('/opt')

from PIL import Image
from pypdf import PdfReader

pdf_suffixes = ['.pdf']
image_suffixes = ['.png', '.jpg', '.jpeg']
supported_suffixes = pdf_suffixes + image_suffixes

_S3_SCHEME = 's3://'


def _is_s3_uri(input_document: str) -> bool:
    """Return True when *input_document* starts with ``s3://`` (any letter case)."""
    return len(input_document) > 7 and input_document.lower().startswith(_S3_SCHEME)


def _split_s3_uri(input_document: str):
    """Split an S3 URI into ``(bucket, key)``.

    Raises:
        ValueError: if no ``/`` separates the bucket from the key.
    """
    # Slice the scheme off by length instead of str.replace(): the scheme check
    # is case-insensitive (replace('s3://', '') would miss 'S3://'), and
    # replace() would also delete any later 's3://' occurring inside the key.
    s3_bucket, s3_key = input_document[len(_S3_SCHEME):].split('/', 1)
    return s3_bucket, s3_key


def get_size_from_filestream(fs, ext) -> DocumentDimensions:
    """Read the dimensions of the document behind the binary stream *fs*.

    Args:
        fs: readable binary file-like object positioned at the document start.
        ext: file suffix including the dot (e.g. ``'.png'``); compared
            case-insensitively. Any suffix that is not an image suffix is
            treated as PDF.

    Returns:
        DocumentDimensions with the image size, or the size of the first PDF page.
    """
    if ext.lower() in image_suffixes:
        img = Image.open(fs)
        return DocumentDimensions(doc_width=img.width, doc_height=img.height)
    reader = PdfReader(fs)
    # Only the first page is inspected. Entries 2 and 3 of its mediabox are
    # used as width and height -- presumably the upper-right corner, which
    # assumes the box starts at (0, 0); TODO confirm for offset media boxes.
    media_box = reader.pages[0].mediabox
    return DocumentDimensions(doc_width=int(media_box[2]), doc_height=int(media_box[3]))


def get_size_from_s3(s3_bucket, s3_key) -> DocumentDimensions:
    """Download ``s3://s3_bucket/s3_key`` and return its dimensions.

    Raises:
        ValueError: if the key's suffix is not in ``supported_suffixes``.
    """
    _, ext = os.path.splitext(s3_key)
    if ext.lower() not in supported_suffixes:
        raise ValueError(f'{s3_key} not in {supported_suffixes}')
    s3 = boto3.client('s3')
    s3_object = s3.get_object(Bucket=s3_bucket, Key=s3_key)
    input_bytes = s3_object.get('Body').read()
    return get_size_from_filestream(io.BytesIO(input_bytes), ext)


def get_filename_from_document(input_document: str):
    """Return ``(file_name, suffix)`` for a local path or an ``s3://`` URI.

    ``file_name`` is the base name without directories and without suffix;
    ``suffix`` includes the leading dot.
    """
    if _is_s3_uri(input_document):
        _, s3_key = _split_s3_uri(input_document)
        return os.path.splitext(os.path.basename(s3_key))
    return os.path.splitext(os.path.basename(input_document))


def get_size_from_document(input_document: str) -> DocumentDimensions:
    """Return the dimensions of *input_document* (local path or ``s3://`` URI).

    Raises:
        ValueError: if the suffix is unsupported, or an S3 URI has no key part.
    """
    if _is_s3_uri(input_document):
        s3_bucket, s3_key = _split_s3_uri(input_document)
        return get_size_from_s3(s3_bucket=s3_bucket, s3_key=s3_key)
    # Local path: delegate to the file reader. The previous implementation
    # called get_size_from_document() again here, recursing until
    # RecursionError for every non-S3 input.
    return get_width_height_from_file(input_document)


def get_width_height_from_s3_object(s3_bucket, s3_key) -> DocumentDimensions:
    """Alias of :func:`get_size_from_s3` with the same arguments and result."""
    return get_size_from_s3(s3_bucket, s3_key)


def get_width_height_from_file(filepath) -> DocumentDimensions:
    """Open the local file *filepath* and return its dimensions.

    Raises:
        ValueError: if the file suffix is not in ``supported_suffixes``.
    """
    _, ext = os.path.splitext(filepath)
    if ext.lower() not in supported_suffixes:
        raise ValueError(f'{filepath} not in {supported_suffixes}')
    with open(filepath, 'rb') as input_fs:
        return get_size_from_filestream(input_fs, ext)


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--s3-bucket', required=True)
    parser.add_argument('--s3-key', required=True)
    args = parser.parse_args()
    s3_bucket = args.s3_bucket
    s3_key = args.s3_key

    print(get_width_height_from_s3_object(s3_bucket, s3_key))
"""Packaging script for ``amazon-textract-prettyprinter``.

Besides the regular setuptools entry point, two convenience invocations are
supported when passed as the last command-line argument:

* ``python setup.py publish-test`` -- rebuild and upload to the ``pypitest`` repository
* ``python setup.py publish``      -- rebuild and upload to the ``pypi`` repository
"""
import os
import sys
from setuptools import setup

# Absolute directory containing this setup.py, so the script behaves the same
# regardless of the directory it is launched from.
HERE = os.path.dirname(os.path.abspath(__file__))


def read(fname):
    """Return the text content of *fname*, resolved relative to this file."""
    # Context manager closes the handle deterministically; explicit UTF-8 keeps
    # the README readable on platforms whose default encoding is not UTF-8.
    with open(os.path.join(HERE, fname), encoding='utf-8') as fh:
        return fh.read()


requirements = ['boto3>=1,<2', 'botocore', 'amazon-textract-response-parser>=0.1,<0.2', 'tabulate>=0.9,<0.10']


def _publish(repository):
    """Clean old artifacts, rebuild sdist/wheel, check them and upload to *repository*.

    Terminates the interpreter afterwards so the regular ``setup()`` call below
    is not executed with an unknown command.
    """
    # Each os.system() call runs in its own subshell, so a `cd` issued through
    # it is forgotten before the next command starts. Change the working
    # directory of this process instead; the subshells inherit it.
    os.chdir(HERE)
    os.system('rm -rf dist/ build/ amazon_textract_prettyprinter.egg-info/')
    os.system('python setup.py sdist bdist_wheel')
    os.system('twine check dist/*')
    os.system(f'twine upload --repository {repository} dist/*')
    sys.exit()


if sys.argv[-1] == 'publish-test':
    _publish('pypitest')

if sys.argv[-1] == 'publish':
    _publish('pypi')

# NOTE(review): version here is 0.1.10 while this package's setup.cfg
# (bumpversion current_version) and textractprettyprinter/_version.py both say
# 0.1.9 -- confirm which one is intended and re-sync.
setup(name='amazon-textract-prettyprinter',
      packages=['textractprettyprinter'],
      version='0.1.10',
      description='Amazon Textract Helper tools for pretty printing',
      install_requires=requirements,
      long_description_content_type='text/markdown',
      long_description=read('README.md'),
      author='Amazon Rekognition Textract Demoes',
      author_email='rekognition-textract-demos@amazon.com',
      url='https://github.com/aws-samples/amazon-textract-textractor/tree/master/prettyprinter',
      keywords='amazon-textract-textractor amazon textract textractor helper pretty-print',
      license="Apache License Version 2.0",
      classifiers=[
          "Development Status :: 4 - Beta",
          "Topic :: Utilities",
          'License :: OSI Approved :: Apache Software License',
          'Programming Language :: Python :: 3.6',
          'Programming Language :: Python :: 3.7',
          'Programming Language :: Python :: 3.8',
          'Programming Language :: Python :: 3.9',
          'Programming Language :: Python :: 3.10',
          'Programming Language :: Python :: 3.11',
          'Programming Language :: Python :: 3.12',
      ],
      python_requires='>=3.6')
"""Packaging script for ``amazon-textract-textractor``.

Runtime dependencies come from ``requirements.txt``; every file in ``extras/``
becomes an optional extra named after the part of the file name before the
first dot (``extras/pandas.txt`` -> ``pip install amazon-textract-textractor[pandas]``).
"""
import os
import sys
import subprocess
import setuptools
from setuptools import find_packages, setup
from os import path

here = path.abspath(path.dirname(__file__))

with open(path.join(here, "README.md"), encoding="utf-8") as f:
    long_description = f.read()


def read_requirements(path):
    """Return the requirement specifiers listed in the file at *path*.

    Surrounding whitespace (including the trailing newline) is stripped, and
    blank lines and ``#`` comment lines are skipped, so that only valid
    specifiers reach ``install_requires`` / ``extras_require``.
    """
    with open(path, "r", encoding="utf-8") as req_file:
        stripped_lines = (line.strip() for line in req_file)
        return [line for line in stripped_lines if line and not line.startswith("#")]


class TestCommand(setuptools.Command):
    """``python setup.py test`` hook; currently a no-op (see :meth:`run`)."""

    description = 'run linters, tests and create a coverage report'
    user_options = []

    def initialize_options(self):
        """Required by the ``Command`` interface; there are no options to set."""
        pass

    def finalize_options(self):
        """Required by the ``Command`` interface; there are no options to validate."""
        pass

    def run(self):
        """Do nothing: the pytest invocation is intentionally disabled."""
        #self._run(['pytest', 'tests/'])
        return

    def _run(self, command):
        """Run *command* (an argv list) and exit with its code if it fails."""
        try:
            subprocess.check_call(command)
        except subprocess.CalledProcessError as error:
            print('Command failed with exit code', error.returncode)
            sys.exit(error.returncode)


setup(
    # include data files
    name="amazon-textract-textractor",
    version="1.9.2",
    license="Apache 2.0",
    description="A package to use AWS Textract services.",
    url="https://github.com/aws-samples/amazon-textract-textractor",
    long_description=long_description,
    long_description_content_type="text/markdown",
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
    ],
    keywords="amazon textract aws ocr document",
    packages=find_packages(exclude=["docs", "tests"], ),
    include_package_data=True,
    install_requires=read_requirements(os.path.join(here, "requirements.txt")),
    # One extra per file in extras/; the extra's name is the file name up to
    # its first dot.
    extras_require={
        extras_file.split(".")[0]: read_requirements(os.path.join(here, "extras", extras_file))
        for extras_file in os.listdir(os.path.join(here, "extras"))
    },
    cmdclass={'test': TestCommand},
    # NOTE(review): `test_command` is passed through unchanged; confirm that
    # setuptools actually recognises this keyword.
    test_command="test",
    entry_points={
        "console_scripts": [
            "textractor = textractor.cli.cli:textractor_cli",
        ],
    },
)
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/form.png -------------------------------------------------------------------------------- /tests/fixtures/form_1005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/form_1005.png -------------------------------------------------------------------------------- /tests/fixtures/in-table-title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/in-table-title.png -------------------------------------------------------------------------------- /tests/fixtures/invalid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/invalid.pdf -------------------------------------------------------------------------------- /tests/fixtures/invoice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/invoice.png -------------------------------------------------------------------------------- /tests/fixtures/matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/matrix.png -------------------------------------------------------------------------------- /tests/fixtures/multiline_cells.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/multiline_cells.jpeg -------------------------------------------------------------------------------- /tests/fixtures/patient_intake_form_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/patient_intake_form_sample.png -------------------------------------------------------------------------------- /tests/fixtures/paystub.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/paystub.jpg -------------------------------------------------------------------------------- /tests/fixtures/paystub_header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/paystub_header.png -------------------------------------------------------------------------------- /tests/fixtures/paystub_single_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/paystub_single_table.png -------------------------------------------------------------------------------- /tests/fixtures/paystub_tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/paystub_tables.png 
-------------------------------------------------------------------------------- /tests/fixtures/reading_order.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/reading_order.pdf -------------------------------------------------------------------------------- /tests/fixtures/receipt.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/receipt.jpg -------------------------------------------------------------------------------- /tests/fixtures/receipt_no_summary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/receipt_no_summary.png -------------------------------------------------------------------------------- /tests/fixtures/resume.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/resume.png -------------------------------------------------------------------------------- /tests/fixtures/sample-invoice.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/sample-invoice.pdf -------------------------------------------------------------------------------- /tests/fixtures/screenshot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/screenshot.png -------------------------------------------------------------------------------- /tests/fixtures/signature.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/signature.jpg -------------------------------------------------------------------------------- /tests/fixtures/single-page-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/single-page-1.png -------------------------------------------------------------------------------- /tests/fixtures/single-page-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/single-page-2.png -------------------------------------------------------------------------------- /tests/fixtures/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/test.png -------------------------------------------------------------------------------- /tests/fixtures/textractor-multipage-doc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/textractor-multipage-doc.pdf -------------------------------------------------------------------------------- /tests/fixtures/textractor-singlepage-doc.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/textractor-singlepage-doc.pdf -------------------------------------------------------------------------------- /tests/fixtures/titanic.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/titanic.webp -------------------------------------------------------------------------------- /tests/fixtures/tutorial.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/tutorial.pdf -------------------------------------------------------------------------------- /tests/fixtures/vbat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/vbat.png -------------------------------------------------------------------------------- /tests/fixtures/vbat2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/vbat2.png -------------------------------------------------------------------------------- /tests/invoice_sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/invoice_sample.pdf -------------------------------------------------------------------------------- 
/tests/test_analyze_expense.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import PIL 4 | import unittest 5 | from textractor import Textractor 6 | from textractor.entities.document import Document 7 | from textractor.data.constants import TextractFeatures 8 | from textractor.exceptions import InvalidProfileNameError, NoImageException, S3FilePathMissing 9 | 10 | from .utils import get_fixture_path 11 | 12 | class TestTextractorAnalyzeExpense(unittest.TestCase): 13 | def setUp(self): 14 | # insert credentials and filepaths here to run test 15 | self.profile_name = "default" 16 | self.current_directory = os.path.abspath(os.path.dirname(__file__)) 17 | self.image_path = os.path.join(self.current_directory, "fixtures/receipt.jpg") 18 | self.image = PIL.Image.open(self.image_path) 19 | 20 | if self.profile_name is None: 21 | raise InvalidProfileNameError( 22 | "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_textractor.py." 
23 | ) 24 | if os.environ.get("CALL_TEXTRACT"): 25 | self.extractor = Textractor( 26 | profile_name=self.profile_name, kms_key_id="" 27 | ) 28 | 29 | def test_analyze_expense_from_path(self): 30 | # Testing local single image input 31 | if os.environ.get("CALL_TEXTRACT"): 32 | document = self.extractor.analyze_expense(file_source=self.image_path) 33 | with open(get_fixture_path(), "w") as f: 34 | json.dump(document.response, f) 35 | else: 36 | document = Document.open(get_fixture_path()) 37 | 38 | self.assertIsInstance(document, Document) 39 | self.assertEqual(len(document.pages), 1) 40 | self.assertEqual(document.expense_documents[0].summary_fields.TOTAL[0].value.text, "$1810.46") 41 | self.assertEqual(len(document.expense_documents[0].summary_groups.VENDOR), 2) 42 | self.assertEqual(len(document.expense_documents[0].line_items_groups[0].to_pandas()), 4, 43 | "There are 4 line item in the receipts") 44 | 45 | def test_analyze_expense_from_image(self): 46 | # Testing local single image input 47 | if os.environ.get("CALL_TEXTRACT"): 48 | document = self.extractor.analyze_expense(file_source=self.image) 49 | with open(get_fixture_path(), "w") as f: 50 | json.dump(document.response, f) 51 | else: 52 | document = Document.open(get_fixture_path()) 53 | 54 | self.assertIsInstance(document, Document) 55 | self.assertEqual(len(document.pages), 1) 56 | self.assertEqual(document.expense_documents[0].summary_fields.TOTAL[0].value.text, "$1810.46") 57 | self.assertEqual(len(document.expense_documents[0].summary_groups.VENDOR), 2) 58 | self.assertEqual(len(document.expense_documents[0].line_items_groups[0].to_pandas()), 4, 59 | "There are 4 line item in the receipts") 60 | 61 | 62 | class TestTextractorAnalyzeExpenseNoSummary(unittest.TestCase): 63 | def setUp(self): 64 | # insert credentials and filepaths here to run test 65 | self.profile_name = "default" 66 | self.current_directory = os.path.abspath(os.path.dirname(__file__)) 67 | self.image_path = 
os.path.join(self.current_directory, "fixtures/receipt_no_summary.png") 68 | 69 | if self.profile_name is None: 70 | raise InvalidProfileNameError( 71 | "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_textractor.py." 72 | ) 73 | if os.environ.get("CALL_TEXTRACT"): 74 | self.extractor = Textractor( 75 | profile_name=self.profile_name, kms_key_id="" 76 | ) 77 | 78 | def test_analyze_expense_no_summary_fields(self): 79 | """Correctly load expense line items where no summary fields were recognized 80 | 81 | Per: https://github.com/aws-samples/amazon-textract-textractor/issues/370 82 | """ 83 | if os.environ.get("CALL_TEXTRACT"): 84 | document = self.extractor.analyze_expense(file_source=self.image_path) 85 | with open(get_fixture_path(), "w") as f: 86 | json.dump(document.response, f) 87 | else: 88 | document = Document.open(get_fixture_path()) 89 | 90 | self.assertIsInstance(document, Document) 91 | self.assertEqual(len(document.expense_documents), 1) 92 | self.assertGreater(len(document.expense_documents[0].line_items_groups), 0) 93 | 94 | if __name__ == "__main__": 95 | test = TestTextractorAnalyzeExpense() 96 | test.setUp() 97 | test.test_analyze_expense_from_path() -------------------------------------------------------------------------------- /tests/test_analyze_id.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import PIL 4 | import unittest 5 | from tests.utils import get_fixture_path 6 | from textractor import Textractor 7 | from textractor.entities.document import Document 8 | from textractor.data.constants import TextractFeatures, AnalyzeIDFields 9 | from textractor.exceptions import InvalidProfileNameError, NoImageException, S3FilePathMissing 10 | 11 | from .utils import save_document_to_fixture_path 12 | 13 | class TestTextractorAnalyzeID(unittest.TestCase): 14 | def setUp(self): 15 | # insert credentials and filepaths here to run test 16 | 
self.profile_name = "default" 17 | self.current_directory = os.path.abspath(os.path.dirname(__file__)) 18 | self.image_path = os.path.join(self.current_directory, "fixtures/fake_id.png") 19 | self.image = PIL.Image.open(os.path.join(self.current_directory, "fixtures/fake_id.png")) 20 | 21 | if self.profile_name is None: 22 | raise InvalidProfileNameError( 23 | "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_textractor.py." 24 | ) 25 | if os.environ.get("CALL_TEXTRACT"): 26 | self.extractor = Textractor( 27 | profile_name=self.profile_name, kms_key_id="" 28 | ) 29 | 30 | def test_analyze_id_from_path(self): 31 | # Testing local single image input 32 | if os.environ.get("CALL_TEXTRACT"): 33 | document = self.extractor.analyze_id( 34 | file_source=self.image_path, 35 | ) 36 | with open(get_fixture_path(), "w") as f: 37 | json.dump(document.response, f) 38 | else: 39 | document = Document.open(get_fixture_path()) 40 | 41 | self.assertIsInstance(document, Document) 42 | self.assertEqual(len(document.identity_documents), 1) 43 | self.assertEqual(len(document.identity_documents[0].fields), 21) 44 | self.assertEqual(document.identity_documents[0].get(AnalyzeIDFields.FIRST_NAME), "GARCIA") 45 | self.assertEqual(document.identity_documents[0][AnalyzeIDFields.FIRST_NAME], "GARCIA") 46 | 47 | def test_analyze_id_from_image(self): 48 | # Testing local single image input 49 | if os.environ.get("CALL_TEXTRACT"): 50 | document = self.extractor.analyze_id( 51 | file_source=self.image, 52 | ) 53 | with open(get_fixture_path(), "w") as f: 54 | json.dump(document.response, f) 55 | else: 56 | document = Document.open(get_fixture_path()) 57 | 58 | self.assertIsInstance(document, Document) 59 | self.assertEqual(len(document.identity_documents), 1) 60 | self.assertEqual(len(document.identity_documents[0].fields), 21) 61 | self.assertEqual(document.identity_documents[0].get("FIRST_NAME"), "GARCIA") 62 | 
self.assertEqual(document.identity_documents[0]["FIRST_NAME"], "GARCIA") 63 | 64 | if __name__ == "__main__": 65 | test = TestTextractorAnalyzeID() 66 | test.setUp() 67 | test.test_analyze_id_from_path() -------------------------------------------------------------------------------- /tests/test_bbox.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy 3 | from textractor.entities.bbox import BoundingBox 4 | 5 | class TestBoundingBox(unittest.TestCase): 6 | def test_bbox(self): 7 | dims = {"Width": 3, "Height": 4, "Left": 1, "Top": 2} 8 | bbox = BoundingBox.from_normalized_dict(dims, spatial_object=None) 9 | 10 | self.assertTrue(isinstance(bbox.as_denormalized_numpy(), numpy.ndarray)) 11 | self.assertEqual(bbox.__repr__(), "x: 1, y: 2, width: 3, height: 4") 12 | -------------------------------------------------------------------------------- /tests/test_layout.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import unittest 4 | import PIL 5 | from tests.utils import get_fixture_path 6 | from textractor import Textractor 7 | from textractor.entities.document import Document 8 | from textractor.entities.word import Word 9 | from textractor.entities.line import Line 10 | from textractor.entities.page import Page 11 | from textractor.entities.table import Table 12 | from textractor.entities.value import Value 13 | from textractor.data.constants import TableFormat 14 | from textractor.entities.key_value import KeyValue 15 | from textractor.visualizers.entitylist import EntityList 16 | from textractor.exceptions import InvalidProfileNameError 17 | from textractor.entities.selection_element import SelectionElement 18 | from textractor.data.constants import TextTypes, SimilarityMetric, TextractFeatures, Direction, DirectionalFinderType 19 | 20 | from .utils import save_document_to_fixture_path 21 | 22 | class 
class TestLine(unittest.TestCase):
    """Tests for :class:`Line` using a line assembled from three hand-built words."""

    def setUp(self):
        """Build three words (PRINTED, HANDWRITING, PRINTED) and the line that holds them."""
        self.word_bb_1 = dict(
            Width=0.10809839516878128,
            Height=0.01363567914813757,
            Left=0.036161474883556366,
            Top=0.03439946100115776,
        )
        self.word_bb_2 = dict(
            Width=0.18033172190189362,
            Height=0.01742148958146572,
            Left=0.22032427787780762,
            Top=0.03645794093608856,
        )
        self.word_bb_3 = dict(
            Width=0.03744738921523094,
            Height=0.016524378210306168,
            Left=0.4087739884853363,
            Top=0.0368686243891716,
        )
        self.line_bb = dict(
            Width=0.3,
            Height=0.01742148958146572,
            Left=0.036161474883556366,
            Top=0.03439946100115776,
        )

        def bbox_of(dims):
            # Every box in this test is created without a parent spatial object.
            return BoundingBox.from_normalized_dict(dims, spatial_object=None)

        self.word_1 = Word(
            entity_id="word-id-1", bbox=bbox_of(self.word_bb_1), text="TEST", text_type=TextTypes.PRINTED
        )
        self.word_2 = Word(
            entity_id="word-id-2", bbox=bbox_of(self.word_bb_2), text="WORDS", text_type=TextTypes.HANDWRITING
        )
        self.word_3 = Word(
            entity_id="word-id-3", bbox=bbox_of(self.word_bb_3), text="ADDED", text_type=TextTypes.PRINTED
        )

        line_words = [self.word_1, self.word_2, self.word_3]
        self.line = Line("line-id", bbox_of(self.line_bb), line_words)

    def test_get_words_by_type(self):
        """Words of the line can be filtered by their text type."""
        printed = self.line.get_words_by_type(TextTypes.PRINTED)
        handwritten = self.line.get_words_by_type(TextTypes.HANDWRITING)
        self.assertEqual(printed, EntityList([self.word_1, self.word_3]))
        self.assertEqual(handwritten, EntityList([self.word_2]))

    def test_get_text(self):
        """The text attribute is the space-joined text of the words."""
        expected = "TEST WORDS ADDED"
        self.assertEqual(self.line.text, expected)

    def test_set_page(self):
        """The page attribute returns the value it was set to."""
        page_number = 2
        self.line.page = page_number
        self.assertEqual(self.line.page, page_number)

    def test_set_page_id(self):
        """The page_id attribute returns the value it was set to."""
        page_identifier = "page-id"
        self.line.page_id = page_identifier
        self.assertEqual(self.line.page_id, page_identifier)

    def test_repr(self):
        """The repr of a line is the space-joined text of its words."""
        self.assertEqual(repr(self.line), "TEST WORDS ADDED")
class TestParseNoFail(unittest.TestCase):
    """Fuzzing tests for the response parser; they are disabled in the CI test suite.

    Each saved JSON response has a random subset of its blocks removed and is then
    parsed, to ensure that the parser does not raise any exception. Results may be
    flaky due to the randomness.
    """

    def setUp(self):
        # insert credentials and filepaths here to run test
        self.profile_name = "default"
        self.current_directory = os.path.abspath(os.path.dirname(__file__))
        self.saved_api_responses_directory = os.path.join(self.current_directory, "fixtures", "saved_api_responses")
        # Probability with which each non-PAGE block is dropped from a response.
        self.deletion_rate = 0.5

    def test_parse_no_fail(self):
        """Parsing a response with randomly removed blocks must not raise."""
        for asset in os.listdir(self.saved_api_responses_directory):
            with open(os.path.join(self.saved_api_responses_directory, asset)) as f:
                response = json.load(f)

            # Only responses that carry a "Blocks" list are fuzzed.
            if "Blocks" not in response:
                continue

            # PAGE blocks are always kept; every other block survives only when
            # the random draw exceeds the deletion rate.
            response["Blocks"] = [
                block
                for block in response["Blocks"]
                if block["BlockType"] == "PAGE" or random.random() > self.deletion_rate
            ]

            # subTest names the failing asset and lets the remaining assets still run.
            with self.subTest(asset=asset):
                document = Document.open(response)
                document.get_text_and_words()
class QueriesTests(unittest.TestCase):
    """Tests for passing queries to ``Textractor.analyze_document``.

    When the ``CALL_TEXTRACT`` environment variable is set the tests call Textract;
    otherwise they parse the saved response returned by ``get_fixture_path()``.
    """

    # Insert credentials here to run the tests against Textract.
    profile_name = "default"

    def _extractor(self):
        """Return a Textractor client configured with the test profile."""
        return Textractor(profile_name=self.profile_name, kms_key_id="")

    @staticmethod
    def _single_page_image():
        """Return the path of ``fixtures/single-page-1.png`` next to this test file."""
        current_directory = os.path.abspath(os.path.dirname(__file__))
        return os.path.join(current_directory, "fixtures/single-page-1.png")

    def test_queries_as_strings(self):
        """Two answerable queries given as plain strings each get the expected answer."""
        if os.environ.get("CALL_TEXTRACT"):
            document = self._extractor().analyze_document(
                file_source=self._single_page_image(),
                features=[TextractFeatures.QUERIES],
                queries=[
                    "What is the name of the package?",
                    "What is the title of the document?",
                ],
            )
        else:
            # get_fixture_path() derives the file name from its direct caller,
            # so it has to be called from the test method itself.
            document = Document.open(get_fixture_path())

        self.assertEqual(len(document.queries), 2)
        self.assertEqual(document.queries[0].result.answer, "Textractor")
        self.assertEqual(document.queries[1].result.answer, "Textractor Test Document")

    def test_bad_queries_as_strings(self):
        """Queries that have no answer in the document produce a ``None`` result."""
        if os.environ.get("CALL_TEXTRACT"):
            document = self._extractor().analyze_document(
                file_source=self._single_page_image(),
                features=[TextractFeatures.QUERIES],
                queries=[
                    "Lorem ipsum?",
                    "The quick brown fox jumps over the lazy dog?",
                ],
            )
        else:
            # Must be called here, not in a helper (see test_queries_as_strings).
            document = Document.open(get_fixture_path())

        self.assertEqual(len(document.queries), 2)
        self.assertIsNone(document.queries[0].result)
        self.assertIsNone(document.queries[1].result)

    @unittest.skipUnless(os.environ.get("CALL_TEXTRACT"), "Asynchronous requests can't be processed without calling Textract")
    def test_query_feature_without_queries(self):
        """Passing queries without ``TextractFeatures.QUERIES`` in features raises ``InputError``."""
        # Built outside the assertRaises block so that only the analyze call is under test.
        extractor = self._extractor()
        with self.assertRaises(InputError):
            extractor.analyze_document(
                file_source=self._single_page_image(),
                features=[TextractFeatures.TABLES],
                queries=[
                    "Lorem ipsum?",
                    "The quick brown fox jumps over the lazy dog?",
                ],
            )
bbox=BoundingBox.from_normalized_dict(self.checkbox_bb, spatial_object=None), 45 | status=SelectionStatus.SELECTED, 46 | confidence=100, 47 | ) 48 | self.assertEqual(checkbox.__repr__(), "[X]") 49 | 50 | 51 | def test_set_page(self): 52 | """Test case setter for the page attribute""" 53 | checkbox = SelectionElement( 54 | entity_id="checkbox-id", 55 | bbox=BoundingBox.from_normalized_dict(self.checkbox_bb, spatial_object=None), 56 | status=SelectionStatus.SELECTED, 57 | confidence=100, 58 | ) 59 | checkbox.page = 2 60 | self.assertEqual(checkbox.page, 2) 61 | 62 | 63 | def test_set_page_id(self): 64 | """Test case setter for the page_id attribute""" 65 | checkbox = SelectionElement( 66 | entity_id="checkbox-id", 67 | bbox=BoundingBox.from_normalized_dict(self.checkbox_bb, spatial_object=None), 68 | status=SelectionStatus.SELECTED, 69 | confidence=100, 70 | ) 71 | checkbox.page_id = "page-id" 72 | self.assertEqual(checkbox.page_id, "page-id") 73 | -------------------------------------------------------------------------------- /tests/test_signature.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from tests.utils import get_fixture_path 4 | from textractor import Textractor 5 | from textractor.entities.document import Document 6 | from textractor.exceptions import InvalidProfileNameError 7 | from textractor.data.constants import TextractFeatures 8 | 9 | from .utils import save_document_to_fixture_path 10 | 11 | class TestSignature(unittest.TestCase): 12 | def test_signature(self): 13 | # Insert credentials here to run test 14 | profile_name = "default" 15 | current_directory = os.path.abspath(os.path.dirname(__file__)) 16 | 17 | if profile_name is None: 18 | raise InvalidProfileNameError( 19 | "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_table.py." 
class TestValue(unittest.TestCase):
    """Tests for :class:`Value` populated with three hand-built words."""

    def setUp(self):
        """Create three words and a Value entity that holds them, then set its other attributes."""
        self.word_bb_1 = {
            "Width": 0.10809839516878128,
            "Height": 0.01363567914813757,
            "Left": 0.036161474883556366,
            "Top": 0.03439946100115776,
        }
        self.word_bb_2 = {
            "Width": 0.18033172190189362,
            "Height": 0.01742148958146572,
            "Left": 0.22032427787780762,
            "Top": 0.03645794093608856,
        }
        self.word_bb_3 = {
            "Width": 0.03744738921523094,
            "Height": 0.016524378210306168,
            "Left": 0.4087739884853363,
            "Top": 0.0368686243891716,
        }
        self.value_bb = {
            "Width": 0.02524515800178051,
            "Height": 0.01746263913810253,
            "Left": 0.18779051303863525,
            "Top": 0.4229613244533539,
        }

        # (entity id, bounding-box dict, text, text type) for each word, in creation order.
        word_specs = (
            ("word-id-1", self.word_bb_1, "TEST", TextTypes.PRINTED),
            ("word-id-2", self.word_bb_2, "WORDS", TextTypes.HANDWRITING),
            ("word-id-3", self.word_bb_3, "ADDED", TextTypes.PRINTED),
        )
        self.word_1, self.word_2, self.word_3 = [
            Word(
                entity_id=word_id,
                bbox=BoundingBox.from_normalized_dict(dims, spatial_object=None),
                text=text,
                text_type=text_type,
            )
            for word_id, dims, text, text_type in word_specs
        ]

        value_box = BoundingBox.from_normalized_dict(self.value_bb, spatial_object=None)
        self.value = Value(entity_id="value-id", bbox=value_box)
        self.word_objs = [self.word_1, self.word_2, self.word_3]
        self.value.words = self.word_objs
        self.value.key_id = "key-id"
        self.value.contains_checkbox = False
        self.value.page = 2
        self.value.page_id = "page-id"

    def test_words(self):
        """The words assigned to the Value are returned as an EntityList."""
        expected = EntityList(self.word_objs)
        self.assertEqual(self.value.words, expected)

    def test_key_id(self):
        """The key_id attribute returns the value assigned in setUp."""
        self.assertEqual(self.value.key_id, "key-id")

    def test_contains_checkbox(self):
        """The contains_checkbox attribute returns the False assigned in setUp."""
        self.assertFalse(self.value.contains_checkbox)

    def test_set_page(self):
        """The page attribute returns the value assigned in setUp."""
        self.assertEqual(self.value.page, 2)

    def test_set_page_id(self):
        """The page_id attribute returns the value assigned in setUp."""
        self.assertEqual(self.value.page_id, "page-id")

    def test_get_words_by_type(self):
        """Words of the Value can be filtered by their text type."""
        printed = self.value.get_words_by_type(text_type=TextTypes.PRINTED)
        handwritten = self.value.get_words_by_type(text_type=TextTypes.HANDWRITING)
        self.assertEqual(printed, EntityList([self.word_1, self.word_3]))
        self.assertEqual(handwritten, EntityList([self.word_2]))

    def test_repr(self):
        """The repr of the Value is the space-joined text of its words."""
        self.assertEqual(repr(self.value), "TEST WORDS ADDED")
class TestWord(unittest.TestCase):
    """Tests for the attributes and repr of :class:`Word`."""

    def setUp(self):
        """Store the bounding-box dict shared by every test."""
        self.bounding_box = dict(
            Width=0.10809839516878128,
            Height=0.01363567914813757,
            Left=0.036161474883556366,
            Top=0.03439946100115776,
        )

    def _new_word(self):
        """Return a fresh Word with id ``word-id`` and the shared bounding box."""
        box = BoundingBox.from_normalized_dict(self.bounding_box, spatial_object=None)
        return Word(entity_id="word-id", bbox=box)

    def test_set_text(self):
        """The text attribute returns the value it was set to."""
        word = self._new_word()
        word.text = "word-test"
        self.assertEqual(word.text, "word-test")

    def test_set_text_type(self):
        """The text_type attribute returns the value it was set to."""
        word = self._new_word()
        word.text_type = TextTypes.HANDWRITING
        self.assertEqual(word.text_type, TextTypes.HANDWRITING)

    def test_set_page(self):
        """The page attribute returns the value it was set to."""
        word = self._new_word()
        word.page = 2
        self.assertEqual(word.page, 2)

    def test_set_page_id(self):
        """The page_id attribute returns the value it was set to."""
        word = self._new_word()
        word.page_id = "page-id"
        self.assertEqual(word.page_id, "page-id")

    def test_repr(self):
        """The repr of a word is its text."""
        word = self._new_word()
        word.text = "word-test"
        self.assertEqual(repr(word), "word-test")
def _saved_response_path(name):
    """Return the absolute path of the saved response ``<name>.json`` next to this module."""
    return os.path.join(
        os.path.abspath(os.path.dirname(__file__)),
        f"fixtures/saved_api_responses/{name}.json"
    )


def get_fixture_path():
    """Uses reflection to get correct saved response file

    :return: Path to the saved response file for the calling function
    :rtype: str
    """
    # f_back is the frame of whoever called this function; the name of its code
    # object selects the file. The lookup must stay in this function: moving it
    # into a helper would make f_back point at the wrong frame.
    caller_name = inspect.currentframe().f_back.f_code.co_name
    return _saved_response_path(caller_name)


def save_document_to_fixture_path(document):
    """Write ``document.response`` as JSON to the saved response file of the calling function.

    The destination is the same path ``get_fixture_path()`` returns when called
    from the same function. The target directory is created when missing.

    :param document: Object whose ``response`` attribute is JSON-serializable
    """
    # Same frame lookup as in get_fixture_path(); see the comment there.
    caller_name = inspect.currentframe().f_back.f_code.co_name
    path = _saved_response_path(caller_name)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as f:
        json.dump(document.response, f)
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/textractor/cli/__init__.py -------------------------------------------------------------------------------- /textractor/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/textractor/data/__init__.py -------------------------------------------------------------------------------- /textractor/data/html_linearization_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | 4 | from textractor.data.text_linearization_config import TextLinearizationConfig 5 | 6 | @dataclass 7 | class HTMLLinearizationConfig(TextLinearizationConfig): 8 | """ 9 | This :class:`HTMLLinearizationConfig` is a convenience configuration for converting a Document or DocumentEntity to HTML. 10 | For a description of the parameters see :class:`TextLinearizationConfig`. 11 | """ 12 | 13 | title_prefix: str = "

" 14 | 15 | title_suffix: str = "

" 16 | 17 | header_prefix: str = "

" 18 | 19 | header_suffix: str = "

" 20 | 21 | section_header_prefix: str = "

" 22 | 23 | section_header_suffix: str = "

" 24 | 25 | text_prefix: str = "

" 26 | 27 | text_suffix: str = "

" 28 | 29 | entity_layout_prefix: str = "

" 30 | 31 | entity_layout_suffix: str = "

" 32 | 33 | table_prefix: str = "" 34 | 35 | table_suffix: str = "
" 36 | 37 | table_row_prefix: str = "" 38 | 39 | table_row_suffix: str = "" 40 | 41 | table_cell_header_prefix: str = "" 42 | 43 | table_cell_header_suffix: str = "" 44 | 45 | table_cell_prefix: str = "" 46 | 47 | table_cell_suffix: str = "" 48 | 49 | table_column_separator: str = "" 50 | 51 | table_linearization_format: str = "html" 52 | 53 | table_add_title_as_caption: bool = True 54 | 55 | table_add_footer_as_paragraph: bool = True 56 | 57 | table_column_separator: str = "" 58 | 59 | list_layout_prefix: str = "
" 60 | 61 | list_layout_suffix: str = "
" 62 | 63 | table_layout_prefix: str = "
" 64 | 65 | table_layout_suffix: str = "
" 66 | 67 | key_value_layout_prefix: str = "
" 68 | 69 | key_value_layout_suffix: str = "
" 70 | 71 | figure_layout_prefix: str = "
" 72 | 73 | figure_layout_suffix: str = "
" 74 | 75 | footer_layout_prefix: str = "
" 76 | 77 | footer_layout_suffix: str = "
" 78 | 79 | page_num_prefix: str = "
" 80 | 81 | page_num_suffix: str = "
" 82 | 83 | add_ids_to_html_tags: bool = False #: Adds Textract block id to the HTML markup. Only supported for HTML. 84 | 85 | add_short_ids_to_html_tags: bool = False #: Adds the truncated (first 8 characters) Textract block id to the HTML markup. Only supported for HTML 86 | -------------------------------------------------------------------------------- /textractor/data/markdown_linearization_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | 4 | from textractor.data.text_linearization_config import TextLinearizationConfig 5 | 6 | @dataclass 7 | class MarkdownLinearizationConfig(TextLinearizationConfig): 8 | """ 9 | This :class:`MarkdownLinearizationConfig` is a convenience configuration for converting a Document or DocumentEntity to Markdown. 10 | For a description of the parameters see :class:`TextLinearizationConfig`. 11 | """ 12 | 13 | title_prefix: str = "# " 14 | 15 | table_linearization_format: str = "markdown" 16 | 17 | section_header_prefix: str = "## " 18 | 19 | table_remove_column_headers: bool = True -------------------------------------------------------------------------------- /textractor/entities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/textractor/entities/__init__.py -------------------------------------------------------------------------------- /textractor/entities/identity_document.py: -------------------------------------------------------------------------------- 1 | """The IdentityDocument class is the object representation of an AnalyzeID response. It is similar to a dictionary. 
class IdentityDocument(SpatialObject):
    """
    Represents the description of a single ID document.
    """

    def __init__(self, fields=None):
        """
        Creates a new identity document from its fields.

        :param fields: Either a list of :class:`IdentityField` objects or a dictionary whose values are
                       dictionaries with ``key``, ``value`` and ``confidence`` entries. Empty or ``None``
                       creates a document without fields.
        :type fields: Union[List[IdentityField], Dict[str, dict]], optional
        """
        super().__init__(width=0, height=0)
        self._fields = IdentityDocument._fields_to_dict(fields)

    @classmethod
    def _fields_to_dict(cls, fields: Union[List[IdentityField], Dict[str, dict]]) -> Dict[str, IdentityField]:
        """
        Normalizes the accepted input shapes to a dictionary of fields indexed by field key.

        :param fields: List of IdentityField objects or dictionary of field dictionaries.
        :return: Dictionary mapping each field key to its IdentityField.
        :rtype: Dict[str, IdentityField]
        :raises InputError: If ``fields`` is neither of the accepted shapes.
        """
        if not fields:
            return {}
        # Check every element, not only the first one, so that a mixed list is
        # rejected with InputError instead of failing later on a missing attribute.
        if isinstance(fields, list) and all(isinstance(id_field, IdentityField) for id_field in fields):
            return {id_field.key: id_field for id_field in fields}
        if isinstance(fields, dict):
            # The outer keys are ignored; each value carries its own "key" entry.
            return {
                id_field["key"]: IdentityField(
                    id_field["key"],
                    id_field["value"],
                    id_field["confidence"],
                )
                for id_field in fields.values()
            }
        raise InputError(
            f"fields needs to be a list of IdentityFields or a dictionary of dictionaries, not {type(fields)}"
        )

    @staticmethod
    def _field_name(key: Union[str, AnalyzeIDFields]) -> str:
        """Returns ``key`` itself when it is a string, otherwise its ``value`` attribute."""
        return key if isinstance(key, str) else key.value

    @property
    def fields(self) -> Dict[str, IdentityField]:
        """
        :return: Dictionary mapping each field key to its IdentityField.
        :rtype: Dict[str, IdentityField]
        """
        return self._fields

    @fields.setter
    def fields(self, fields):
        """
        Replaces the fields of the document. The value is stored as given, without conversion.

        :param fields: Dictionary mapping each field key to its IdentityField.
        """
        self._fields = fields

    def keys(self) -> List[str]:
        """
        :return: Keys of all fields of the document.
        :rtype: List[str]
        """
        return list(self._fields)

    def values(self) -> List[str]:
        """
        :return: Values of all fields of the document.
        :rtype: List[str]
        """
        return [field.value for field in self._fields.values()]

    def __getitem__(self, key: Union[str, AnalyzeIDFields]) -> str:
        """
        :param key: Field key, as a string or an AnalyzeIDFields member.
        :return: Value of the field stored under ``key``.
        :raises KeyError: If the document has no field with that key.
        """
        return self._fields[self._field_name(key)].value

    def get(self, key: Union[str, AnalyzeIDFields]) -> Union[str, None]:
        """
        :param key: Field key, as a string or an AnalyzeIDFields member.
        :return: Value of the field stored under ``key``, or None if there is no such field.
        """
        result = self._fields.get(self._field_name(key))
        if result is None:
            return None
        return result.value

    def __repr__(self):
        """
        :return: One ``key: field`` line per field, joined with the platform line separator.
        :rtype: str
        """
        return os.linesep.join([f"{k}: {v}" for k, v in self.fields.items()])
class Line(DocumentEntity):
    """
    To create a new :class:`Line` object we need the following:

    :param entity_id: Unique identifier of the Line entity.
    :type entity_id: str
    :param bbox: Bounding box of the line entity.
    :type bbox: BoundingBox
    :param words: List of the Word entities present in the line
    :type words: list, optional
    :param confidence: confidence with which the entity was detected (divided by 100 when stored).
    :type confidence: float, optional
    """

    def __init__(
        self,
        entity_id: str,
        bbox: BoundingBox,
        words: List[Word] = None,
        confidence: float = 0,
    ):
        super().__init__(entity_id, bbox)
        if words is not None and len(words) > 0:
            # Children are kept ordered by their bounding box (x first, then y).
            self._children: List[Word] = sorted(words, key=lambda x: (x.bbox.x, x.bbox.y))
        else:
            self._children = []

        self._confidence = confidence / 100
        self._page = None
        self._page_id = None

    @property
    def text(self):
        """
        :return: Returns the text transcription of the :class:`Line` entity,
                 i.e. the text of its words joined with single spaces.
        :rtype: str
        """
        return " ".join([word.text for word in self.words])

    @property
    def words(self):
        """
        :return: Returns the line's children
        :rtype: List[Word]
        """
        return self._children

    def get_text_and_words(self, config: TextLinearizationConfig = TextLinearizationConfig()):
        """
        Used for linearization, returns the escaped text of the line and its words.

        As a side effect every word is tagged with this line's id and bounding box;
        when the line has no bounding box one is derived from its words first.

        :param config: Text linearization configuration
        :type config: TextLinearizationConfig
        :return: Tuple of text and word list
        :rtype: Tuple[str, List[Word]]
        """
        if not self.bbox:
            self.bbox = BoundingBox.enclosing_bbox(self.words)
        for w in self.words:
            w.line_id = self.id
            w.line_bbox = self.bbox
        return escape_text(self.text, config), self.words

    @property
    def page(self):
        """
        :return: Returns the page number of the page the :class:`Line` entity is present in.
        :rtype: int
        """
        return self._page

    @page.setter
    def page(self, page_num: int):
        """
        Sets the page number attribute of the Line entity.

        :param page_num: Page number where the Line entity exists.
        :type page_num: int
        """
        self._page = page_num

    @property
    def page_id(self) -> str:
        """
        :return: Returns the Page ID attribute of the page which the entity belongs to.
        :rtype: str
        """
        return self._page_id

    @page_id.setter
    def page_id(self, page_id: str):
        """
        Sets the Page ID of the :class:`Line` entity.

        :param page_id: Page ID of the page the entity belongs to.
        :type page_id: str
        """
        self._page_id = page_id

    def get_words_by_type(self, text_type: TextTypes = TextTypes.PRINTED) -> List[Word]:
        """
        :param text_type: TextTypes.PRINTED or TextTypes.HANDWRITING
        :type text_type: TextTypes
        :return: Returns EntityList of Word entities that match the input text type.
                 The list is empty when the line has no words or none match.
        :rtype: EntityList[Word]
        :raises InputError: If text_type is not a TextTypes member.
        """
        if not isinstance(text_type, TextTypes):
            raise InputError(
                "text_type parameter should be of TextTypes type. Find input choices from textractor.data.constants"
            )

        # Always hand back an EntityList, including for an empty line, so the
        # return type no longer depends on whether the line has children.
        # NOTE(review): assumes EntityList is a list subclass, so callers that
        # received a plain [] before keep working - confirm in entitylist.py.
        return EntityList([word for word in self.words if word.text_type == text_type])

    def __repr__(self):
        """
        :return: String representation of the Line entity.
        :rtype: str
        """
        return " ".join([word.text for word in self.words])
class PageLayout:
    """
    Object representation of the layout components detected in the page.

    Every argument is optional; a component that is omitted (or passed as None)
    is stored as a new, empty :class:`EntityList`.

    :param titles: Title layouts
    :type titles: EntityList[Layout], optional
    :param headers: Header layouts
    :type headers: EntityList[Layout], optional
    :param footers: Footer layouts
    :type footers: EntityList[Layout], optional
    :param section_headers: Section header layouts
    :type section_headers: EntityList[Layout], optional
    :param page_numbers: Page number layouts
    :type page_numbers: EntityList[Layout], optional
    :param lists: List layouts
    :type lists: EntityList[Layout], optional
    :param figures: Figure layouts
    :type figures: EntityList[Layout], optional
    :param tables: Table layouts
    :type tables: EntityList[Layout], optional
    :param key_values: KeyValue layouts
    :type key_values: EntityList[Layout], optional
    """

    def __init__(
        self,
        titles: EntityList[Layout] = None,
        headers: EntityList[Layout] = None,
        footers: EntityList[Layout] = None,
        section_headers: EntityList[Layout] = None,
        page_numbers: EntityList[Layout] = None,
        lists: EntityList[Layout] = None,
        figures: EntityList[Layout] = None,
        tables: EntityList[Layout] = None,
        key_values: EntityList[Layout] = None,
    ):
        # Default values are evaluated once, at function definition time. Using
        # EntityList([]) as the default made every PageLayout built without
        # arguments share the same list objects, so the empty lists are created
        # per instance here instead.
        self._titles = EntityList([]) if titles is None else titles
        self._headers = EntityList([]) if headers is None else headers
        self._footers = EntityList([]) if footers is None else footers
        self._section_headers = (
            EntityList([]) if section_headers is None else section_headers
        )
        self._page_numbers = EntityList([]) if page_numbers is None else page_numbers
        self._lists = EntityList([]) if lists is None else lists
        self._figures = EntityList([]) if figures is None else figures
        self._tables = EntityList([]) if tables is None else tables
        self._key_values = EntityList([]) if key_values is None else key_values

    @property
    def titles(self) -> EntityList[Layout]:
        """Titles detected in the Page

        :return: EntityList of titles detected in the page
        :rtype: EntityList[Layout]
        """
        return self._titles

    @property
    def headers(self) -> EntityList[Layout]:
        """Headers detected in the Page

        :return: EntityList of headers detected in the page
        :rtype: EntityList[Layout]
        """
        return self._headers

    @property
    def footers(self) -> EntityList[Layout]:
        """Footers detected in the Page

        :return: EntityList of footers detected in the page
        :rtype: EntityList[Layout]
        """
        return self._footers

    @property
    def section_headers(self) -> EntityList[Layout]:
        """Section headers detected in the Page

        :return: EntityList of section headers detected in the page
        :rtype: EntityList[Layout]
        """
        return self._section_headers

    @property
    def page_numbers(self) -> EntityList[Layout]:
        """Page numbers detected in the Page

        :return: EntityList of page numbers detected in the page
        :rtype: EntityList[Layout]
        """
        return self._page_numbers

    @property
    def lists(self) -> EntityList[Layout]:
        """Lists detected in the Page

        :return: EntityList of lists detected in the page
        :rtype: EntityList[Layout]
        """
        return self._lists

    @property
    def figures(self) -> EntityList[Layout]:
        """Figures detected in the Page

        :return: EntityList of figures detected in the page
        :rtype: EntityList[Layout]
        """
        return self._figures

    @property
    def tables(self) -> EntityList[Layout]:
        """Tables detected in the Page. This includes Tables detected by the AnalyzeDocument Tables API if used.

        :return: EntityList of tables detected in the page
        :rtype: EntityList[Layout]
        """
        return self._tables

    @property
    def key_values(self) -> EntityList[Layout]:
        """KeyValues detected in the Page

        :return: EntityList of keyvalues detected in the page
        :rtype: EntityList[Layout]
        """
        return self._key_values
class Query(DocumentEntity):
    """
    The Query object merges QUERY and QUERY_RESULT blocks.
    To create a new :class:`Query` object we require the following:

    :param entity_id: Unique identifier of the Query entity.
    :type entity_id: str
    :param query: Text of the query.
    :type query: str
    :param alias: Alias of the query.
    :type alias: str
    :param query_result: Result associated with the query, None when there is no result.
    :type query_result: QueryResult, optional
    :param result_bbox: Bounding box used as the bounding box of the Query entity.
    :type result_bbox: BoundingBox, optional
    """

    def __init__(
        self,
        entity_id: str,
        query: str,
        alias: str,
        query_result: Optional[QueryResult],
        result_bbox: Optional[BoundingBox],
    ):
        super().__init__(entity_id, result_bbox)

        self.query = query
        self.alias = alias
        self.result = query_result
        self._page = None
        self._page_id = None

    @property
    def page(self) -> int:
        """
        :return: Returns the page number of the page the :class:`Query` entity is present in.
        :rtype: int
        """
        return self._page

    @page.setter
    def page(self, page_num: int):
        """
        Sets the page number attribute of the :class:`Query` entity.

        :param page_num: Page number where the Query entity exists.
        :type page_num: int
        """
        self._page = page_num

    @property
    def page_id(self) -> str:
        """
        :return: Returns the Page ID attribute of the page which the entity belongs to.
        :rtype: str
        """
        return self._page_id

    @page_id.setter
    def page_id(self, page_id: str):
        """
        Sets the Page ID of the :class:`Query` entity.

        :param page_id: Page ID of the page the entity belongs to.
        :type page_id: str
        """
        self._page_id = page_id

    @property
    def has_result(self) -> bool:
        """
        :return: Returns whether there was a result associated with the query
        :rtype: bool
        """
        return self.result is not None

    def __repr__(self) -> str:
        """
        :return: Returns Query object as a formatted string: the query text,
                 followed by the answer when a result is present.
        :rtype: str
        """

        if self.result:
            return f"{self.query} {self.result.answer}"
        else:
            return f"{self.query}"

    def get_text_and_words(
        self, config: TextLinearizationConfig = TextLinearizationConfig()
    ) -> Tuple[str, List]:
        """
        Used for linearization, returns the linearized text of the Query and the matching words

        The word list is always empty. When the query has no result only the
        query text is returned.

        :param config: Text linearization configuration (not used by this entity)
        :type config: TextLinearizationConfig
        :return: Tuple of text and word list
        :rtype: Tuple[str, List[Word]]
        """
        # query_result is Optional: an unanswered query has result None, and
        # reading .answer on it would raise AttributeError.
        if self.result is None:
            return f"{self.query}", []
        return f"{self.query} {self.result.answer}", []
class QueryResult(DocumentEntity):
    """
    The QueryResult object represents QUERY_RESULT blocks.
    To create a new :class:`QueryResult` object we require the following:

    :param entity_id: Unique identifier of the QueryResult entity.
    :type entity_id: str
    :param confidence: confidence with which the entity was detected (divided by 100 when stored).
    :type confidence: float
    :param result_bbox: Bounding box of the QueryResult entity.
    :type result_bbox: BoundingBox
    :param answer: Answer carried by the result.
    :type answer: str
    """

    def __init__(
        self,
        entity_id: str,
        confidence: float,
        result_bbox: BoundingBox,
        answer: str,
    ):
        super().__init__(entity_id, result_bbox)

        self._page_id = None
        self._page = None
        self._confidence = confidence / 100
        self.answer = answer

    def __repr__(self) -> str:
        """
        :return: Returns the answer of the QueryResult as a string.
        :rtype: str
        """
        return f"{self.answer}"

    def get_text_and_words(
        self, config: TextLinearizationConfig = TextLinearizationConfig()
    ) -> Tuple[str, List]:
        """
        Used for linearization, returns the answer of the QueryResult and an empty word list

        :param config: Text linearization configuration (not used by this entity)
        :type config: TextLinearizationConfig
        :return: Tuple of text and word list
        :rtype: Tuple[str, List[Word]]
        """
        no_words: List = []
        return self.answer, no_words

    @property
    def page(self) -> int:
        """
        :return: Returns the page number of the page the :class:`QueryResult` entity is present in.
        :rtype: int
        """
        return self._page

    @page.setter
    def page(self, page_num: int):
        """
        Sets the page number attribute of the :class:`QueryResult` entity.

        :param page_num: Page number where the QueryResult entity exists.
        :type page_num: int
        """
        self._page = page_num

    @property
    def page_id(self) -> str:
        """
        :return: Returns the Page ID attribute of the page which the entity belongs to.
        :rtype: str
        """
        return self._page_id

    @page_id.setter
    def page_id(self, page_id: str):
        """
        Sets the Page ID of the :class:`QueryResult` entity.

        :param page_id: Page ID of the page the entity belongs to.
        :type page_id: str
        """
        self._page_id = page_id
class SelectionElement(DocumentEntity):
    """
    To create a new :class:`SelectionElement` object we need the following:

    :param entity_id: Unique identifier of the SelectionElement entity.
    :type entity_id: str
    :param bbox: Bounding box of the SelectionElement
    :type bbox: BoundingBox
    :param status: SelectionStatus.SELECTED / SelectionStatus.NOT_SELECTED
    :type status: SelectionStatus
    :param confidence: Confidence with which this entity is detected (divided by 100 when stored).
    :type confidence: float
    """

    def __init__(
        self,
        entity_id: str,
        bbox: BoundingBox,
        status: SelectionStatus,
        confidence: float = 0,
    ):
        super().__init__(entity_id, bbox)
        self.status = status
        self.key_id = None
        self.value_id = None
        self._page = None
        self._page_id = None
        self._confidence = confidence / 100

    def __repr__(self) -> str:
        """
        Returns "[X]" for a selected element and "[ ]" otherwise.
        """
        return "[X]" if self.status == SelectionStatus.SELECTED else "[ ]"

    def is_selected(self) -> bool:
        """
        :return: Returns True / False depending on selection status of the SelectionElement.
        :rtype: bool
        """
        return self.status == SelectionStatus.SELECTED

    @property
    def words(self) -> List[Word]:
        """
        :return: Empty Word list as SelectionElement do not have words
        :rtype: EntityList[Word]
        """
        return []

    def get_text_and_words(
        self, config: TextLinearizationConfig = TextLinearizationConfig()
    ):
        """
        Used for linearization, represents the element as one synthetic clickable Word.

        The word text is the selected / not selected token of the config. The word
        is wrapped in its own synthetic Line and tagged with this element's id and
        bounding box.

        :param config: Text linearization configuration
        :type config: TextLinearizationConfig
        :return: Tuple of text and single-item word list
        :rtype: Tuple[str, List[Word]]
        """
        if self.status == SelectionStatus.SELECTED:
            token = config.selection_element_selected
        else:
            token = config.selection_element_not_selected

        marker = Word(entity_id=str(uuid.uuid4()), bbox=self.bbox, text=token)
        marker.is_clickable = True
        marker.line = Line(entity_id=str(uuid.uuid4()), bbox=self.bbox, words=[marker])

        # Text is read before the word is tagged, as the tagging is unrelated to it.
        text = marker.text
        marker.value_id = str(self.id)
        marker.value_bbox = self.bbox
        return text, [marker]

    @property
    def page(self):
        """
        :return: Returns the page number of the page the SelectionElement entity is present in.
        :rtype: int
        """
        return self._page

    @page.setter
    def page(self, page_num: int):
        """
        Sets the page number attribute of the SelectionElement entity.

        :param page_num: Page number where the SelectionElement entity exists.
        :type page_num: int
        """
        self._page = page_num

    @property
    def page_id(self) -> str:
        """
        :return: Returns the Page ID attribute of the page which the entity belongs to.
        :rtype: str
        """
        return self._page_id

    @page_id.setter
    def page_id(self, page_id: str):
        """
        Sets the Page ID of the SelectionElement entity.

        :param page_id: Page ID of the page the entity belongs to.
        :type page_id: str
        """
        self._page_id = page_id
47 | :rtype: int 48 | """ 49 | return self._page 50 | 51 | @property 52 | def words(self): 53 | """ 54 | :return: Returns an empty list 55 | :rtype: list 56 | """ 57 | return [] 58 | 59 | @page.setter 60 | def page(self, page_num: int): 61 | """ 62 | Sets the page number attribute of the :class:`Signature` entity. 63 | 64 | :param page_num: Page number where the :class:`Signature` entity exists. 65 | :type page_num: int 66 | """ 67 | self._page = page_num 68 | 69 | @property 70 | def page_id(self) -> str: 71 | """ 72 | :return: Returns the Page ID attribute of the page which the entity belongs to. 73 | :rtype: str 74 | """ 75 | return self._page_id 76 | 77 | @page_id.setter 78 | def page_id(self, page_id: str): 79 | """ 80 | Sets the Page ID of the :class:`Signature` entity. 81 | 82 | :param page_id: Page ID of the page the entity belongs to. 83 | :type page_id: str 84 | """ 85 | self._page_id = page_id 86 | 87 | def get_text_and_words( 88 | self, config: TextLinearizationConfig = TextLinearizationConfig() 89 | ): 90 | w = Word( 91 | entity_id=str(uuid.uuid4()), bbox=self.bbox, text=config.signature_token 92 | ) 93 | w.line = Line(entity_id=str(uuid.uuid4()), bbox=self.bbox, words=[w]) 94 | return config.signature_token, [w] 95 | -------------------------------------------------------------------------------- /textractor/entities/table_footer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Represents a single :class:`TableFooter:class:` object. 
class TableFooter(DocumentEntity):
    """
    Represents a footer that is either in-table or floating

    :param entity_id: Unique identifier of the TableFooter entity.
    :type entity_id: str
    :param bbox: Bounding box of the TableFooter entity.
    :type bbox: BoundingBox
    """

    def __init__(
        self,
        entity_id: str,
        bbox: BoundingBox,
    ):
        super().__init__(entity_id, bbox)
        self._words: List[Word] = []
        self._is_floating: bool = False
        self._page = None
        self._page_id = None

    @property
    def words(self):
        """
        Returns all the Word objects present in the :class:`TableFooter`.

        :return words: List of Word objects, each representing a word within the TableFooter.
        :rtype: list
        """
        return EntityList(self._words)

    @words.setter
    def words(self, words: List[Word]):
        """
        Add Word objects to the :class:`TableFooter`. The list is stored as given.

        :param words: List of Word objects, each representing a word within the TableFooter.
        :type words: list
        """
        self._words = words

    @property
    def text(self) -> str:
        """Returns the text in the footer as one space-separated string

        :return: Text in the footer
        :rtype: str
        """
        return " ".join([w.text for w in self.words])

    @property
    def page(self):
        """
        :return: Returns the page number of the page the TableFooter entity is present in.
        :rtype: int
        """

        return self._page

    @page.setter
    def page(self, page_num: int):
        """
        Sets the page number attribute of the TableFooter entity.

        :param page_num: Page number where the TableFooter entity exists.
        :type page_num: int
        """

        self._page = page_num

    @property
    def page_id(self) -> str:
        """
        :return: Returns the Page ID attribute of the page which the entity belongs to.
        :rtype: str
        """

        return self._page_id

    @page_id.setter
    def page_id(self, page_id: str):
        """
        Sets the Page ID of the TableFooter entity.

        :param page_id: Page ID of the page the entity belongs to.
        :type page_id: str
        """

        self._page_id = page_id

    def get_text_and_words(
        self, config: TextLinearizationConfig = TextLinearizationConfig()
    ):
        """
        Used for linearization, returns the space-separated text of the footer and its words

        :param config: Text linearization configuration (not used by this entity)
        :type config: TextLinearizationConfig
        :return: Tuple of text and word list
        :rtype: Tuple[str, List[Word]]
        """
        words = self.words
        # str.join only accepts strings: joining the Word objects themselves
        # raised TypeError for every footer that has at least one word.
        return " ".join([w.text for w in words]), words
class TableTitle(DocumentEntity):
    """
    Represents a title that is either in-table or floating

    :param entity_id: Unique identifier of the TableTitle entity.
    :type entity_id: str
    :param bbox: Bounding box of the TableTitle entity.
    :type bbox: BoundingBox
    """

    def __init__(
        self,
        entity_id: str,
        bbox: BoundingBox,
    ):
        super().__init__(entity_id, bbox)
        self._page = None
        self._page_id = None
        self._is_floating: bool = False
        self._words: List[Word] = []

    def get_text_and_words(
        self, config: TextLinearizationConfig = TextLinearizationConfig()
    ):
        """
        Used for linearization, delegates to linearize_children on the title's words

        :param config: Text linearization configuration
        :type config: TextLinearizationConfig
        :return: Tuple of text and word list
        :rtype: Tuple[str, List[Word]]
        """
        linearized_text, linearized_words = linearize_children(self.words, config=config)
        return linearized_text, linearized_words

    @property
    def words(self):
        """
        Returns all the Word objects present in the :class:`TableTitle`.

        :return words: List of Word objects, each representing a word within the TableTitle.
        :rtype: list
        """
        return EntityList(self._words)

    @words.setter
    def words(self, words: List[Word]):
        """
        Add Word objects to the :class:`TableTitle`. The list is stored as given.

        :param words: List of Word objects, each representing a word within the TableTitle.
        :type words: list
        """
        self._words = words

    @property
    def text(self) -> str:
        """Returns the text in the title as one space-separated string

        :return: Text in the title
        :rtype: str
        """
        word_texts = [w.text for w in self.words]
        return " ".join(word_texts)

    @property
    def is_floating(self) -> bool:
        """
        :return: Returns whether the TableTitle entity is floating or not.
        :rtype: bool
        """
        return self._is_floating

    @is_floating.setter
    def is_floating(self, is_floating: bool):
        """
        Sets the is_floating attribute of the TableTitle entity.

        :param is_floating: Whether the title is floating (not in-table) or not (in-table).
        :type is_floating: bool
        """
        self._is_floating = is_floating

    @property
    def page(self):
        """
        :return: Returns the page number of the page the TableTitle entity is present in.
        :rtype: int
        """
        return self._page

    @page.setter
    def page(self, page_num: int):
        """
        Sets the page number attribute of the TableTitle entity.

        :param page_num: Page number where the TableTitle entity exists.
        :type page_num: int
        """
        self._page = page_num

    @property
    def page_id(self) -> str:
        """
        :return: Returns the Page ID attribute of the page which the entity belongs to.
        :rtype: str
        """
        return self._page_id

    @page_id.setter
    def page_id(self, page_id: str):
        """
        Sets the Page ID of the TableTitle entity.

        :param page_id: Page ID of the page the entity belongs to.
        :type page_id: str
        """
        self._page_id = page_id
class RegionMismatchError(Exception):
    """Raised when the region of the profile_name differs from the region of the S3 bucket being accessed."""


class NoImageException(Exception):
    """Raised when visualize() is called but no image was saved during the Textract API call."""


class InputError(Exception):
    """Raised when the inputs given to a function are incorrect."""


class EntityListCreationError(Exception):
    """Raised when an EntityList is created without passing any object or list of objects."""


class InvalidProfileNameError(Exception):
    """Raised when the profile_name passed to Textractor is invalid."""


class S3FilePathMissing(Exception):
    """Raised when the s3 file path is missing."""


class MissingDependencyException(Exception):
    """Raised when a dependency needed by a specific code path is missing."""


class IncorrectMethodException(Exception):
    """Raised when the wrong endpoint is called."""


class UnhandledCaseException(Exception):
    """Raised when no statement matched the condition."""


class UnsupportedDocumentException(Exception):
    """Raised by the Textract API when the document could not be processed."""


class InvalidS3ObjectException(Exception):
    """Raised by the Textract API when an S3 object could not be accessed."""
def get_indices(numpy_indexing: str = ":", max_val=10) -> List[int]:
    """
    Function to convert numpy indexing format to list of indices to access cells within the Table.

    A purely numeric string (``"3"``) yields a one-element list. Otherwise the
    string is parsed as ``start:stop[:step]``; empty or ``"None"`` components
    default to ``0``, ``max_val`` and ``1`` respectively, and negative
    ``start``/``stop`` values are offset by ``max_val``. The sign of ``step``
    is ignored: every ``abs(step)``-th index of ``range(start, stop)`` is kept.

    :param numpy_indexing: string containing start:stop:step format
    :type numpy_indexing: str
    :param max_val: maximum rows or columns on the table depending on input.
    :type max_val: int
    :raises IndexError: if a single numeric index is greater than max_val
    :raises ValueError: if a component is not an integer or step is 0
    :return: Returns the indices of table rows and columns following the numpy indexing format,
        in ascending order.
    :rtype: list
    """
    assert isinstance(numpy_indexing, str)
    assert ":" in numpy_indexing or numpy_indexing.isdigit()

    # Only reachable when assertions are disabled (python -O): a bare "None"
    # is treated as the full range.
    if numpy_indexing == "None":
        numpy_indexing = ":"

    if ":" not in numpy_indexing:
        index = int(numpy_indexing)
        if index > max_val:
            raise IndexError()
        return [index]

    def _component(token, default):
        # "" (as in "::2") and "None" (as in str(slice)) both mean "use the default".
        return default if token in ("", "None") else int(token)

    parts = numpy_indexing.split(":")

    start = _component(parts[0], 0)
    if start < 0:
        start = max_val + start

    end = _component(parts[1], max_val)
    if end < 0:
        end = max_val + end

    index_range = range(start, end)

    # A step is only honoured for exactly three components, as before.
    if len(parts) == 3:
        step = _component(parts[2], 1)
        # Slicing keeps every abs(step)-th position in linear time and in
        # ascending order (previously a quadratic list.index() filter followed
        # by an unordered set round-trip).
        return list(index_range[:: abs(step)])

    return list(index_range)
def converter(response):
    """
    Normalise the layout blocks of a Textract response for backward compatibility.

    The response is modified in place and also returned:

    * block types starting with ``LAYOUT_FIGURE_`` are rewritten to ``LAYOUT_TEXT``;
    * any other ``LAYOUT_*`` block type that is not one of the layout constants
      listed below is rewritten to ``LAYOUT_FIGURE``;
    * ``LAYOUT_FIGURE`` blocks whose ``EntityTypes`` contain ``"CONTAINER"`` are
      removed from ``response["Blocks"]`` and their ids are dropped from the
      ``CHILD`` relationship of the ``PAGE`` blocks.

    Any exception raised while converting is logged as a warning and the
    response is returned in whatever state it had reached.

    :param response: Textract response containing a "Blocks" list
    :type response: dict
    :return: The same response object, after conversion
    :rtype: dict
    """
    blocks_to_delete = []
    page_blocks = []
    try:
        for i, block in enumerate(response["Blocks"]):
            if block.get("BlockType") == "PAGE":
                # Remembered so their child id lists can be cleaned up below.
                page_blocks.append(block)
            elif block.get("BlockType", "").startswith("LAYOUT_FIGURE_"):
                block["BlockType"] = LAYOUT_TEXT
            elif (
                block.get("BlockType", "").startswith("LAYOUT_") and
                block.get("BlockType") not in [
                    LAYOUT_TEXT,
                    LAYOUT_TITLE,
                    LAYOUT_HEADER,
                    LAYOUT_FOOTER,
                    LAYOUT_SECTION_HEADER,
                    LAYOUT_PAGE_NUMBER,
                    LAYOUT_LIST,
                    LAYOUT_FIGURE,
                    LAYOUT_TABLE,
                    LAYOUT_KEY_VALUE,
                ]
            ):
                # Unrecognised layout type: fall back to a figure.
                block["BlockType"] = LAYOUT_FIGURE
            elif block.get("BlockType") == LAYOUT_FIGURE and "CONTAINER" in block.get("EntityTypes", []):
                # Keep the list index alongside the block so it can be deleted by position later.
                blocks_to_delete.append((i, block))

        blocks_to_delete_id_set = set([b["Id"] for _, b in blocks_to_delete])
        for page_block in page_blocks:
            for relationship in page_block.get("Relationships", []):
                if relationship["Type"] == "CHILD":
                    relationship["Ids"] = [
                        id
                        for id in relationship["Ids"]
                        if id not in blocks_to_delete_id_set
                    ]
                    # NOTE(review): indentation was lost in this view; the break is
                    # assumed to stop after the first CHILD relationship of each page - confirm.
                    break

        # Delete from the highest index down so earlier indices stay valid.
        for i, block in blocks_to_delete[::-1]:
            del response["Blocks"][i]
    except Exception as ex:
        # Deliberate best effort: a failed conversion must not break parsing.
        logger.warning(f"Failed to convert the response for backward compatibility. {str(ex)}")

    return response
def get_full_json_from_output_config(
    output_config: OutputConfig, job_id: str, s3_client=None
) -> dict:
    """
    Assemble the full Textract JSON for a job from the result files in its S3 output location.

    The output location is listed repeatedly; every key not seen before is
    downloaded, parsed as JSON and its "Blocks" are appended to the combined
    result. Polling stops once the most recently modified result file is at
    least 5 seconds old. If no result file ever appears the function keeps
    polling.

    :param output_config: Output configuration holding the S3 bucket and prefix
    :type output_config: OutputConfig
    :param job_id: Textract job id whose results should be read
    :type job_id: str
    :param s3_client: boto3 S3 client, created with default settings when omitted
    :type s3_client: Client, optional
    :raises ValueError: if output_config, job_id, the bucket or the prefix is missing
    :return: Combined response with None values removed
    :rtype: dict
    """
    if not output_config or not job_id:
        raise ValueError("no output_config or job_id")
    if not output_config.s3_bucket or not output_config.s3_prefix:
        raise ValueError("output_config is missing s3_bucket or s3_prefix")
    if not s3_client:
        s3_client = boto3.client("s3")

    settle_seconds = 5  # results are complete once nothing newer than this exists
    poll_interval_seconds = 1  # pause between listings instead of busy-looping on S3

    result_value = dict()
    last_result = None
    parsed_keys = set()
    while True:
        keys = get_s3_output_config_keys(
            output_config=output_config, job_id=job_id, s3_client=s3_client
        )
        for key in keys:
            if key in parsed_keys:
                continue
            parsed_keys.add(key)
            s3_object = s3_client.get_object(Bucket=output_config.s3_bucket, Key=key)
            if last_result is None:
                last_result = s3_object["LastModified"]
            else:
                last_result = max(last_result, s3_object["LastModified"])
            response = dict(json.loads(s3_object["Body"].read().decode("utf-8")))
            if "Blocks" in result_value:
                result_value["Blocks"].extend(response["Blocks"])
            else:
                result_value = response

        if (
            last_result is not None
            and (datetime.datetime.now().astimezone() - last_result).total_seconds()
            >= settle_seconds
        ):
            break
        # More files may still be written; wait before listing again.
        time.sleep(poll_interval_seconds)

    result_value = remove_none(result_value)
    return result_value
def s3_path_to_bucket_and_prefix(s3_path: str) -> Tuple[str, str]:
    """Converts an S3 URI to a bucket and prefix

    Only a leading ``s3://`` scheme is removed, so keys that happen to contain
    that text are left intact. The remainder is split on the first ``/``.

    :param s3_path: S3 URI
    :type s3_path: str
    :raises InputError: Raised if the given path cannot be parsed
    :return: Tuple of bucket and prefix as string
    :rtype: Tuple[str, str]
    """
    scheme = "s3://"
    path = s3_path[len(scheme):] if s3_path.startswith(scheme) else s3_path
    try:
        # Unpacking fails with ValueError (not IndexError) when there is no "/".
        bucket, prefix = path.split("/", 1)
    except ValueError as ex:
        raise InputError(
            f"Could not parse {s3_path} as an S3 URI of the form s3://bucket/key"
        ) from ex
    return bucket, prefix
def delete_from_s3(client, s3_path: str):
    """Delete a file from S3

    :param client: boto3 client
    :type client: Client
    :param s3_path: S3 path to the object to delete
    :type s3_path: str
    :raises InputError: Raised if s3_path cannot be parsed
    """
    bucket, prefix = s3_path_to_bucket_and_prefix(s3_path)

    # boto3 client operations only accept keyword arguments; passing the
    # bucket and key positionally raises TypeError before any request is made.
    client.delete_object(Bucket=bucket, Key=prefix)
def jaccard_similarity(list_1: list, list_2: list) -> float:
    """
    Calculates Jaccard similarity between the 2 input lists.

    Duplicates within a list are ignored. When both lists are empty the
    similarity is reported as 0.0 instead of dividing by zero.

    :param list_1: First list to check for similarity
    :type list_1: list
    :param list_2: Second list to check for similarity
    :type list_2: list

    :return: Returns the similarity measure calculated for the 2 input lists.
    :rtype: float
    """

    set_1 = set(list_1)
    set_2 = set(list_2)
    union = set_1 | set_2
    if not union:
        # Nothing to compare; avoids ZeroDivisionError on two empty inputs.
        return 0.0
    return float(len(set_1 & set_2) / len(union))
def read(fname):
    """Return the text of *fname*, resolved relative to the directory of this setup.py.

    The file is read as UTF-8 and closed before returning.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    with open(path, encoding="utf-8") as f:
        return f.read()
packages=find_packages(exclude=['tests']), 30 | include_package_data=True, 31 | exclude_package_data={"": ["test_*.py", "__pycache__"]}, 32 | version='0.0.9', 33 | description='Amazon Textract package to easier access data through geometric information', 34 | install_requires=requirements, 35 | scripts=['bin/amazon-textract-geofinder'], 36 | long_description_content_type='text/markdown', 37 | long_description=read('README.md'), 38 | author='Amazon Rekognition Textract Demoes', 39 | author_email='rekognition-textract-demos@amazon.com', 40 | url='https://github.com/aws-samples/amazon-textract-textractor/tpipelinegeofinder', 41 | keywords='amazon-textract-textractor amazon textract finder geometry geo', 42 | license="Apache License Version 2.0", 43 | classifiers=[ 44 | "Development Status :: 4 - Beta", 45 | "Topic :: Utilities", 46 | 'License :: OSI Approved :: Apache Software License', 47 | 'Programming Language :: Python :: 3.6', 48 | 'Programming Language :: Python :: 3.7', 49 | 'Programming Language :: Python :: 3.8', 50 | 'Programming Language :: Python :: 3.9', 51 | 'Programming Language :: Python :: 3.10', 52 | 'Programming Language :: Python :: 3.11', 53 | 'Programming Language :: Python :: 3.12', 54 | ], 55 | python_requires='>=3.6') 56 | -------------------------------------------------------------------------------- /tpipelinegeofinder/tests/data/multi_page_example_file.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tpipelinegeofinder/tests/data/multi_page_example_file.pdf -------------------------------------------------------------------------------- /tpipelinegeofinder/tests/data/patient_intake_form_sample.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tpipelinegeofinder/tests/data/patient_intake_form_sample.jpg -------------------------------------------------------------------------------- /tpipelinegeofinder/tests/test_ocrdb.py: -------------------------------------------------------------------------------- 1 | from textractgeofinder.ocrdb import OCRDB 2 | from textractgeofinder.tword import TWord 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | def test_creation(caplog): 9 | caplog.set_level(logging.DEBUG) 10 | ocrdb = OCRDB.getInstance() 11 | tword: TWord = TWord(text='sometext', 12 | original_text='SomeText', 13 | text_type='word', 14 | confidence=71.7424087524414, 15 | id='e5d9a27b-483c-4c8b-9d09-4092d050e2e4', 16 | xmin=100, 17 | ymin=0, 18 | xmax=263, 19 | ymax=22, 20 | page_number=1, 21 | doc_width=1080, 22 | doc_height=1920, 23 | child_relationships='', 24 | reference=None, 25 | resolver=None) 26 | ocrdb.insert(textract_doc_uuid='bla', x=tword) 27 | logger.debug(f"tword: {tword}") 28 | -------------------------------------------------------------------------------- /tpipelinegeofinder/tests/test_tword.py: -------------------------------------------------------------------------------- 1 | from textractgeofinder.tword import TWord 2 | import logging 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | def test_creation(caplog): 8 | caplog.set_level(logging.DEBUG) 9 | tword: TWord = TWord(text="test", 10 | text_type="text_type", 11 | confidence=99, 12 | id="test-id", 13 | page_number=1, 14 | ymin=1, 15 | ymax=1, 16 | xmin=10, 17 | xmax=10, 18 | original_text="original-text", 19 | doc_width=100, 20 | doc_height=100, 21 | reference="test") 22 | logger.debug(f"tword: {tword}") 23 | -------------------------------------------------------------------------------- /tpipelinegeofinder/textractgeofinder/__init__.py: 
-------------------------------------------------------------------------------- 1 | from ._version import __version__ 2 | 3 | import logging 4 | from logging import NullHandler 5 | 6 | logging.getLogger(__name__).addHandler(NullHandler()) 7 | -------------------------------------------------------------------------------- /tpipelinegeofinder/textractgeofinder/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.8' 2 | -------------------------------------------------------------------------------- /tpipelinegeofinder/textractgeofinder/tinterface.py: -------------------------------------------------------------------------------- 1 | from textractquery.tword import TWord 2 | from typing import Optional 3 | from enum import Enum 4 | 5 | 6 | class Direction(Enum): 7 | UP = 1 8 | RIGHT = 2 9 | DOWN = 3 10 | LEFT = 4 11 | 12 | 13 | class TInterface: 14 | pass 15 | -------------------------------------------------------------------------------- /tpipelinepagedimensions/Manifest.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | 3 | recursive-exclude * __pycache__ 4 | recursive-exclude * *.py[co] test_*.py 5 | 6 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif -------------------------------------------------------------------------------- /tpipelinepagedimensions/README.md: -------------------------------------------------------------------------------- 1 | # Textract-Pipeline-PageDimensions 2 | 3 | Provides functions to add page dimensions with doc_width and doc_height to the Textract JSON schema for the PAGE blocks under the custom attribute in the form of: 4 | 5 | e. g. 
6 | 7 | ``` 8 | {'PageDimension': {'doc_width': 1549.0, 'doc_height': 370.0} } 9 | ``` 10 | 11 | # Install 12 | 13 | ```bash 14 | > python -m pip install amazon-textract-pipeline-pagedimensions 15 | ``` 16 | 17 | Make sure your environment is set up with AWS credentials through configuration files, environment variables, or an attached role. (https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) 18 | 19 | # Samples 20 | 21 | ## Add Page dimensions for a local file 22 | 23 | This sample uses amazon-textract-caller and amazon-textract-pipeline-pagedimensions. 24 | 25 | ```bash 26 | python -m pip install amazon-textract-caller 27 | ``` 28 | 29 | ```python 30 | from textractpagedimensions.t_pagedimensions import add_page_dimensions 31 | from textractcaller.t_call import call_textract 32 | from trp.trp2 import TDocument, TDocumentSchema 33 | input_file = 'path/to/your/document.png' 34 | j = call_textract(input_document=input_file) 35 | t_document: TDocument = TDocumentSchema().load(j) 36 | add_page_dimensions(t_document=t_document, input_document=input_file) 37 | print(t_document.pages[0].custom['PageDimension']) 38 | # output will be something like this: 39 | # { 40 | # 'doc_width': 1544, 41 | # 'doc_height': 1065 42 | # } 43 | ``` 44 | 45 | ## Using the Amazon Textract Helper command line tool with PageDimensions 46 | 47 | Together with the Amazon Textract Helper and Amazon Textract Response Parser, we can build a pipeline that includes information about PageDimension and Orientation of pages 48 | as a short demonstration of the information that is added to the Textract JSON. 
def read(fname):
    """Return the text of *fname*, resolved relative to the directory of this setup.py.

    The file is read as UTF-8 and closed before returning.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    with open(path, encoding="utf-8") as f:
        return f.read()
build/ amazon_textract_pipeline_pagedimensions.egg-info/') 15 | os.system('python setup.py sdist bdist_wheel') 16 | os.system('twine check dist/*') 17 | os.system('twine upload --repository pypitest dist/*') 18 | sys.exit() 19 | 20 | if sys.argv[-1] == 'publish': 21 | os.system(f"cd {os.path.dirname(__file__)}") 22 | os.system('rm -rf dist/ build/ amazon_textract_pipeline_pagedimensions.egg-info/') 23 | os.system('python setup.py sdist bdist_wheel') 24 | os.system('twine check dist/*') 25 | os.system('twine upload --repository pypi dist/*') 26 | sys.exit() 27 | 28 | setup(name='amazon-textract-pipeline-pagedimensions', 29 | packages=find_packages(exclude=['tests']), 30 | include_package_data=True, 31 | exclude_package_data={"": ["test_*.py", "__pycache__"]}, 32 | version='0.0.10', 33 | description='Amazon Textract Pipeline Component to add page dimensions to page block types', 34 | install_requires=requirements, 35 | scripts=['bin/amazon-textract-pipeline-pagedimensions'], 36 | long_description_content_type='text/markdown', 37 | long_description=read('README.md'), 38 | author='Amazon Rekognition Textract Demoes', 39 | author_email='rekognition-textract-demos@amazon.com', 40 | url='https://github.com/aws-samples/amazon-textract-textractor/tree/master/tpipelinepagedimensions', 41 | keywords='amazon-textract-textractor amazon textract textractor pipeline page dimensions', 42 | license="Apache License Version 2.0", 43 | classifiers=[ 44 | "Development Status :: 4 - Beta", 45 | "Topic :: Utilities", 46 | 'License :: OSI Approved :: Apache Software License', 47 | 'Programming Language :: Python :: 3.6', 48 | 'Programming Language :: Python :: 3.7', 49 | 'Programming Language :: Python :: 3.8', 50 | 'Programming Language :: Python :: 3.9', 51 | 'Programming Language :: Python :: 3.10', 52 | 'Programming Language :: Python :: 3.11', 53 | 'Programming Language :: Python :: 3.12', 54 | ], 55 | python_requires='>=3.6') 56 | 
-------------------------------------------------------------------------------- /tpipelinepagedimensions/tests/data/Textract-orginal-2021-05-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tpipelinepagedimensions/tests/data/Textract-orginal-2021-05-10.png -------------------------------------------------------------------------------- /tpipelinepagedimensions/tests/test_pagedimensions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | import logging 4 | 5 | from typing import List 6 | from textractpagedimensions.t_pagedimensions import add_page_dimensions 7 | from textractcaller.t_call import call_textract 8 | from trp.trp2 import TDocument, TDocumentSchema, TBlock 9 | 10 | 11 | def test_dimensions_from_file(): 12 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 13 | input_file = os.path.join(SCRIPT_DIR, "data/Textract-orginal-2021-05-10.png") 14 | j = call_textract(input_document=input_file) 15 | t_document: TDocument = TDocumentSchema().load(j) 16 | add_page_dimensions(t_document=t_document, input_document=input_file) 17 | assert t_document.pages[0].custom['PageDimension'] == {'doc_width': 1544, 'doc_height': 1065} 18 | 19 | 20 | def test_dimensions_from_tiff(caplog): 21 | caplog.set_level(logging.DEBUG, logger="textractcaller") 22 | textract_client = boto3.client('textract', region_name='us-east-2') 23 | input_file = "s3://amazon-textract-public-content/blogs/multipage_tiff_example_small.tiff" 24 | j = call_textract(input_document=input_file, force_async_api=True, boto3_textract_client=textract_client) 25 | t_document: TDocument = TDocumentSchema().load(j) 26 | add_page_dimensions(t_document=t_document, input_document=input_file) 27 | assert t_document.pages[0].custom['PageDimension'] == {'doc_width': 1333.0, 'doc_height': 1000.0} 
def test_s3():
    """Page dimensions are added for every page of a multi-page PDF stored in S3."""
    textract_client = boto3.client('textract', region_name='us-east-2')
    input_file = "s3://amazon-textract-public-content/blogs/2-pager-different-dimensions.pdf"
    j = call_textract(input_document=input_file, boto3_textract_client=textract_client)
    t_document: TDocument = TDocumentSchema().load(j)
    add_page_dimensions(t_document=t_document, input_document=input_file)
    pages: List[TBlock] = t_document.pages
    # These comparisons were bare expressions before, so the test could never fail.
    assert pages[0].custom['PageDimension'] == {'doc_width': 1549.0, 'doc_height': 370.0}
    assert pages[1].custom['PageDimension'] == {'doc_width': 1079.0, 'doc_height': 505.0}
# /tpipelinepagedimensions/textractpagedimensions/_version.py:
# --------------------------------------------------------------------------------
__version__ = '0.0.9'

# --------------------------------------------------------------------------------
# /tpipelinepagedimensions/textractpagedimensions/t_pagedimensions.py:
# --------------------------------------------------------------------------------
import logging
import trp.trp2 as t2
import os
from typing import List, Union
from dataclasses import dataclass, asdict
from PIL import Image, ImageSequence
from pypdf import PdfReader
import boto3
import io

logger = logging.getLogger(__name__)

# File suffixes (lower-case, including the dot) grouped by how they are handled.
only_async_suffixes = ['.pdf']
tiff_suffixes = ['.tiff', '.tif']
sync_suffixes = ['.png', '.jpg', '.jpeg'] + tiff_suffixes
supported_suffixes = only_async_suffixes + sync_suffixes


@dataclass
class DocumentDimensions():
    """Width and height of a single page, as reported by the PDF media box or the image size."""
    doc_width: float
    doc_height: float


def _normalize_suffix(ext):
    """Lower-case a file suffix so that e.g. '.PNG' matches the suffix lists; None/'' pass through."""
    return ext.lower() if ext else ext


def _looks_like_pdf(data) -> bool:
    """Return True when the raw bytes start (ignoring leading whitespace) with the PDF header."""
    return bytes(data[:1024]).lstrip().startswith(b'%PDF-')


def get_size_from_filestream(fs, ext) -> List[DocumentDimensions]:
    """
    Read the dimensions of every page from an open binary stream.

    fs: binary file-like object positioned at the start of the document.
    ext: file suffix including the dot (compared case-insensitively) or None.
         A PDF suffix selects the PDF reader; anything else is opened as an image.

    Returns one DocumentDimensions per PDF page or image frame, in document order.
    """
    if _normalize_suffix(ext) in only_async_suffixes:
        # TODO: assumes the order of pages in blocks is correct, when calling Textract with bytes the block.page is empty
        reader = PdfReader(fs)
        # Use the box's width/height rather than its upper-right corner so that
        # media boxes whose origin is not (0, 0) are measured correctly.
        return [
            DocumentDimensions(doc_width=float(pdf_page.mediabox.width),
                               doc_height=float(pdf_page.mediabox.height)) for pdf_page in reader.pages
        ]

    img = Image.open(fs)
    # Multi-frame images (e.g. multi-page TIFF) yield one entry per frame.
    return [
        DocumentDimensions(doc_width=float(frame.width), doc_height=float(frame.height))
        for frame in ImageSequence.Iterator(img)
    ]


def get_size_from_s3(s3_bucket, s3_key):
    """
    Download an object from S3 and return its page dimensions.

    Raises ValueError when the key's suffix is not one of supported_suffixes
    (the comparison ignores case).
    """
    _, ext = os.path.splitext(s3_key)
    if _normalize_suffix(ext) not in supported_suffixes:
        raise ValueError(f"{s3_key} not in {supported_suffixes}")
    s3 = boto3.client('s3')
    s3_object = s3.get_object(Bucket=s3_bucket, Key=s3_key)
    # The whole object is read into memory so the parsers get a seekable stream.
    return get_size_from_filestream(io.BytesIO(s3_object['Body'].read()), ext)


def get_width_height_from_s3_object(s3_bucket, s3_key):
    """Alias of get_size_from_s3, kept as a separate public name."""
    return get_size_from_s3(s3_bucket, s3_key)


def get_width_height_from_file(filepath):
    """
    Return the page dimensions of a local file.

    Raises ValueError when the file's suffix is not one of supported_suffixes
    (the comparison ignores case).
    """
    _, ext = os.path.splitext(filepath)
    if _normalize_suffix(ext) not in supported_suffixes:
        raise ValueError(f"{filepath} not in {supported_suffixes}")
    with open(filepath, 'rb') as input_fs:
        return get_size_from_filestream(input_fs, ext)


def add_page_dimensions(t_document: t2.TDocument, input_document: Union[str, bytes]) -> t2.TDocument:
    """
    adds Page Dimensions to each page of the document in the form of a custom property on the Block
    e. g. {'PageDimension': {'doc_width': 1549.0, 'doc_height': 370.0} }

    t_document: parsed Textract document; its page blocks are modified in place.
    input_document: the document that was sent to Textract, given as an
        "s3://bucket/key" URL, a local file path, or the raw bytes/bytearray.

    Returns the same t_document instance.

    Raises ValueError for a path/key with an unsupported suffix, and
    AssertionError when the number of pages in t_document differs from the
    number of pages found in input_document (this is also what happens for an
    input_document of any other type, because no dimensions are collected).
    """
    page_dimensions: List[DocumentDimensions] = list()

    if isinstance(input_document, str):
        s3_prefix = "s3://"
        if len(input_document) > 7 and input_document.lower().startswith(s3_prefix):
            # Slice the prefix off instead of str.replace: the prefix check is
            # case-insensitive and the key itself may legitimately contain "s3://".
            s3_bucket, s3_key = input_document[len(s3_prefix):].split("/", 1)
            page_dimensions = get_width_height_from_s3_object(s3_bucket=s3_bucket, s3_key=s3_key)
        else:
            page_dimensions = get_width_height_from_file(filepath=input_document)

    elif isinstance(input_document, (bytes, bytearray)):
        # Raw bytes carry no suffix; sniff the PDF header so PDFs are not handed
        # to the image reader, which cannot open them.
        sniffed_ext = '.pdf' if _looks_like_pdf(input_document) else None
        page_dimensions = get_size_from_filestream(io.BytesIO(input_document), ext=sniffed_ext)

    # bytes do not return a page for the Block, cannot use the mapping logic as above
    if len(t_document.pages) != len(page_dimensions):
        raise AssertionError(
            f"number of pages in document did not match number of dimensions received: document-pages: {len(t_document.pages)}, dimension-pages: {len(page_dimensions)}"
        )
    for idx, block in enumerate(t_document.pages):
        # Prefer the 1-based page number on the block; fall back to the block's
        # position in the page list when the page number is missing.
        dimension_index = block.page - 1 if block.page else idx
        dimensions = asdict(page_dimensions[dimension_index])
        if block.custom:
            block.custom['PageDimension'] = dimensions
        else:
            block.custom = {'PageDimension': dimensions}

    return t_document
# --------------------------------------------------------------------------------