├── .flake8
├── .github
    ├── PULL_REQUEST_TEMPLATE.md
    └── workflows
    │   ├── documentation.yml
    │   ├── lambda_layers.yml
    │   ├── release-caller.yml
    │   ├── release.yml
    │   ├── test-pr-caller.yml
    │   ├── test-pr-geofinder.yml
    │   ├── test-pr-prettyprinter.yml
    │   └── tests.yml
├── .gitignore
├── .style.yapf
├── .yapfignore
├── CITATION.cff
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── NOTICE
├── README.md
├── caller
    ├── LICENSE
    ├── Manifest.in
    ├── README.md
    ├── setup.cfg
    ├── setup.py
    ├── tests
    │   ├── data
    │   │   ├── driverlicense.png
    │   │   ├── employmentapp.png
    │   │   ├── employmentapp.tiff
    │   │   ├── json_from_python_repl.json
    │   │   ├── multi_page_tiff.tiff
    │   │   └── verification-of-employment.png
    │   └── test_caller.py
    └── textractcaller
    │   ├── __init__.py
    │   ├── _version.py
    │   └── t_call.py
├── docs
    ├── Makefile
    ├── make.bat
    └── source
    │   ├── commandline.rst
    │   ├── conf.py
    │   ├── examples.rst
    │   ├── favicon.ico
    │   ├── images
    │       └── lambda_tutorial
    │       │   ├── 1b.png
    │       │   ├── 1c.png
    │       │   ├── 2.png
    │       │   ├── 2a.png
    │       │   ├── 2b.png
    │       │   ├── 3a.png
    │       │   └── 3c.png
    │   ├── index.rst
    │   ├── installation.rst
    │   ├── notebooks
    │       ├── document_linearization_to_markdown_or_html.ipynb
    │       ├── exporting_form_data.ipynb
    │       ├── finding_words_within_a_document.ipynb
    │       ├── going_further.ipynb
    │       ├── imgs
    │       │   └── excel.png
    │       ├── interfacing_with_trp2.ipynb
    │       ├── introduction_to_searching.ipynb
    │       ├── layout_analysis.ipynb
    │       ├── layout_analysis_for_text_linearization.ipynb
    │       ├── parsing_an_existing_response.ipynb
    │       ├── signature_detection.ipynb
    │       ├── simple_ocr.ipynb
    │       ├── table_data_to_various_formats.ipynb
    │       ├── tabular_data_linearization.ipynb
    │       ├── tabular_data_linearization_continued.ipynb
    │       ├── textractor_for_large_language_models.ipynb
    │       ├── using_analyze_expense.ipynb
    │       ├── using_analyze_id.ipynb
    │       ├── using_queries.ipynb
    │       └── visualizing_results.ipynb
    │   ├── overlayer.png
    │   ├── overlayer_bigger.png
    │   ├── textractor.data.constants.rst
    │   ├── textractor.data.text_linearization_config.rst
    │   ├── textractor.entities.rst
    │   ├── textractor.parsers.rst
    │   ├── textractor.rst
    │   ├── textractor.visualizers.rst
    │   ├── textractor_cropped.png
    │   └── using_in_lambda.rst
├── extras
    ├── dev.txt
    ├── docs.txt
    ├── pandas.txt
    ├── pdf.txt
    ├── pdfium.txt
    └── torch.txt
├── helper
    ├── LICENSE
    ├── Manifest.in
    ├── README.md
    ├── bin
    │   └── amazon-textract
    ├── docs
    │   ├── employmentapp_boxed_FORM_CELL_.png
    │   ├── employmentapp_boxed_LINE_TEXT_OVERLAY.png
    │   ├── employmentapp_boxed_WORD_.png
    │   └── employmentapp_boxed_WORD_TEXT_OVERLAY.png
    ├── fonts
    │   └── Roboto-Regular.ttf
    ├── nice_textract.json
    ├── setup.cfg
    ├── setup.py
    ├── textract.json
    └── textracthelper
    │   ├── .gitignore
    │   ├── __init__.py
    │   ├── _version.py
    │   └── examples
    │       └── employmentapp.png
├── idp_cdk_manifest
    ├── .gitignore
    ├── LICENSE
    ├── Manifest.in
    ├── README.md
    ├── output.tar.gz
    ├── setup.cfg
    ├── setup.py
    ├── tests
    │   ├── data
    │   │   ├── analyze_id.json
    │   │   ├── manifest_all_features.json
    │   │   ├── manifest_default.json
    │   │   ├── manifest_minimal.json
    │   │   ├── manifest_queries_no_alias.json
    │   │   ├── manifest_queries_no_pages.json
    │   │   ├── manifest_with_classification.json
    │   │   ├── manifest_with_classification_and_metadata.json
    │   │   ├── queries_forms.json
    │   │   └── simple_feature_manifest.json
    │   └── test_manifest.py
    └── textractmanifest
    │   ├── __init__.py
    │   └── manifest.py
├── images
    └── amzn.png
├── overlayer
    ├── LICENSE
    ├── Manifest.in
    ├── README.md
    ├── setup.cfg
    ├── setup.py
    ├── tests
    │   ├── data
    │   │   └── Amazon-Textract-Pdf.pdf
    │   └── test_overlayer.py
    └── textractoverlayer
    │   ├── __init__.py
    │   ├── _version.py
    │   ├── image_tools.py
    │   └── t_overlay.py
├── prettyprinter
    ├── LICENSE
    ├── Manifest.in
    ├── README.md
    ├── setup.cfg
    ├── setup.py
    ├── tests
    │   ├── data
    │   │   ├── analyzeDocResponse.json
    │   │   ├── bounding_box_issue.json
    │   │   ├── employmentapp.json
    │   │   ├── layout_csv_example.json
    │   │   ├── lending-doc-output_from_output_config.json
    │   │   ├── multi_page_example_file.json
    │   │   ├── queries_one_no_answer.json
    │   │   ├── request_for_verification_of_employment.json
    │   │   └── w2-example.json
    │   └── test_pretty_print.py
    └── textractprettyprinter
    │   ├── __init__.py
    │   ├── _version.py
    │   ├── t_pretty_print.py
    │   ├── t_pretty_print_expense.py
    │   └── t_pretty_print_layout.py
├── requirements.txt
├── setup.cfg
├── setup.py
├── tests
    ├── __init__.py
    ├── fixtures
    │   ├── amzn_q2.png
    │   ├── fake_id.png
    │   ├── form.png
    │   ├── form_1005.png
    │   ├── in-table-title.png
    │   ├── invalid.pdf
    │   ├── invoice.png
    │   ├── matrix.png
    │   ├── multiline_cells.jpeg
    │   ├── patient_intake_form_sample.png
    │   ├── paystub.jpg
    │   ├── paystub_header.png
    │   ├── paystub_single_table.png
    │   ├── paystub_tables.png
    │   ├── reading_order.pdf
    │   ├── receipt.jpg
    │   ├── receipt_no_summary.png
    │   ├── resume.png
    │   ├── sample-invoice.pdf
    │   ├── saved_api_responses
    │   │   ├── test_analyze_expense_from_image.json
    │   │   ├── test_analyze_expense_from_path.json
    │   │   ├── test_analyze_expense_no_summary_fields.json
    │   │   ├── test_analyze_id_from_image.json
    │   │   ├── test_analyze_id_from_path.json
    │   │   ├── test_bad_queries_as_strings.json
    │   │   ├── test_detect_document_text.json
    │   │   ├── test_detect_document_text_list_PIL_images.json
    │   │   ├── test_detect_document_text_single_page_pdf_input.json
    │   │   ├── test_detect_no_duplicate_words_amzn_q2.png.json
    │   │   ├── test_detect_no_duplicate_words_fake_id.png.json
    │   │   ├── test_detect_no_duplicate_words_form.png.json
    │   │   ├── test_detect_no_duplicate_words_form_1005.png.json
    │   │   ├── test_detect_no_duplicate_words_in-table-title.png.json
    │   │   ├── test_detect_no_duplicate_words_matrix.png.json
    │   │   ├── test_detect_no_duplicate_words_patient_intake_form_sample.png.json
    │   │   ├── test_detect_no_duplicate_words_paystub.jpg.json
    │   │   ├── test_detect_no_duplicate_words_paystub_header.png.json
    │   │   ├── test_detect_no_duplicate_words_paystub_single_table.png.json
    │   │   ├── test_detect_no_duplicate_words_paystub_tables.png.json
    │   │   ├── test_detect_no_duplicate_words_reading_order.pdf.json
    │   │   ├── test_detect_no_duplicate_words_receipt.jpg.json
    │   │   ├── test_detect_no_duplicate_words_sample-invoice.pdf.json
    │   │   ├── test_detect_no_duplicate_words_screenshot.png.json
    │   │   ├── test_detect_no_duplicate_words_single-page-1.png.json
    │   │   ├── test_detect_no_duplicate_words_single-page-2.png.json
    │   │   ├── test_detect_no_duplicate_words_test.png.json
    │   │   ├── test_detect_no_duplicate_words_textractor-singlepage-doc.pdf.json
    │   │   ├── test_detect_no_duplicate_words_tutorial.pdf.json
    │   │   ├── test_document_smoke_test.json
    │   │   ├── test_document_to_html_amzn_q2.png.json
    │   │   ├── test_document_to_html_fake_id.png.json
    │   │   ├── test_document_to_html_form.png.json
    │   │   ├── test_document_to_html_form_1005.png.json
    │   │   ├── test_document_to_html_in-table-title.png.json
    │   │   ├── test_document_to_html_matrix.png.json
    │   │   ├── test_document_to_html_patient_intake_form_sample.png.json
    │   │   ├── test_document_to_html_paystub.jpg.json
    │   │   ├── test_document_to_html_paystub_header.png.json
    │   │   ├── test_document_to_html_paystub_single_table.png.json
    │   │   ├── test_document_to_html_paystub_tables.png.json
    │   │   ├── test_document_to_html_reading_order.pdf.json
    │   │   ├── test_document_to_html_receipt.jpg.json
    │   │   ├── test_document_to_html_sample-invoice.pdf.json
    │   │   ├── test_document_to_html_screenshot.png.json
    │   │   ├── test_document_to_html_single-page-1.png.json
    │   │   ├── test_document_to_html_single-page-2.png.json
    │   │   ├── test_document_to_html_test.png.json
    │   │   ├── test_document_to_html_textractor-singlepage-doc.pdf.json
    │   │   ├── test_document_to_html_tutorial.pdf.json
    │   │   ├── test_document_to_markdown_amzn_q2.png.json
    │   │   ├── test_document_to_markdown_fake_id.png.json
    │   │   ├── test_document_to_markdown_form.png.json
    │   │   ├── test_document_to_markdown_form_1005.png.json
    │   │   ├── test_document_to_markdown_in-table-title.png.json
    │   │   ├── test_document_to_markdown_matrix.png.json
    │   │   ├── test_document_to_markdown_patient_intake_form_sample.png.json
    │   │   ├── test_document_to_markdown_paystub_header.png.json
    │   │   ├── test_document_to_markdown_paystub_single_table.png.json
    │   │   ├── test_document_to_markdown_paystub_tables.png.json
    │   │   ├── test_document_to_markdown_reading_order.pdf.json
    │   │   ├── test_document_to_markdown_receipt.jpg.json
    │   │   ├── test_document_to_markdown_sample-invoice.pdf.json
    │   │   ├── test_document_to_markdown_screenshot.png.json
    │   │   ├── test_document_to_markdown_single-page-1.png.json
    │   │   ├── test_document_to_markdown_single-page-2.png.json
    │   │   ├── test_document_to_markdown_test.png.json
    │   │   ├── test_document_to_markdown_textractor-singlepage-doc.pdf.json
    │   │   ├── test_figure_layout_prefixes_and_suffixes_in_text_words.json
    │   │   ├── test_layout.json
    │   │   ├── test_page.json
    │   │   ├── test_queries_as_strings.json
    │   │   ├── test_signature.json
    │   │   ├── test_table.json
    │   │   ├── test_table_prefixes_and_suffixes_in_text.json
    │   │   ├── test_table_prefixes_and_suffixes_in_words.json
    │   │   ├── test_table_with_title_and_footers.json
    │   │   ├── test_textractor_analyze_document.json
    │   │   ├── test_textractor_analyze_document_local_pillow_image.json
    │   │   ├── test_textractor_analyze_document_multipage_pdf.json
    │   │   ├── test_textractor_analyze_document_pillow_image_list.json
    │   │   ├── test_textractor_s3_image_input.json
    │   │   ├── test_textractor_start_document_analysis.json
    │   │   ├── test_textractor_start_document_analysis_multipage_pdf_s3.json
    │   │   ├── test_textractor_start_document_text_detection.json
    │   │   ├── test_textractor_start_document_text_detection_multipage_pdf_s3.json
    │   │   └── test_word_ordering_in_cell.json
    │   ├── screenshot.png
    │   ├── signature.jpg
    │   ├── single-page-1.png
    │   ├── single-page-2.png
    │   ├── test.png
    │   ├── textractor-multipage-doc.pdf
    │   ├── textractor-singlepage-doc.pdf
    │   ├── titanic.webp
    │   ├── tutorial.pdf
    │   ├── vbat.png
    │   └── vbat2.png
    ├── invoice_sample.pdf
    ├── test_analyze_expense.py
    ├── test_analyze_id.py
    ├── test_bbox.py
    ├── test_document.py
    ├── test_get_text_and_words.py
    ├── test_key_value.py
    ├── test_layout.py
    ├── test_line.py
    ├── test_page.py
    ├── test_parse_no_fail.py
    ├── test_queries.py
    ├── test_selection_element.py
    ├── test_signature.py
    ├── test_table.py
    ├── test_textractor.py
    ├── test_textractor_cli.py
    ├── test_value.py
    ├── test_visualizer.py
    ├── test_word.py
    ├── test_word_ordering.py
    └── utils.py
├── textractor
    ├── __init__.py
    ├── cli
    │   ├── __init__.py
    │   └── cli.py
    ├── data
    │   ├── __init__.py
    │   ├── constants.py
    │   ├── html_linearization_config.py
    │   ├── markdown_linearization_config.py
    │   └── text_linearization_config.py
    ├── entities
    │   ├── __init__.py
    │   ├── bbox.py
    │   ├── document.py
    │   ├── document_entity.py
    │   ├── expense_document.py
    │   ├── expense_field.py
    │   ├── identity_document.py
    │   ├── identity_field.py
    │   ├── key_value.py
    │   ├── layout.py
    │   ├── lazy_document.py
    │   ├── line.py
    │   ├── linearizable.py
    │   ├── page.py
    │   ├── page_layout.py
    │   ├── query.py
    │   ├── query_result.py
    │   ├── selection_element.py
    │   ├── signature.py
    │   ├── table.py
    │   ├── table_cell.py
    │   ├── table_footer.py
    │   ├── table_title.py
    │   ├── value.py
    │   └── word.py
    ├── exceptions.py
    ├── parsers
    │   ├── __init__.py
    │   └── response_parser.py
    ├── textractor.py
    ├── utils
    │   ├── __init__.py
    │   ├── geometry_util.py
    │   ├── html_utils.py
    │   ├── legacy_utils.py
    │   ├── pdf_utils.py
    │   ├── results_utils.py
    │   ├── s3_utils.py
    │   ├── search_utils.py
    │   └── text_utils.py
    └── visualizers
    │   ├── __init__.py
    │   ├── arial.ttf
    │   └── entitylist.py
├── tpipelinegeofinder
    ├── LICENSE
    ├── Manifest.in
    ├── README.md
    ├── geofinder-sample-notebook.ipynb
    ├── setup.cfg
    ├── setup.py
    ├── tests
    │   ├── data
    │   │   ├── multi_page_example_file.json
    │   │   ├── multi_page_example_file.pdf
    │   │   ├── patient_intake_form_sample.jpg
    │   │   ├── patient_intake_form_sample.json
    │   │   ├── test_sample.json
    │   │   └── tquery_samples.json
    │   ├── test_ocrdb.py
    │   ├── test_tgeofinder.py
    │   └── test_tword.py
    └── textractgeofinder
    │   ├── __init__.py
    │   ├── _version.py
    │   ├── ocrdb.py
    │   ├── sample_patient_intake_form_parser.py
    │   ├── tgeofinder.py
    │   ├── tinterface.py
    │   └── tword.py
└── tpipelinepagedimensions
    ├── LICENSE
    ├── Manifest.in
    ├── README.md
    ├── setup.cfg
    ├── setup.py
    ├── tests
        ├── data
        │   └── Textract-orginal-2021-05-10.png
        └── test_pagedimensions.py
    └── textractpagedimensions
        ├── __init__.py
        ├── _version.py
        └── t_pagedimensions.py


/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | *Issue #, if available:*
2 | 
3 | *Description of changes:*
4 | 
5 | 
6 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.
7 | 


--------------------------------------------------------------------------------
/.github/workflows/documentation.yml:
--------------------------------------------------------------------------------
 1 | name: Documentation
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ 'master' ]
 6 |   pull_request:
 7 | 
 8 |   workflow_dispatch:
 9 | 
10 | # Compile the docs and deploy to GitHub pages
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |       # Checks out the repository
17 |       - uses: actions/checkout@v3
18 |         with:
19 |           ref: 'master'
20 | 
21 |       - name: Install pandoc
22 |         run: sudo apt-get install -y pandoc
23 | 
24 |       # Upgrade pip
25 |       - name: Upgrade pip
26 |         run: |
27 |           # install pip>=20.1 to use "pip cache dir"
28 |           python3 -m pip install --upgrade pip
29 | 
30 |       # Cache dependencies
31 |       - name: Get pip cache dir
32 |         id: pip-cache
33 |         run: echo "dir=$(pip cache dir)" >> "$GITHUB_OUTPUT"
34 | 
35 |       - name: Cache dependencies
36 |         uses: actions/cache@v4
37 |         with:
38 |           path: ${{ steps.pip-cache.outputs.dir }}
39 |           key: ${{ runner.os }}-pip-${{ hashFiles('**/extras/docs.txt') }}
40 |           restore-keys: |
41 |             ${{ runner.os }}-pip-
42 | 
43 |       # Install base dependencies
44 |       - name: Install dependencies
45 |         run: python3 -m pip install -r requirements.txt
46 | 
47 |       # Install sphinx
48 |       - name: Install dependencies
49 |         run: python3 -m pip install -r ./extras/docs.txt
50 | 
51 |       # Make docs
52 |       - name: Build docs
53 |         run: cd docs && make html
54 |         
55 |       # Deploy
56 |       - name: Deploy
57 |         uses: peaceiris/actions-gh-pages@v3
58 |         with:
59 |           github_token: ${{ secrets.GITHUB_TOKEN }}
60 |           publish_dir: ./docs/build/html/
61 | 


--------------------------------------------------------------------------------
/.github/workflows/release-caller.yml:
--------------------------------------------------------------------------------
 1 | name: Release Caller
 2 | 
 3 | on:
 4 |   workflow_run:
 5 |     workflows: ["Main release"]
 6 |     types: [completed]
 7 | 
 8 |   workflow_dispatch:
 9 | 
10 | # Package and upload the Python package
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |       # Checks out the repository
17 |       - uses: actions/checkout@v3
18 |         with:
19 |           ref: "master"
20 | 
21 |       # Upgrade pip
22 |       - name: Upgrade pip
23 |         run: |
24 |           # install pip>=20.1 to use "pip cache dir"
25 |           python3 -m pip install --upgrade pip
26 | 
27 |       # Cache dependencies
28 |       - name: Get pip cache dir
29 |         id: pip-cache
30 |         run: echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT
31 | 
32 |       - name: Cache dependencies
33 |         uses: actions/cache@v3
34 |         with:
35 |           path: ${{ steps.pip-cache.outputs.dir }}
36 |           key: ${{ runner.os }}-pip-${{ hashFiles('**/extras/docs.txt') }}
37 |           restore-keys: |
38 |             ${{ runner.os }}-pip-
39 | 
40 |       # Install twine
41 |       - name: Install dependencies
42 |         run: python3 -m pip install twine
43 | 
44 |       # Build the sdist and wheel
45 |       - name: Build sdist and wheels
46 |         run: |
47 |           cd caller
48 |           python3 setup.py bdist_wheel
49 |           python3 setup.py sdist
50 |           mv dist ..
51 | 
52 |       # Upload to PyPI
53 |       - name: Publish distribution to PyPI
54 |         uses: pypa/gh-action-pypi-publish@release/v1
55 |         with:
56 |           password: ${{ secrets.PYPI_API_KEY_CALLER }}
57 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Release
 2 | 
 3 | on:
 4 |   workflow_run:
 5 |     workflows: ["Main release"]
 6 |     types: [completed]
 7 | 
 8 |   workflow_dispatch:
 9 | 
10 | # Package and upload the Python package
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |       # Checks out the repository
17 |       - uses: actions/checkout@v3
18 |         with:
19 |           ref: "master"
20 | 
21 |       # Upgrade pip
22 |       - name: Upgrade pip
23 |         run: |
24 |           # install pip>=20.1 to use "pip cache dir"
25 |           python3 -m pip install --upgrade pip
26 | 
27 |       # Cache dependencies
28 |       - name: Get pip cache dir
29 |         id: pip-cache
30 |         run: echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT
31 | 
32 |       - name: Cache dependencies
33 |         uses: actions/cache@v3
34 |         with:
35 |           path: ${{ steps.pip-cache.outputs.dir }}
36 |           key: ${{ runner.os }}-pip-${{ hashFiles('**/extras/docs.txt') }}
37 |           restore-keys: |
38 |             ${{ runner.os }}-pip-
39 | 
40 |       # Install twine
41 |       - name: Install dependencies
42 |         run: python3 -m pip install twine
43 | 
44 |       # Build the sdist and wheel
45 |       - name: Build sdist and wheels
46 |         run: |
47 |           python3 setup.py bdist_wheel
48 |           python3 setup.py sdist
49 | 
50 |       # Upload to PyPI
51 |       - name: Publish distribution to PyPI
52 |         uses: pypa/gh-action-pypi-publish@release/v1
53 |         with:
54 |           password: ${{ secrets.PYPI_API_KEY_TEXTRACTOR }}
55 | 


--------------------------------------------------------------------------------
/.github/workflows/test-pr-caller.yml:
--------------------------------------------------------------------------------
 1 | # Controls when the action will run. Triggers the workflow on pull requests
 2 | # matching the `caller` path filter, or when dispatched manually.
 3 | name: Test-Pull-Request-Caller
 4 | on:
 5 |   pull_request:
 6 |     paths:
 7 |       - caller
 8 |   workflow_dispatch: {}
 9 | 
10 | # Run the tests
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-latest
14 |     strategy:
15 |       matrix:
16 |         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
17 |     defaults:
18 |       run:
19 |         working-directory: ./caller
20 |     permissions:
21 |         id-token: write
22 |         contents: write
23 | 
24 |     steps:
25 |       # Checks out the repository
26 |       - uses: actions/checkout@v3
27 |       - name: configure aws credentials
28 |         uses: aws-actions/configure-aws-credentials@v1-node16
29 |         with:
30 |           role-to-assume: arn:aws:iam::913165245630:role/GithubActionsOIDC-Role-1U7IPQFU9Q8RS
31 |           role-duration-seconds: 900 # the ttl of the session, in seconds.
32 |           aws-region: us-east-1 # use your region here.
33 |       - name: Set up Python ${{ matrix.python-version }}
34 |         uses: actions/setup-python@v4
35 |         with:
36 |           python-version: ${{ matrix.python-version }}
37 | 
38 |       # Install package locally
39 |       - name: Install package
40 |         run: python -m pip install -e .
41 | 
42 |       # Install dev dependencies
43 |       - name: Install dependencies
44 |         run: |
45 |           python -m pip install --upgrade pip
46 |           python -m pip install pytest
47 |       # Run tests
48 |       - name: Test
49 |         run: pytest
50 | 


--------------------------------------------------------------------------------
/.github/workflows/test-pr-geofinder.yml:
--------------------------------------------------------------------------------
 1 | # Controls when the action will run. Triggers the workflow on pull requests
 2 | # matching the `tpipelinegeofinder` path filter, or when dispatched manually.
 3 | name: Test-Pull-Request-Geofinder
 4 | on:
 5 |   pull_request:
 6 |     paths:
 7 |       - tpipelinegeofinder
 8 |   workflow_dispatch:
 9 | 
10 | # Run the tests
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-latest
14 |     strategy:
15 |       matrix:
16 |         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
17 |     defaults:
18 |       run:
19 |         working-directory: ./tpipelinegeofinder
20 | 
21 |     steps:
22 |       # Checks out the repository
23 |       - uses: actions/checkout@v3
24 |       - name: Set up Python ${{ matrix.python-version }}
25 |         uses: actions/setup-python@v4
26 |         with:
27 |           python-version: ${{ matrix.python-version }}
28 | 
29 |       # Install package locally
30 |       - name: Install package
31 |         run: python -m pip install -e .
32 | 
33 |       # Install dev dependencies
34 |       - name: Install dependencies
35 |         run: |
36 |           python -m pip install --upgrade pip
37 |           pip install pytest
38 |       # Run tests
39 |       - name: Test
40 |         run: pytest
41 | 


--------------------------------------------------------------------------------
/.github/workflows/test-pr-prettyprinter.yml:
--------------------------------------------------------------------------------
 1 | # Controls when the action will run. Triggers the workflow on pull requests
 2 | # matching the `prettyprinter` path filter, or when dispatched manually.
 3 | name: Test-Pull-Request-PrettyPrinter
 4 | on:
 5 |   pull_request:
 6 |     paths:
 7 |       - prettyprinter
 8 | 
 9 |   workflow_dispatch: {}
10 | 
11 | # Run the tests
12 | jobs:
13 |   build:
14 |     runs-on: ubuntu-latest
15 |     strategy:
16 |       matrix:
17 |         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
18 |     defaults:
19 |       run:
20 |         working-directory: ./prettyprinter
21 |     steps:
22 |       # Checks out the repository
23 |       - uses: actions/checkout@v3
24 |       - name: Set up Python ${{ matrix.python-version }}
25 |         uses: actions/setup-python@v4
26 |         with:
27 |           python-version: ${{ matrix.python-version }}
28 |           ref: ${{ github.event.pull_request.head.ref }}
29 |           repository: ${{ github.event.pull_request.head.repo.full_name }}
30 |       # Install package locally
31 |       - name: Install package
32 |         run: python -m pip install -e .
33 |       # Install dev dependencies
34 |       - name: Install dependencies
35 |         run: |
36 |           python -m pip install --upgrade pip
37 |           python -m pip install pytest
38 |       # Run tests
39 |       - name: Test
40 |         run: pytest
41 | 


--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | 
 3 | on:
 4 |   # TODO: Change the branch to master once merged.
 5 |   push:
 6 |     branches: [ 'master' ]
 7 |   pull_request:
 8 | 
 9 |   workflow_dispatch:
10 | 
11 | # Run the tests
12 | jobs:
13 |   build:
14 |     runs-on: ubuntu-latest
15 | 
16 |     steps:
17 |       # Checks out the repository
18 |       - uses: actions/checkout@v3
19 |         with:
20 |           ref: 'master'
21 | 
22 |       # Upgrade pip
23 |       - name: Upgrade pip
24 |         run: |
25 |           # install pip>=20.1 to use "pip cache dir"
26 |           python3 -m pip install --upgrade pip
27 | 
28 |       # Cache dependencies
29 |       - name: Get pip cache dir
30 |         id: pip-cache
31 |         run: echo "dir=$(pip cache dir)" >> "$GITHUB_OUTPUT"
32 | 
33 |       - name: Cache dependencies
34 |         uses: actions/cache@v4
35 |         with:
36 |           path: ${{ steps.pip-cache.outputs.dir }}
37 |           key: ${{ runner.os }}-pip-${{ hashFiles('**/extras/docs.txt') }}
38 |           restore-keys: |
39 |             ${{ runner.os }}-pip-
40 | 
41 |       # Install base dependencies
42 |       - name: Install dependencies
43 |         run: python3 -m pip install -r requirements.txt
44 | 
45 |       # Install dev dependencies
46 |       - name: Install dependencies
47 |         run: python3 -m pip install -r ./extras/dev.txt
48 | 
49 |       # Run tests
50 |       - name: Test
51 |         run: pytest tests/
52 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | .DS_Store
  2 | __pycache__
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | env/
 14 | build/
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib/
 21 | lib64/
 22 | parts/
 23 | sdist/
 24 | var/
 25 | wheels/
 26 | *.egg-info/
 27 | .installed.cfg
 28 | *.egg
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | .idea/*
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *.cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | 
 62 | # Flask stuff:
 63 | instance/
 64 | .webassets-cache
 65 | 
 66 | # Scrapy stuff:
 67 | .scrapy
 68 | 
 69 | # Sphinx documentation
 70 | docs/_build/
 71 | 
 72 | # PyBuilder
 73 | target/
 74 | 
 75 | # Jupyter Notebook
 76 | .ipynb_checkpoints
 77 | 
 78 | # pyenv
 79 | .python-version
 80 | 
 81 | # celery beat schedule file
 82 | celerybeat-schedule
 83 | 
 84 | # SageMath parsed files
 85 | *.sage.py
 86 | 
 87 | # dotenv
 88 | .env
 89 | 
 90 | # virtualenv
 91 | .venv
 92 | venv/
 93 | ENV/
 94 | 
 95 | # Spyder project settings
 96 | .spyderproject
 97 | .spyproject
 98 | 
 99 | # Rope project settings
100 | .ropeproject
101 | 
102 | # mkdocs documentation
103 | /site
104 | 
105 | # mypy
106 | .mypy_cache/
107 | 
108 | # ignore tests with fixed input-document locations on S3 until we add a Textract and S3 mock
109 | test_local*
110 | 
111 | share/python-wheels/
112 | *.egg-info/
113 | .installed.cfg
114 | *.egg
115 | MANIFEST
116 | .#
117 | 
118 | .dir-locals.el
119 | 
120 | .vscode
121 | 
122 | .envrc
123 | env/
124 | env2/
125 | env3/
126 | env4/
127 | env5/
128 | *.csv
129 | lambda_layer/*
130 | textractor.zip
131 | python/*
132 | 


--------------------------------------------------------------------------------
/.style.yapf:
--------------------------------------------------------------------------------
1 | [style]
2 | based_on_style = pep8
3 | spaces_before_comment = 4
4 | split_before_logical_operator = true
5 | column_limit: 120
6 | 


--------------------------------------------------------------------------------
/.yapfignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/.yapfignore


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | # This CITATION.cff file was generated with cffinit.
 2 | # Visit https://bit.ly/cffinit to generate yours today!
 3 | 
 4 | cff-version: 1.2.0
 5 | title: Amazon Textractor
 6 | message: >-
 7 |   If you use this software, please cite it using the
 8 |   metadata from this file.
 9 | type: software
10 | authors:
11 |   - given-names: Edouard
12 |     family-names: Belval
13 |     affiliation: AWS AI
14 |   - given-names: Thomas
15 |     family-names: Delteil
16 |     affiliation: AWS AI
17 |   - given-names: Martin
18 |     family-names: Schade
19 |     affiliation: AWS AI
20 |   - given-names: Srividhya
21 |     family-names: Radhakrishna
22 |     affiliation: AWS AI
23 | repository-code: 'https://github.com/aws-samples/amazon-textract-textractor'
24 | url: 'https://aws-samples.github.io/amazon-textract-textractor/'
25 | license: Apache-2.0


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing Guidelines
 2 | 
 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
 4 | documentation, we greatly value feedback and contributions from our community.
 5 | 
 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
 7 | information to effectively respond to your bug report or contribution.
 8 | 
 9 | 
10 | ## Reporting Bugs/Feature Requests
11 | 
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 | 
14 | When filing an issue, please check [existing open](https://github.com/aws-samples/amazon-textract-textractor/issues), or [recently closed](https://github.com/aws-samples/amazon-textract-textractor/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 | 
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 | 
22 | 
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 | 
26 | 1. You are working against the latest source on the *master* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 | 
30 | To send us a pull request, please:
31 | 
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 | 
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 | 
42 | 
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws-samples/amazon-textract-textractor/labels/help%20wanted) issues is a great place to start.
45 | 
46 | 
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 | 
52 | 
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 | 
56 | 
57 | ## Licensing
58 | 
59 | See the [LICENSE](https://github.com/aws-samples/amazon-textract-textractor/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 | 
61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
62 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | recursive-include extras *.txt
3 | recursive-include textractor *


--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | Amazon Textract Textractor
2 | Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
3 | 


--------------------------------------------------------------------------------
/caller/Manifest.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | 
3 | recursive-exclude * __pycache__
4 | recursive-exclude * *.py[co] test_*.py
5 | 
6 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif


--------------------------------------------------------------------------------
/caller/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 0.2.2
 3 | commit = False
 4 | tag = False
 5 | 
 6 | [bumpversion:file:setup.py]
 7 | search = version='{current_version}'
 8 | replace = version='{new_version}'
 9 | 
10 | [bumpversion:file:textractcaller/_version.py]
11 | search = __version__ = '{current_version}'
12 | replace = __version__ = '{new_version}'
13 | 
14 | [bdist_wheel]
15 | universal = 1
16 | 
17 | [flake8]
18 | exclude = docs
19 | 
20 | [aliases]
21 | 


--------------------------------------------------------------------------------
/caller/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from setuptools import setup, find_packages
 4 | 
 5 | 
 6 | def read(fname):
 7 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 8 | 
 9 | 
10 | requirements = ['boto3>=1.26.35', 'botocore', 'amazon-textract-response-parser>=0.1.39']
11 | 
12 | if sys.argv[-1] == 'publish-test':
13 |     os.system(f"cd {os.path.dirname(__file__)}")
14 |     os.system('rm -rf dist/ build/ amazon_textract_caller.egg-info/')
15 |     os.system('python setup.py sdist bdist_wheel')
16 |     os.system('twine check dist/*')
17 |     os.system('twine upload --repository pypitest dist/*')
18 |     sys.exit()
19 | 
20 | if sys.argv[-1] == 'publish':
21 |     os.system(f"cd {os.path.dirname(__file__)}")
22 |     os.system('rm -rf dist/ build/ amazon_textract_caller.egg-info/')
23 |     os.system('python setup.py sdist bdist_wheel')
24 |     os.system('twine check dist/*')
25 |     os.system('twine upload --repository pypi dist/*')
26 |     sys.exit()
27 | 
28 | setup(name='amazon-textract-caller',
29 |       packages=find_packages(exclude=['tests']),
30 |       include_package_data=True,
31 |       exclude_package_data={"": ["test_*.py", "__pycache__"]},
32 |       version='0.2.4',
33 |       description='Amazon Textract Caller tools',
34 |       install_requires=requirements,
35 |       extras_require={'testing': ['amazon-textract-response-parser', 'pytest']},
36 |       long_description_content_type='text/markdown',
37 |       long_description=read('README.md'),
38 |       author='Amazon Rekognition Textract Demoes',
39 |       author_email='rekognition-textract-demos@amazon.com',
40 |       url='https://github.com/aws-samples/amazon-textract-textractor/tree/master/caller',
41 |       keywords='amazon-textract-textractor amazon textract textractor helper caller',
42 |       license="Apache License Version 2.0",
43 |       classifiers=[
44 |           "Development Status :: 4 - Beta",
45 |           "Topic :: Utilities",
46 |           'License :: OSI Approved :: Apache Software License',
47 |           'Programming Language :: Python :: 3.8',
48 |           'Programming Language :: Python :: 3.9',
49 |           'Programming Language :: Python :: 3.10',
50 |           'Programming Language :: Python :: 3.11',
51 |           'Programming Language :: Python :: 3.12',
52 |       ],
53 |       python_requires='>=3.6')
54 | 


--------------------------------------------------------------------------------
/caller/tests/data/driverlicense.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/caller/tests/data/driverlicense.png


--------------------------------------------------------------------------------
/caller/tests/data/employmentapp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/caller/tests/data/employmentapp.png


--------------------------------------------------------------------------------
/caller/tests/data/employmentapp.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/caller/tests/data/employmentapp.tiff


--------------------------------------------------------------------------------
/caller/tests/data/multi_page_tiff.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/caller/tests/data/multi_page_tiff.tiff


--------------------------------------------------------------------------------
/caller/tests/data/verification-of-employment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/caller/tests/data/verification-of-employment.png


--------------------------------------------------------------------------------
/caller/textractcaller/__init__.py:
--------------------------------------------------------------------------------
1 | from ._version import __version__
2 | from .t_call import NotificationChannel, OutputConfig, DocumentLocation, Document, get_job_response, get_full_json_from_output_config, get_full_json, call_textract, Textract_Features, call_textract_analyzeid, DocumentPage, QueriesConfig, Query, AdaptersConfig, Adapter, call_textract_expense, Textract_Call_Mode, Textract_API, Textract_Types, call_textract_lending, get_full_json_lending, get_full_json_lending_from_output_config, get_s3_output_config_keys
3 | 
4 | import logging
5 | from logging import NullHandler
6 | 
7 | logging.getLogger(__name__).addHandler(NullHandler())
8 | 


--------------------------------------------------------------------------------
/caller/textractcaller/_version.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.2.2'
2 | 
3 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | 
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/source/commandline.rst:
--------------------------------------------------------------------------------
 1 | CLI
 2 | ===
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 | 
 7 | Textractor comes with its very own command line interface that aims to be easier to use than the default `boto3` interface by adding several quality of life improvements.
 8 | 
 9 | First, install the package using :code:`pip install amazon-textract-textractor`. Make sure that your Python bin directory is added to PATH, otherwise your shell will not find the :code:`textractor` executable. If you are not using a virtual environment this will probably be the case.
10 | 
11 | Available APIs
12 | ______________
13 | 
14 | :code:`Textractor` supports all Textract APIs and follows their official names as described here: https://docs.aws.amazon.com/textract/latest/dg/API_Operations.html. We use a single subcommand, named :code:`GetResult`, to fetch the results.
15 | 
16 | Synchronous APIs:
17 | 
18 | - DetectDocumentText/detect-document-text (Returns words and lines)
19 | - AnalyzeDocument/analyze-document (Returns Forms, Tables and Query results)
20 | - AnalyzeExpense/analyze-expense (Returns standardized fields for invoices)
21 | - AnalyzeID/analyze-id (Returns standardized fields for driver's license and passports)
22 | 
23 | Asynchronous APIs: 
24 | 
25 | - StartDocumentTextDetection/start-document-text-detection
26 | - StartDocumentAnalysis/start-document-analysis
27 | - StartExpenseAnalysis/start-expense-analysis
28 | 
29 | Getting document text
30 | _____________________
31 | 
32 | Now let's say you have a file and you wish to run OCR on it:
33 | 
34 | :code:`textractor detect-document-text your_file.png output.json`
35 | 
36 | This will call the Textract API and save the output to :code:`output.json`. You could use the Textractor python module to post-process that response afterwards.
37 | 
38 | Processing a directory of files
39 | _______________________________
40 | 
41 | Now suppose that, instead of a single file, you wish to process an entire directory of files. You could call the above on every file in the directory, but this would prove to be a very long process. Instead you can leverage Textract's ability to scale to your workload using the asynchronous API.
42 | 
43 | :code:`ls your_dir/ | xargs -I{} textractor start-document-text-detection {} --s3-upload-path s3://your-bucket/your-prefix/{}`
44 | 
45 | You can also parallelize it simply by adding -P8 (for 8 concurrent processes).
46 | 
47 | :code:`ls your_dir/ | xargs -P8 -I{} textractor start-document-text-detection {} --s3-upload-path s3://your-bucket/your-prefix/{} > output.txt`
48 | 
49 | You will notice that all you have in output.txt are UUID-like identifiers such as this one: :code:`628e39089ffa1b52d62d980ec1cf4f62cb7f785c83a708b2e17ebaaf21ad0d61`. Those are JobIDs and can be used to fetch the output of asynchronous operations.
50 | 
51 | Wait a few minutes (depending on the number of files you processed) and then fetch the result with :code:`GetResult`.
52 | 
53 | :code:`cat output.txt | xargs -I{} textractor get-result {} DETECT_TEXT {}.json`
54 | 
55 | Using :code:`-P8` would make the above faster, but be careful not to increase the concurrent process count too much as you might run into rate limiting issues (See https://docs.aws.amazon.com/textract/latest/dg/limits.html for more details).
56 | 
57 | Visualizing the output
58 | ______________________
59 | 
60 | The :code:`textractor` CLI allows you to overlay the output of Amazon Textract on top of an image for troubleshooting. It is only available for synchronous APIs (DetectDocumentText, AnalyzeDocument) and allows you to visualize words, lines, key and values, and tables.
61 | 
62 | In this example we will overlay words and tables on top of the :code:`tests/fixtures/amzn_q2.png` file. The image will be created in the same directory as the :code:`output.json` file under the name :code:`output.json.png`.
63 | 
64 | :code:`textractor analyze-document tests/fixtures/amzn_q2.png output.json --features TABLES --overlay WORDS TABLES` 
65 | 
66 | This will yield the following (click to enlarge):
67 | 
68 | .. image:: overlayer.png
69 |   :width: 600
70 |   :alt: Overlayer output
71 | 
72 | This document has a lot of small words, making it difficult to read. You can add :code:`--font-size-ratio` to the command to increase the font size.
73 | 
74 | :code:`textractor analyze-document tests/fixtures/amzn_q2.png output.json --features TABLES --overlay WORDS TABLES --font-size-ratio 1.0` (the default is 0.75)
75 | 
76 | .. image:: overlayer_bigger.png
77 |   :width: 600
78 |   :alt: Overlayer output bigger
79 | 
80 | Reference
81 | _________
82 | 
83 | .. argparse::
84 |    :ref: textractor.cli.cli._build_parser
85 |    :prog: textractor


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | 
16 | sys.path.insert(0, os.path.abspath("../../"))
17 | 
18 | 
19 | # -- Project information -----------------------------------------------------
20 | 
21 | project = "amazon-textract-textractor"
22 | copyright = "2022, Amazon"
23 | author = "Edouard Belval"
24 | 
25 | # The full version, including alpha/beta/rc tags
26 | release = "1.0.0"
27 | html_favicon = "favicon.ico"
28 | 
29 | # -- General configuration ---------------------------------------------------
30 | 
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = [
35 |     "sphinx.ext.autodoc",
36 |     "nbsphinx",
37 |     "sphinxarg.ext",
38 | ]
39 | 
40 | # Add any paths that contain templates here, relative to this directory.
41 | templates_path = ["_templates"]
42 | 
43 | # List of patterns, relative to source directory, that match files and
44 | # directories to ignore when looking for source files.
45 | # This pattern also affects html_static_path and html_extra_path.
46 | exclude_patterns = []
47 | 
48 | 
49 | # -- Options for HTML output -------------------------------------------------
50 | 
51 | # The theme to use for HTML and HTML Help pages.  See the documentation for
52 | # a list of builtin themes.
53 | #
54 | html_theme = "sphinx_rtd_theme"
55 | 
56 | # Add any paths that contain custom static files (such as style sheets) here,
57 | # relative to this directory. They are copied after the builtin static files,
58 | # so a file named "default.css" will overwrite the builtin "default.css".
59 | html_static_path = ["_static"]
60 | 


--------------------------------------------------------------------------------
/docs/source/examples.rst:
--------------------------------------------------------------------------------
 1 | Examples
 2 | ========
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 2
 6 | 
 7 |    notebooks/simple_ocr
 8 |    notebooks/parsing_an_existing_response
 9 |    notebooks/introduction_to_searching
10 |    notebooks/visualizing_results
11 |    notebooks/finding_words_within_a_document
12 |    notebooks/exporting_form_data
13 |    notebooks/table_data_to_various_formats
14 |    notebooks/using_analyze_expense
15 |    notebooks/using_analyze_id
16 |    notebooks/using_queries
17 |    notebooks/layout_analysis
18 |    notebooks/tabular_data_linearization
19 |    notebooks/tabular_data_linearization_continued
20 |    notebooks/layout_analysis_for_text_linearization
21 |    notebooks/document_linearization_to_markdown_or_html
22 |    notebooks/textractor_for_large_language_models
23 |    notebooks/interfacing_with_trp2
24 |    notebooks/signature_detection
25 |    notebooks/going_further
26 | 


--------------------------------------------------------------------------------
/docs/source/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/favicon.ico


--------------------------------------------------------------------------------
/docs/source/images/lambda_tutorial/1b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/images/lambda_tutorial/1b.png


--------------------------------------------------------------------------------
/docs/source/images/lambda_tutorial/1c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/images/lambda_tutorial/1c.png


--------------------------------------------------------------------------------
/docs/source/images/lambda_tutorial/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/images/lambda_tutorial/2.png


--------------------------------------------------------------------------------
/docs/source/images/lambda_tutorial/2a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/images/lambda_tutorial/2a.png


--------------------------------------------------------------------------------
/docs/source/images/lambda_tutorial/2b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/images/lambda_tutorial/2b.png


--------------------------------------------------------------------------------
/docs/source/images/lambda_tutorial/3a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/images/lambda_tutorial/3a.png


--------------------------------------------------------------------------------
/docs/source/images/lambda_tutorial/3c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/images/lambda_tutorial/3c.png


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | Textractor Documentation
 2 | ========================
 3 | 
 4 | .. image:: textractor_cropped.png
 5 |   :alt: Textractor
 6 | 
 7 | **Textractor** is a python package created to seamlessly work with 4 popular `Amazon Textract <https://docs.aws.amazon.com/textract/latest/dg/what-is.html>`_
 8 | APIs. These are the DetectDocumentText, StartDocumentTextDetection, AnalyzeDocument and StartDocumentAnalysis endpoints. The package contains utilities to call Textract services, 
 9 | convert JSON responses from API calls to programmable objects, visualize entities on the document and export document data in compatible formats. 
10 | It is intended to aid Textract customers in setting up their post-processing pipelines.
11 | 
12 | Previous work in this space has been made available in the following packages:
13 | 
14 | 1. `amazon-textract-caller <https://pypi.org/project/amazon-textract-caller/>`_ (to call textract without the explicit use of boto3)
15 | 
16 | 2. `amazon-textract-response-parser <https://pypi.org/project/amazon-textract-response-parser/>`_ (to parse the JSON response returned by Textract APIs)
17 | 
18 | 3. `amazon-textract-overlayer <https://pypi.org/project/amazon-textract-overlayer/>`_ (to draw bounding boxes around the document entities on the document image)
19 | 
20 | 4. `amazon-textract-prettyprinter <https://pypi.org/project/amazon-textract-prettyprinter/>`_ (to string represent document entities)
21 | 
22 | 5. `amazon-textract-directional_finder <https://pypi.org/project/amazon-textract-directional_finder/>`_ (to perform geometric search on the document)
23 | 
24 | 
25 | The `amazon-textract-caller <https://pypi.org/project/amazon-textract-caller/>`_ has been used as a dependency within this package 
26 | with a wrapper around it to reduce the number of parameters the customer needs to pass. Additionally, newer input formats for the 
27 | document have been provisioned with this package. 
28 | 
29 | The remaining packages have been refactored within this new package but the prominent functionalities are all made available to not disrupt
30 | the requirements of the customer. 
31 | 
32 | This package also hosts newer features that haven't previously been implemented in existing packages. These include:
33 | 
34 | a. Semantic Document Search 
35 | 
36 | b. Query for key-values using keys
37 | 
38 | c. Table access with numpy indexing
39 | 
40 | d. New export formats with excel, csv and txt
41 | 
42 | e. Indication of duplicated document entities
43 | 
44 | f. Availability of all the above at :class:`Document` and :class:`Page` level.
45 | 
46 | 
47 | .. toctree::
48 |    :maxdepth: 4
49 | 
50 | Usage
51 | =====
52 | .. toctree::
53 |    :maxdepth: 2
54 | 
55 |    installation
56 |    using_in_lambda
57 |    examples
58 |    commandline
59 | 
60 | API Reference
61 | =============
62 | 
63 | .. toctree::
64 |    :maxdepth: 4
65 | 
66 |    textractor
67 |    textractor.parsers
68 |    textractor.entities
69 |    textractor.visualizers
70 |    textractor.data.constants
71 |    textractor.data.text_linearization_config
72 | 
73 | 


--------------------------------------------------------------------------------
/docs/source/installation.rst:
--------------------------------------------------------------------------------
 1 | Installation
 2 | ============
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 3
 6 | 
 7 | Official package
 8 | ____________________________________
 9 | 
10 | Textractor is available on PyPI and can be installed with :code:`pip install amazon-textract-textractor`. By default this will install the minimal version of textractor. The following extras can be used to add features:
11 | 
12 | - :code:`pdfium` (:code:`pip install amazon-textract-textractor[pdfium]`) includes :code:`pypdfium2` and is the recommended way to enable PDF rasterization in Textractor. Note that this is **not** necessary to call Textract with a PDF file.
13 | - :code:`pdf` (:code:`pip install amazon-textract-textractor[pdf]`) includes :code:`pdf2image` and is an additional way to enable PDF rasterization in Textractor. Note that this is **not** necessary to call Textract with a PDF file.
14 | - :code:`torch` (:code:`pip install amazon-textract-textractor[torch]`) includes :code:`sentence_transformers` for better word search and matching. This will work on CPU but be noticeably slower than non-machine learning based approaches.
15 | - :code:`dev` (:code:`pip install amazon-textract-textractor[dev]`) includes all the dependencies above and everything else needed to test the code.
16 | 
17 | You can pick several extras by separating the labels with commas like this :code:`pip install amazon-textract-textractor[pdf,torch]`.
18 | 
19 | From Source
20 | ___________
21 | 
22 | To install the package, clone the repository with the following command -
23 | 
24 | :code:`git clone git@github.com:aws-samples/amazon-textract-textractor.git`
25 | 
26 | Navigate into the amazon-textract-textractor directory on the terminal and run these commands.
27 | 
28 | To install requirements :code:`pip install -r requirements.txt`
29 | 
30 | Then install the package with :code:`pip install -e .`
31 | 
32 | Try it out
33 | ___________
34 | 
35 | The :file:`Demo.ipynb` can be used as a reference to understand some functionalities hosted by the package.
36 | Additionally, :file:`docs/source/notebooks/` has some tutorials you can try out.


--------------------------------------------------------------------------------
/docs/source/notebooks/imgs/excel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/notebooks/imgs/excel.png


--------------------------------------------------------------------------------
/docs/source/notebooks/interfacing_with_trp2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "f3801162",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Interfacing with trp2\n",
  9 |     "\n",
 10 |     "The Textract response parser was the preferred way of handling Textract API output before the release of Textractor. If your current workflow uses the older library, you can easily reuse their functions through the compatibility API.\n",
 11 |     "\n",
 12 |     "## Installation\n",
 13 |     "\n",
 14 |     "To begin, install the `amazon-textract-textractor` package using pip.\n",
 15 |     "\n",
 16 |     "`pip install amazon-textract-textractor`\n",
 17 |     "\n",
 18 |     "There are various sets of dependencies available to tailor your installation to your use case. The base package will have sensible defaults, but you may want to install the PDF extra dependencies if your workflow uses PDFs with `pip install amazon-textract-textractor[pdfium]`. You can read more on extra dependencies [in the documentation](https://aws-samples.github.io/amazon-textract-textractor/installation.html).\n",
 19 |     "\n",
 20 |     "## Calling Textract"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 1,
 26 |    "id": "47ea794e",
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "from textractor import Textractor\n",
 31 |     "\n",
 32 |     "extractor = Textractor(profile_name=\"default\")\n",
 33 |     "# This path assumes that you are running the notebook from docs/source/notebooks\n",
 34 |     "document = extractor.detect_document_text(\"../../../tests/fixtures/form.png\")"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 2,
 40 |    "id": "7231472c",
 41 |    "metadata": {},
 42 |    "outputs": [
 43 |     {
 44 |      "data": {
 45 |       "text/plain": [
 46 |        "This document holds the following data:\n",
 47 |        "Pages - 1\n",
 48 |        "Words - 259\n",
 49 |        "Lines - 74\n",
 50 |        "Key-values - 0\n",
 51 |        "Checkboxes - 0\n",
 52 |        "Tables - 0\n",
 53 |        "Identity Documents - 0"
 54 |       ]
 55 |      },
 56 |      "execution_count": 2,
 57 |      "metadata": {},
 58 |      "output_type": "execute_result"
 59 |     }
 60 |    ],
 61 |    "source": [
 62 |     "document"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "markdown",
 67 |    "id": "14b4052c",
 68 |    "metadata": {},
 69 |    "source": [
 70 |     "## Getting the trp2 document\n",
 71 |     "\n",
 72 |     "All `Document` objects have a convenience function `to_trp2()` that is a shorthand for `TDocumentSchema().load(document.response)` and creates a matching trp2 document. Note that this behaves as a converter, not as a proxy, so any changes made to the `TDocument` will not be propagated to the `Document` object."
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 4,
 78 |    "id": "a9b36794",
 79 |    "metadata": {},
 80 |    "outputs": [],
 81 |    "source": [
 82 |     "trp2_document = document.to_trp2()"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "id": "57e69a22",
 88 |    "metadata": {},
 89 |    "source": [
 90 |     "## Conclusion\n",
 91 |     "\n",
 92 |     "Textractor comes with everything you need to reuse components from your current workflow with the newer caller, pretty printer, or directional finder."
 93 |    ]
 94 |   }
 95 |  ],
 96 |  "metadata": {
 97 |   "kernelspec": {
 98 |    "display_name": "Python 3 (ipykernel)",
 99 |    "language": "python",
100 |    "name": "python3"
101 |   },
102 |   "language_info": {
103 |    "codemirror_mode": {
104 |     "name": "ipython",
105 |     "version": 3
106 |    },
107 |    "file_extension": ".py",
108 |    "mimetype": "text/x-python",
109 |    "name": "python",
110 |    "nbconvert_exporter": "python",
111 |    "pygments_lexer": "ipython3",
112 |    "version": "3.10.6"
113 |   }
114 |  },
115 |  "nbformat": 4,
116 |  "nbformat_minor": 5
117 | }
118 | 


--------------------------------------------------------------------------------
/docs/source/overlayer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/overlayer.png


--------------------------------------------------------------------------------
/docs/source/overlayer_bigger.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/overlayer_bigger.png


--------------------------------------------------------------------------------
/docs/source/textractor.data.constants.rst:
--------------------------------------------------------------------------------
1 | Constants
2 | =======================
3 |  
4 | 
5 | .. automodule:: textractor.data.constants
6 |    :members:
7 |    :undoc-members:
8 |    :show-inheritance:
9 | 


--------------------------------------------------------------------------------
/docs/source/textractor.data.text_linearization_config.rst:
--------------------------------------------------------------------------------
1 | TextLinearizationConfig
2 | =======================
3 | 
4 | .. automodule:: textractor.data.text_linearization_config
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/textractor.parsers.rst:
--------------------------------------------------------------------------------
 1 | Entity Parser
 2 | ==========================
 3 | 
 4 | The library is intended to support multiple formats for parsing with a unified underlying object representation. For the Textract customer, 
 5 | the response_parser function has been created to handle API response parsing for `DetectDocumentText <https://docs.aws.amazon.com/textract/latest/dg/API_DetectDocumentText.html>`_, 
 6 | `AnalyzeDocument <https://docs.aws.amazon.com/textract/latest/dg/API_AnalyzeDocument.html>`_, `StartDocumentTextDetection <https://docs.aws.amazon.com/textract/latest/dg/API_StartDocumentTextDetection.html>`_ and 
 7 | `StartDocumentAnalysis <https://docs.aws.amazon.com/textract/latest/dg/API_StartDocumentAnalysis.html>`_.
 8 | 
 9 | response_parser
10 | ---------------
11 | 
12 | .. automodule:: textractor.parsers.response_parser
13 |    :members:
14 |    :undoc-members:
15 |    :show-inheritance:
16 | 


--------------------------------------------------------------------------------
/docs/source/textractor.rst:
--------------------------------------------------------------------------------
1 | Textract Caller
2 | ===============
3 | 
4 | .. automodule:: textractor.textractor
5 |    :members:
6 |    :undoc-members:
7 |    :show-inheritance:
8 | 


--------------------------------------------------------------------------------
/docs/source/textractor.visualizers.rst:
--------------------------------------------------------------------------------
 1 | Entity Visualization
 2 | =====================
 3 | 
 4 | Most features that return :class:`DocumentEntity` objects are of :class:`EntityList` type. It is an extension of the :code:`list` data type
 5 | with the intention of providing visualization features to these entities. 
 6 | 
 7 | EntityList 
 8 | ----------
 9 | 
10 | .. automodule:: textractor.visualizers.entitylist
11 |    :members:
12 |    :undoc-members:
13 |    :show-inheritance:
14 | 
15 | 


--------------------------------------------------------------------------------
/docs/source/textractor_cropped.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/docs/source/textractor_cropped.png


--------------------------------------------------------------------------------
/docs/source/using_in_lambda.rst:
--------------------------------------------------------------------------------
 1 | Using Textractor in AWS Lambda
 2 | ==============================
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 3
 6 | 
 7 | Textractor uses Pillow for image manipulation which is a compiled dependency (i.e. not pure Python).
 8 | While we encourage you to build your own lambda layers, we received several requests mentioning that the process is tedious,
 9 | which is why we also offer precompiled layers as zip files that you can directly upload to lambda.
10 | 
11 | The precompiled layers are rebuilt on release and can be downloaded here https://github.com/aws-samples/amazon-textract-textractor/actions/workflows/lambda_layers.yml.
12 | 
13 | Step-by-step
14 | ------------
15 | 
16 | We provide a step-by-step walkthrough using the AWS Console, but note that proceeding with the AWS CLI would also work. For brevity we assume that you already have an existing lambda.
17 | You can find an excellent guide on how to create a lambda function here: 
18 | https://docs.aws.amazon.com/lambda/latest/dg/getting-started.html. Note that your lambda function will need 
19 | to have Textract access. Since we are targeting a wide range of use cases we will use the AmazonTextractFullAccess 
20 | policy. We recommend that you review your lambda function and tailor the permission to your specific use case.
21 | 
22 | 1. Download the precompiled layers from the GitHub Actions workflow. https://github.com/aws-samples/amazon-textract-textractor/actions/workflows/lambda_layers.yml
23 | 
24 |    a. Navigate to the page
25 | 
26 |    b. Click on "Lambda Layers"
27 | 
28 |    .. image:: images/lambda_tutorial/1b.png
29 | 
30 |    c. Scroll to the bottom of the page and download the package that matches your Python installation. Packages with the `-pdfium` suffix contain `pypdfium2` and allow you to process PDF documents. Packages with the `-pdf` suffix contain `pdf2image` and also allow you to process PDF documents, however we recommend using `pypdfium2` as it does not require any OS-level dependencies.
31 | 
32 |    .. image:: images/lambda_tutorial/1c.png
33 | 
34 | 2. In your AWS Console, navigate to "Lambda" and click "Layers" in the sidebar to the left.
35 | 
36 |    .. image:: images/lambda_tutorial/2.png
37 | 
38 |    a. Click "Create layer"
39 | 
40 |    .. image:: images/lambda_tutorial/2a.png
41 | 
42 |    b. Fill-in the form and upload the .zip file you downloaded in step 1.
43 | 
44 |    .. image:: images/lambda_tutorial/2b.png
45 | 
46 |    c. Click "Create"
47 | 
48 | 3. Navigate to your lambda
49 | 
50 |    a. Scroll down and click "Add a layer"
51 | 
52 |    .. image:: images/lambda_tutorial/3a.png
53 | 
54 |    b. Choose "Custom layers" and pick your amazon-textract-textractor layer
55 | 
56 |    c. Click "Add"
57 | 
58 |    .. image:: images/lambda_tutorial/3c.png
59 | 
60 | 4. Update your code to use Textractor
61 | 
62 |    a. If using the `pdf2image` PDF version you have to update the `PATH` and `LD_LIBRARY_PATH` environment variables through the lambda function configuration interface or directly in code with the `os` module: 
63 | 
64 |    .. code-block:: python
65 | 
66 |       os.environ["LD_LIBRARY_PATH"] = f"/opt/python/bin/:{os.environ['LD_LIBRARY_PATH']}"
67 |       os.environ["PATH"] = f"/opt/python/bin/:{os.environ['PATH']}"


--------------------------------------------------------------------------------
/extras/dev.txt:
--------------------------------------------------------------------------------
1 | jupyterlab
2 | pandas
3 | pdf2image>=1.16,<1.17
4 | pytest
5 | lxml
6 | sentence-transformers>=2.2,<2.3
7 | sphinx-rtd-theme>=1.0,<1.1


--------------------------------------------------------------------------------
/extras/docs.txt:
--------------------------------------------------------------------------------
 1 | jupyterlab
 2 | pandas
 3 | pdf2image>=1.16,<1.17
 4 | pytest
 5 | Sphinx>=5.1,<5.2
 6 | nbsphinx>=0.8,<0.9
 7 | sphinx-rtd-theme>=2.0,<3.0
 8 | sphinx-argparse>=0.5.1
 9 | sphinxcontrib-applehelp>=1.0,<1.1
10 | sphinxcontrib-devhelp>=1.0,<1.1
11 | sphinxcontrib-htmlhelp>=2.0,<2.1
12 | sphinxcontrib-jsmath>=1.0,<1.1
13 | sphinxcontrib-qthelp>=1.0,<1.1
14 | sphinxcontrib-serializinghtml>=1.1,<1.2
15 | 


--------------------------------------------------------------------------------
/extras/pandas.txt:
--------------------------------------------------------------------------------
1 | pandas


--------------------------------------------------------------------------------
/extras/pdf.txt:
--------------------------------------------------------------------------------
1 | pdf2image>=1.16,<1.17


--------------------------------------------------------------------------------
/extras/pdfium.txt:
--------------------------------------------------------------------------------
1 | pypdfium2


--------------------------------------------------------------------------------
/extras/torch.txt:
--------------------------------------------------------------------------------
1 | sentence-transformers>=2.2,<2.3


--------------------------------------------------------------------------------
/helper/Manifest.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | 
3 | recursive-exclude * __pycache__
4 | recursive-exclude * *.py[co] test_*.py
5 | recursive-include textracthelper/examples *
6 | 
7 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif


--------------------------------------------------------------------------------
/helper/docs/employmentapp_boxed_FORM_CELL_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/helper/docs/employmentapp_boxed_FORM_CELL_.png


--------------------------------------------------------------------------------
/helper/docs/employmentapp_boxed_LINE_TEXT_OVERLAY.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/helper/docs/employmentapp_boxed_LINE_TEXT_OVERLAY.png


--------------------------------------------------------------------------------
/helper/docs/employmentapp_boxed_WORD_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/helper/docs/employmentapp_boxed_WORD_.png


--------------------------------------------------------------------------------
/helper/docs/employmentapp_boxed_WORD_TEXT_OVERLAY.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/helper/docs/employmentapp_boxed_WORD_TEXT_OVERLAY.png


--------------------------------------------------------------------------------
/helper/fonts/Roboto-Regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/helper/fonts/Roboto-Regular.ttf


--------------------------------------------------------------------------------
/helper/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 0.0.35
 3 | commit = True
 4 | tag = True
 5 | 
 6 | [bumpversion:file:setup.py]
 7 | search = version='{current_version}'
 8 | replace = version='{new_version}'
 9 | 
10 | [bumpversion:file:textracthelper/_version.py]
11 | search = __version__ = '{current_version}'
12 | replace = __version__ = '{new_version}'
13 | 
14 | [bdist_wheel]
15 | universal = 1
16 | 
17 | [flake8]
18 | exclude = docs
19 | 
20 | [aliases]
21 | 


--------------------------------------------------------------------------------
/helper/setup.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | from setuptools import setup, find_packages
  4 | from setuptools.command.install import install
  5 | 
  6 | 
  7 | def read(fname):
  8 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
  9 | 
 10 | 
 11 | class FontInstaller(install):
 12 | 
 13 |     def run(self):
 14 |         self._copy_fonts()
 15 |         install.run(self)
 16 | 
 17 |     def _copy_fonts(self):
 18 |         try:
 19 |             import shutil
 20 | 
 21 |             if sys.platform == "win32":
 22 |                 # check the windows font repository
 23 |                 # NOTE: must use uppercase WINDIR, to work around bugs in
 24 |                 # 1.5.2's os.environ.get()
 25 |                 windir = os.environ.get("WINDIR")
 26 |                 if windir:
 27 |                     tgt_dir = os.path.join(windir, "fonts")
 28 |             elif sys.platform in ("linux", "linux2"):
 29 |                 lindirs = os.environ.get("XDG_DATA_DIRS", "")
 30 |                 if not lindirs:
 31 |                     # According to the freedesktop spec, XDG_DATA_DIRS should
 32 |                     # default to /usr/share
 33 |                     tgt_dir = "/usr/share/fonts"
 34 |                 else:
 35 |                     lindir = lindirs.split(":")[0]
 36 |                     tgt_dir = os.path.join(lindir, "fonts")
 37 |             elif sys.platform == "darwin":
 38 |                 tgt_dir = os.path.expanduser("~/Library/Fonts")
 39 | 
 40 |             if not os.path.isdir(tgt_dir):
 41 |                 print('WARNING: Could not locate fonts directory. Default font will be used')
 42 |             else:
 43 |                 _src_dir = 'fonts/'
 44 |                 _font_file = 'Roboto-Regular.ttf'
 45 | 
 46 |                 if _font_file not in os.listdir(tgt_dir):
 47 |                     shutil.copyfile(os.path.join(_src_dir, _font_file), os.path.join(tgt_dir, _font_file))
 48 | 
 49 |         except:
 50 |             print('WARNING: An issue occurred while installing the custom fonts. Default font will be used')
 51 | 
 52 | 
 53 | requirements = [
 54 |     'boto3', 'botocore', 'amazon-textract-response-parser>=0.1.40', 'amazon-textract-caller>=0.0.27',
 55 |     'amazon-textract-overlayer>=0.0.10', 'amazon-textract-prettyprinter>=0.1.0', 'Pillow', 'pypdf>=3.1,<4.0'
 56 | ]
 57 | 
 58 | if sys.argv[-1] == 'publish-test':
 59 |     os.system(f"cd {os.path.dirname(__file__)}")
 60 |     os.system('rm -rf dist/ build/ amazon_textract_helper.egg-info/')
 61 |     os.system('python setup.py sdist bdist_wheel')
 62 |     os.system('twine check dist/*')
 63 |     os.system('twine upload --repository pypitest dist/*')
 64 |     sys.exit()
 65 | 
 66 | if sys.argv[-1] == 'publish':
 67 |     os.system(f"cd {os.path.dirname(__file__)}")
 68 |     os.system('rm -rf dist/ build/ amazon_textract_helper.egg-info/')
 69 |     os.system('python setup.py sdist bdist_wheel')
 70 |     os.system('twine check dist/*')
 71 |     os.system('twine upload --repository pypi dist/*')
 72 |     sys.exit()
 73 | 
 74 | setup(name='amazon-textract-helper',
 75 |       packages=find_packages(exclude=['tests']),
 76 |       include_package_data=True,
 77 |       exclude_package_data={"": ["test_*.py", "__pycache__"]},
 78 |       version='0.0.35',
 79 |       description='Amazon Textract Helper tools',
 80 |       install_requires=requirements,
 81 |       scripts=['bin/amazon-textract'],
 82 |       long_description_content_type='text/markdown',
 83 |       long_description=read('README.md'),
 84 |       author='Amazon Rekognition Textract Demoes',
 85 |       author_email='rekognition-textract-demos@amazon.com',
 86 |       url='https://github.com/aws-samples/amazon-textract-textractor/tree/master/helper',
 87 |       keywords='amazon-textract-textractor amazon textract textractor helper',
 88 |       license="Apache License Version 2.0",
 89 |       classifiers=[
 90 |           "Development Status :: 4 - Beta",
 91 |           "Topic :: Utilities",
 92 |           'License :: OSI Approved :: Apache Software License',
 93 |           'Programming Language :: Python :: 3.6',
 94 |           'Programming Language :: Python :: 3.7',
 95 |           'Programming Language :: Python :: 3.8',
 96 |           'Programming Language :: Python :: 3.9',
 97 |           'Programming Language :: Python :: 3.10',
 98 |       ],
 99 |       cmdclass={'install': FontInstaller},
100 |       python_requires='>=3.6')
101 | 


--------------------------------------------------------------------------------
/helper/textracthelper/.gitignore:
--------------------------------------------------------------------------------
1 | test*


--------------------------------------------------------------------------------
/helper/textracthelper/__init__.py:
--------------------------------------------------------------------------------
1 | from ._version import __version__
2 | 
3 | import logging
4 | from logging import NullHandler
5 | 
# Register a do-nothing handler on this package's logger.
logging.getLogger(__name__).addHandler(logging.NullHandler())
7 | 


--------------------------------------------------------------------------------
/helper/textracthelper/_version.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.0.35'
2 | 


--------------------------------------------------------------------------------
/helper/textracthelper/examples/employmentapp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/helper/textracthelper/examples/employmentapp.png


--------------------------------------------------------------------------------
/idp_cdk_manifest/.gitignore:
--------------------------------------------------------------------------------
  1 | .DS_Store
  2 | __pycache__
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | env/
 14 | build/
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib/
 21 | lib64/
 22 | parts/
 23 | sdist/
 24 | var/
 25 | wheels/
 26 | *.egg-info/
 27 | .installed.cfg
 28 | *.egg
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | .idea/*
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *.cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | 
 62 | # Flask stuff:
 63 | instance/
 64 | .webassets-cache
 65 | 
 66 | # Scrapy stuff:
 67 | .scrapy
 68 | 
 69 | # Sphinx documentation
 70 | docs/_build/
 71 | 
 72 | # PyBuilder
 73 | target/
 74 | 
 75 | # Jupyter Notebook
 76 | .ipynb_checkpoints
 77 | 
 78 | # pyenv
 79 | .python-version
 80 | 
 81 | # celery beat schedule file
 82 | celerybeat-schedule
 83 | 
 84 | # SageMath parsed files
 85 | *.sage.py
 86 | 
 87 | # dotenv
 88 | .env
 89 | 
 90 | # virtualenv
 91 | .venv
 92 | venv/
 93 | ENV/
 94 | 
 95 | # Spyder project settings
 96 | .spyderproject
 97 | .spyproject
 98 | 
 99 | # Rope project settings
100 | .ropeproject
101 | 
102 | # mkdocs documentation
103 | /site
104 | 
105 | # mypy
106 | .mypy_cache/
107 | 
108 | # remove test with fixed input-document locations on S3 till we add a Textract and S3 mock
109 | test_local*
110 | 
111 | share/python-wheels/
112 | *.egg-info/
113 | .installed.cfg
114 | *.egg
115 | MANIFEST
116 | .#
117 | 
118 | .dir-locals.el
119 | 
120 | .vscode
121 | 
122 | .envrc
123 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/Manifest.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | 
3 | recursive-exclude * __pycache__
4 | recursive-exclude * *.py[co] test_*.py
5 | 
6 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif


--------------------------------------------------------------------------------
/idp_cdk_manifest/README.md:
--------------------------------------------------------------------------------
1 | just bla for now
2 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/output.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/idp_cdk_manifest/output.tar.gz


--------------------------------------------------------------------------------
/idp_cdk_manifest/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 0.0.1
 3 | commit = False
 4 | tag = False
 5 | 
 6 | [bumpversion:file:setup.py]
 7 | search = version='{current_version}'
 8 | replace = version='{new_version}'
 9 | 
10 | [bumpversion:file:textractmanifest/__init__.py]
11 | search = __version__ = '{current_version}'
12 | replace = __version__ = '{new_version}'
13 | 
14 | [bdist_wheel]
15 | universal = 1
16 | 
17 | [flake8]
18 | exclude = docs
19 | 
20 | [aliases]
21 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from setuptools import setup, find_packages
 4 | 
 5 | 
 6 | def read(fname):
 7 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 8 | 
 9 | 
# Runtime dependencies installed alongside the package.
requirements = [
    'marshmallow',
]
11 | 
def _publish(repository):
    """Build the sdist and wheel, validate them with twine and upload them.

    Args:
        repository: Repository name passed to ``twine upload --repository``.

    Always terminates the interpreter: with a success status when every step
    worked, or with an error message as soon as one step fails so that a
    broken build is never uploaded.
    """
    # os.system runs every command in its own shell, so a ``cd`` issued
    # through it would not persist; change directory in-process instead.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    steps = (
        # The egg-info directory is derived from the distribution name
        # ('amazon-textract-idp-cdk-manifest'); the historical
        # 'idp-cdk-manifest.egg-info/' entry is kept so old leftovers are
        # still removed.
        'rm -rf dist/ build/ idp-cdk-manifest.egg-info/ amazon_textract_idp_cdk_manifest.egg-info/',
        'python setup.py sdist bdist_wheel',
        'twine check dist/*',
        f'twine upload --repository {repository} dist/*',
    )
    for command in steps:
        if os.system(command) != 0:
            sys.exit(f'publish aborted: `{command}` failed')
    sys.exit()


if sys.argv[-1] == 'publish-test':
    _publish('pypitest')

if sys.argv[-1] == 'publish':
    _publish('pypi')
27 | 
# Distribution metadata and build configuration for the IDP CDK manifest
# package. The version line keeps the exact ``version='X.Y.Z'`` form that
# the bumpversion search pattern in setup.cfg looks for.
setup(
    name='amazon-textract-idp-cdk-manifest',
    version='0.0.2',
    description='Amazon Textract IDP CDK Manifest',
    long_description=read('README.md'),
    long_description_content_type='text/markdown',
    author='Amazon Rekognition Textract Demoes',
    author_email='rekognition-textract-demos@amazon.com',
    url='https://github.com/aws-samples/amazon-textract-textractor/tree/master/idp_cdk_manifest',
    keywords='textract manifest',
    license="Apache License Version 2.0",
    classifiers=[
        "Development Status :: 4 - Beta",
        "Topic :: Utilities",
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
        'Programming Language :: Python :: 3.12',
    ],
    python_requires='>=3.7',
    # Packaging: which files are shipped and what gets installed with them.
    packages=find_packages(exclude=['tests']),
    include_package_data=True,
    exclude_package_data={"": ["test_*.py", "__pycache__"]},
    install_requires=requirements,
    extras_require={'testing': ['pytest']},
)
55 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/tests/data/analyze_id.json:
--------------------------------------------------------------------------------
1 | {
2 |     "documentPages":[
3 |         "s3://amazon-textract-public-content/blogs/employeeapp20210510.png",
4 |         "s3://amazon-textract-public-content/blogs/employeeapp20210510.png"
5 |     ],
6 |     "classification": "ID_DOCUMENT"
7 | }
8 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/tests/data/manifest_all_features.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png",
 3 |     "textractFeatures": [
 4 |         "FORMS",
 5 |         "TABLES",
 6 |         "QUERIES",
 7 |         "SIGNATURE"
 8 |     ],
 9 |     "queriesConfig": [{
10 |         "text": "What is the applicant full name?",
11 |         "alias": "FULL_NAME",
12 |         "pages": ["*"]
13 |     }],
14 |     "classification": "EMPLOYMENT_APPLICATION",
15 |     "metaData": [{
16 |         "key": "meta_data_key_1",
17 |         "value": "meta_data_value_1"
18 |     },{
19 |         "key": "meta_data_key_2",
20 |         "value": "meta_data_value_2"
21 |     }]
22 | }
23 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/tests/data/manifest_default.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "textractFeatures": [
 3 |         "QUERIES"
 4 |     ],
 5 |     "queriesConfig": [{
 6 |         "text": "What is the applicant full name?",
 7 |         "alias": "FULL_NAME",
 8 |         "pages": ["*"]
 9 |     }]
10 | }
11 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/tests/data/manifest_minimal.json:
--------------------------------------------------------------------------------
1 | {
2 |     "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png"
3 | }
4 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/tests/data/manifest_queries_no_alias.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png",
 3 |     "textractFeatures": [
 4 |         "QUERIES"
 5 |     ],
 6 |     "queriesConfig": [{
 7 |             "text": "What is the applicant full name?"
 8 |         },
 9 |         {
10 |             "text": "What is the applicant last name?"
11 |         }
12 |     ]
13 | }
14 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/tests/data/manifest_queries_no_pages.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png",
 3 |     "textractFeatures": [
 4 |         "QUERIES"
 5 |     ],
 6 |     "queriesConfig": [{
 7 |             "text": "What is the applicant full name?",
 8 |             "alias": "FULL_NAME"
 9 |         },
10 |         {
11 |             "text": "What is the applicant last name?",
12 |             "alias": "LAST_NAME"
13 |         }
14 |     ]
15 | }
16 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/tests/data/manifest_with_classification.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png",
 3 |     "textractFeatures": [
 4 |         "FORMS",
 5 |         "TABLES",
 6 |         "QUERIES"
 7 |     ],
 8 |     "queriesConfig": [{
 9 |         "text": "What is the applicant full name?",
10 |         "alias": "FULL_NAME",
11 |         "pages": ["*"]
12 |     }],
13 |     "classification":"EMPLOYMENT_APPLICATION"
14 | }
15 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/tests/data/manifest_with_classification_and_metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png",
 3 |     "textractFeatures": [
 4 |         "FORMS",
 5 |         "TABLES",
 6 |         "QUERIES"
 7 |     ],
 8 |     "queriesConfig": [{
 9 |         "text": "What is the applicant full name?",
10 |         "alias": "FULL_NAME",
11 |         "pages": ["*"]
12 |     }],
13 |     "classification":"EMPLOYMENT_APPLICATION",
14 |     "metaData":[
15 |         {"key": "key1", "value": "value1"},
16 |         {"key": "key2", "value": "value2"}
17 |     ]
18 | }
19 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/tests/data/queries_forms.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png",
 3 |     "textractFeatures": [
 4 |         "FORMS", "QUERIES"
 5 |     ],
 6 |     "queriesConfig": [{
 7 |             "text": "What is the applicant full name?",
 8 |             "alias": "FULL_NAME"
 9 |         },
10 |         {
11 |             "text": "What is the applicant last name?",
12 |             "alias": "LAST_NAME"
13 |         }
14 |     ]
15 | }
16 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/tests/data/simple_feature_manifest.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png",
 3 |     "textractFeatures": [
 4 |         "FORMS",
 5 |         "TABLES",
 6 |         "QUERIES"
 7 |     ],
 8 |     "queriesConfig": [{
 9 |         "text": "What is the applicant full name?",
10 |         "alias": "FULL_NAME",
11 |         "pages": ["*"]
12 |     }]
13 | }
14 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/textractmanifest/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from logging import NullHandler
3 | from .manifest import IDPManifest as IDPManifest, IDPManifestSchema as IDPManifestSchema, Query as Query, QuerySchema as QuerySchema, MetaData as MetaData, MetaDataSchema as MetaDataSchema
4 | 
# Attach the NullHandler to this package's own logger. The submodules log via
# logging.getLogger(__name__), i.e. below this package's name, so a handler on
# an unrelated logger name would never be consulted for their records.
logging.getLogger(__name__).addHandler(NullHandler())
6 | 
7 | __version__ = '0.0.1'
8 | 


--------------------------------------------------------------------------------
/idp_cdk_manifest/textractmanifest/manifest.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass, field
  2 | import marshmallow as m
  3 | import logging
  4 | from typing import List
  5 | 
  6 | logger = logging.getLogger(__name__)
  7 | 
  8 | 
  9 | class BaseSchema(m.Schema):
 10 |     """
 11 |     skip null values when generating JSON
 12 |     https://github.com/marshmallow-code/marshmallow/issues/229#issuecomment-134387999
 13 |     """
 14 |     SKIP_VALUES = set([None])
 15 | 
 16 |     @m.post_dump
 17 |     def remove_skip_values(self, data, many, pass_many=False):
 18 |         return {
 19 |             key: value
 20 |             for key, value in data.items()
 21 |             if isinstance(value, (dict, list, set, tuple, range,
 22 |                                   frozenset)) or value not in self.SKIP_VALUES
 23 |         }
 24 | 
 25 | 
 26 | @dataclass
 27 | class MetaData():
 28 |     key: str
 29 |     value: str
 30 | 
 31 | 
 32 | @dataclass
 33 | class Query():
 34 |     text: str
 35 |     alias: str = field(default=None)  #type: ignore
 36 |     pages: List[str] = field(default=None)  #type: ignore
 37 | 
 38 | 
 39 | @dataclass
 40 | class IDPManifest():
 41 |     s3_path: str = field(default=None)  #type: ignore
 42 |     document_pages: List[str] = field(default=None)  #type: ignore
 43 |     queries_config: List[Query] = field(default=None)  #type: ignore
 44 |     textract_features: List[str] = field(default=None)  #type: ignore
 45 |     classification: str = field(default=None)  #type: ignore
 46 |     meta_data: List[MetaData] = field(default=None)  #type: ignore
 47 | 
 48 |     def merge(self, manifest: 'IDPManifest'):
 49 |         ''' add values top level from the passed in manifest when not defined in the manifest itself.
 50 |         TODO: implement proper merging with joining arrays for example'''
 51 |         if manifest.s3_path and not self.s3_path:
 52 |             self.s3_path = manifest.s3_path
 53 |         if manifest.document_pages and not self.document_pages:
 54 |             self.document_pages = manifest.document_pages
 55 |         if manifest.queries_config and not self.queries_config:
 56 |             self.queries_config = manifest.queries_config
 57 |         if manifest.textract_features and not self.textract_features:
 58 |             self.textract_features = manifest.textract_features
 59 |         if manifest.meta_data and not self.meta_data:
 60 |             self.meta_data = manifest.meta_data
 61 | 
 62 | 
 63 | class MetaDataSchema(BaseSchema):
 64 |     key = m.fields.String(data_key="key", required=True)
 65 |     value = m.fields.String(data_key="value", required=False)
 66 | 
 67 |     @m.post_load
 68 |     def make_query(self, data, **kwargs):
 69 |         return MetaData(**data)
 70 | 
 71 | 
 72 | class QuerySchema(BaseSchema):
 73 |     text = m.fields.String(data_key="text", required=True)
 74 |     alias = m.fields.String(data_key="alias", required=False)
 75 |     pages = m.fields.List(m.fields.String, data_key="pages", required=False)
 76 | 
 77 |     @m.post_load
 78 |     def make_query(self, data, **kwargs):
 79 |         return Query(**data)
 80 | 
 81 | 
 82 | class IDPManifestSchema(BaseSchema):
 83 |     queries_config = m.fields.List(m.fields.Nested(QuerySchema),
 84 |                                    data_key="queriesConfig",
 85 |                                    required=False)
 86 |     textract_features = m.fields.List(m.fields.String,
 87 |                                       data_key="textractFeatures",
 88 |                                       required=False)
 89 |     s3_path = m.fields.String(data_key="s3Path", required=False)
 90 |     classification = m.fields.String(data_key="classification", required=False)
 91 |     document_pages = m.fields.List(m.fields.String,
 92 |                                    data_key="documentPages",
 93 |                                    required=False)
 94 |     meta_data = m.fields.List(m.fields.Nested(MetaDataSchema),
 95 |                               data_key="metaData",
 96 |                               required=False)
 97 | 
 98 |     @m.post_load
 99 |     def make_queries_config(self, data, **kwargs):
100 |         return IDPManifest(**data)
101 | 


--------------------------------------------------------------------------------
/images/amzn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/images/amzn.png


--------------------------------------------------------------------------------
/overlayer/Manifest.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | 
3 | recursive-exclude * __pycache__
4 | recursive-exclude * *.py[co] test_*.py
5 | 
6 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif


--------------------------------------------------------------------------------
/overlayer/README.md:
--------------------------------------------------------------------------------
 1 | # Textract-Overlayer
 2 | 
 3 | amazon-textract-overlayer provides functions to help overlay bounding boxes on documents.
 4 | 
 5 | # Install
 6 | 
 7 | ```bash
 8 | > python -m pip install amazon-textract-overlayer
 9 | ```
10 | 
11 | Make sure your environment is set up with AWS credentials through configuration files, environment variables, or an attached role. (https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
12 | 
13 | # Samples
14 | 
15 | Primary method provided is get_bounding_boxes which returns bounding boxes based on the Textract_Type passed in.
16 | Mostly taken from the ```amazon-textract``` command from the package ```amazon-textract-helper```.
17 | 
18 | This will return the bounding boxes for WORD and CELL data types.
19 | 
20 | ```python
21 | from textractoverlayer.t_overlay import DocumentDimensions, get_bounding_boxes
22 | from textractcaller.t_call import Textract_Features, Textract_Types, call_textract
23 | 
24 | doc = call_textract(input_document=input_document, features=features)
25 | # image is a PIL.Image.Image in this case
26 | document_dimension:DocumentDimensions = DocumentDimensions(doc_width=image.size[0], doc_height=image.size[1])
27 | overlay=[Textract_Types.WORD, Textract_Types.CELL]
28 | 
29 | bounding_box_list = get_bounding_boxes(textract_json=doc, document_dimensions=document_dimension, overlay_features=overlay)
30 | ```
31 | 
32 | The actual overlay drawing of bounding boxes for images is in the ```amazon-textract``` command from the package ```amazon-textract-helper``` and looks like this:
33 | 
34 | ```python
35 | from PIL import Image, ImageDraw
36 | 
37 | image = Image.open(input_document)
38 | rgb_im = image.convert('RGB')
39 | draw = ImageDraw.Draw(rgb_im)
40 | 
41 | # check the impl in amazon-textract-helper for ways to associate different colors to types
42 | for bbox in bounding_box_list:
43 |     draw.rectangle(xy=[bbox.xmin, bbox.ymin, bbox.xmax, bbox.ymax], outline=(128, 128, 0), width=2)
44 | 
45 | rgb_im.show()
46 | ```
47 | 
48 | To draw bounding boxes within PDF documents, the following code can be used:
49 | 
50 | ```python
51 | import fitz
52 | 
53 | # for local stored files
54 | file_path = "<<replace with the local path to your pdf file>>"
55 | doc = fitz.open(file_path)
56 | # for files stored in S3 the streaming object can be used
57 | # doc = fitz.open(stream="<<replace with stream_object_variable>>", filetype="pdf")
58 | 
59 | # draw boxes
60 | for p, page in enumerate(doc):
61 |     p += 1
62 |     for bbox in bounding_box_list:
63 |         if bbox.page_number == p:
64 |             page.draw_rect(
65 |                 [bbox.xmin, bbox.ymin, bbox.xmax, bbox.ymax], color=(0, 1, 0), width=2
66 |             )
67 | 
68 | # save file locally 
69 | doc.save("<<local path for output file>>")
70 | 
71 | ```


--------------------------------------------------------------------------------
/overlayer/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 0.0.12
 3 | commit = False
 4 | tag = False
 5 | 
 6 | [bumpversion:file:setup.py]
 7 | search = version='{current_version}'
 8 | replace = version='{new_version}'
 9 | 
10 | [bumpversion:file:textractoverlayer/_version.py]
11 | search = __version__ = '{current_version}'
12 | replace = __version__ = '{new_version}'
13 | 
14 | [bdist_wheel]
15 | universal = 1
16 | 
17 | [flake8]
18 | exclude = docs
19 | 
20 | [aliases]
21 | 


--------------------------------------------------------------------------------
/overlayer/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from setuptools import setup, find_packages
 4 | 
 5 | 
 6 | def read(fname):
 7 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 8 | 
 9 | 
requirements = ['boto3', 'botocore', 'amazon-textract-caller>=0.0.11', 'Pillow', 'pypdf>=3.1,<5.0']


def _publish(repository):
    """Rebuild the sdist and wheel, check them and upload them to ``repository`` via twine."""
    # os.system('cd ...') only changes directory inside its own short-lived
    # shell, so the later commands still ran in the caller's working
    # directory. Change the directory of this process instead.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    os.system('rm -rf dist/ build/ amazon_textract_overlayer.egg-info/')
    os.system('python setup.py sdist bdist_wheel')
    os.system('twine check dist/*')
    os.system(f'twine upload --repository {repository} dist/*')


# Convenience commands: ``python setup.py publish-test`` / ``python setup.py publish``.
if sys.argv[-1] == 'publish-test':
    _publish('pypitest')
    sys.exit()

if sys.argv[-1] == 'publish':
    _publish('pypi')
    sys.exit()

# NOTE(review): setup.cfg (bumpversion current_version) and
# textractoverlayer/_version.py both say 0.0.12 while this file says 0.0.13;
# bumpversion's search pattern will not match here until they agree.
setup(name='amazon-textract-overlayer',
      packages=find_packages(exclude=['tests']),
      include_package_data=True,
      exclude_package_data={"": ["test_*.py", "__pycache__"]},
      version='0.0.13',
      description='Amazon Textract Overlay tools',
      install_requires=requirements,
      long_description_content_type='text/markdown',
      long_description=read('README.md'),
      author='Amazon Rekognition Textract Demoes',
      author_email='rekognition-textract-demos@amazon.com',
      url='https://github.com/aws-samples/amazon-textract-textractor/tree/master/overlayer',
      keywords='amazon-textract-textractor amazon textract textractor helper overlayer',
      license="Apache License Version 2.0",
      classifiers=[
          "Development Status :: 4 - Beta",
          "Topic :: Utilities",
          'License :: OSI Approved :: Apache Software License',
          'Programming Language :: Python :: 3.6',
          'Programming Language :: Python :: 3.7',
          'Programming Language :: Python :: 3.8',
          'Programming Language :: Python :: 3.9',
          'Programming Language :: Python :: 3.10',
          'Programming Language :: Python :: 3.11',
          'Programming Language :: Python :: 3.12',
      ],
      python_requires='>=3.6')
55 | 


--------------------------------------------------------------------------------
/overlayer/tests/data/Amazon-Textract-Pdf.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/overlayer/tests/data/Amazon-Textract-Pdf.pdf


--------------------------------------------------------------------------------
/overlayer/tests/test_overlayer.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import logging
 3 | import textractoverlayer.image_tools as it
 4 | 
 5 | 
 6 | def test_overlayer_pdf_dimensions(caplog):
 7 |     caplog.set_level(logging.DEBUG)
 8 |     SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 9 |     input_filename = os.path.join(SCRIPT_DIR, "data/Amazon-Textract-Pdf.pdf")
10 |     dimensions = it.get_width_height_from_file(input_filename)
11 |     assert dimensions.doc_height == 792
12 |     assert dimensions.doc_width == 612
13 | 


--------------------------------------------------------------------------------
/overlayer/textractoverlayer/__init__.py:
--------------------------------------------------------------------------------
1 | from ._version import __version__
2 | 
3 | import logging
4 | from logging import NullHandler
5 | 
6 | logging.getLogger(__name__).addHandler(NullHandler())
7 | 


--------------------------------------------------------------------------------
/overlayer/textractoverlayer/_version.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.0.12'
2 | 
3 | 


--------------------------------------------------------------------------------
/overlayer/textractoverlayer/image_tools.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | import os
 3 | import sys
 4 | from textractoverlayer.t_overlay import DocumentDimensions
 5 | import boto3
 6 | 
 7 | # Conditionally add /opt to the PYTHON PATH
 8 | if os.getenv('AWS_EXECUTION_ENV') is not None:
 9 |     sys.path.append('/opt')
10 | 
11 | from PIL import Image
12 | from pypdf import PdfReader
13 | 
14 | pdf_suffixes = ['.pdf']
15 | image_suffixes = ['.png', '.jpg', '.jpeg']
16 | supported_suffixes = pdf_suffixes + image_suffixes
17 | 
18 | 
def get_size_from_filestream(fs, ext) -> DocumentDimensions:
    """Return the width and height of the document readable from ``fs``.

    :param fs: file-like object handed to ``PIL.Image.open`` or ``pypdf.PdfReader``
    :param ext: file suffix including the dot (e.g. ``'.png'``), matched
        case-insensitively; any suffix not in ``image_suffixes`` is treated as PDF
    :return: ``DocumentDimensions`` holding the image size, or for a PDF the
        size of the first page's media box truncated to integers
    """
    if (ext or '').lower() in image_suffixes:
        img = Image.open(fs)
        return DocumentDimensions(doc_width=img.width, doc_height=img.height)
    # Use the box's extent rather than its upper-right corner: the two only
    # coincide when the media box starts at the origin (0, 0).
    media_box = PdfReader(fs).pages[0].mediabox
    return DocumentDimensions(doc_width=int(media_box.width), doc_height=int(media_box.height))
27 | 
28 | 
def get_size_from_s3(s3_bucket, s3_key) -> DocumentDimensions:
    """Download an S3 object and return its document dimensions.

    :param s3_bucket: name of the bucket holding the document
    :param s3_key: object key; its suffix (matched case-insensitively) must be
        one of ``supported_suffixes``
    :return: ``DocumentDimensions`` as computed by ``get_size_from_filestream``
    :raises ValueError: when the key's suffix is not supported
    """
    _, ext = os.path.splitext(s3_key)
    # Normalise so that keys such as 'SCAN.PDF' are accepted, and pass the
    # normalised suffix on so the image/PDF decision sees the same value.
    ext = ext.lower()
    if ext not in supported_suffixes:
        raise ValueError(f'{s3_key} not in {supported_suffixes}')
    s3 = boto3.client('s3')
    o = s3.get_object(Bucket=s3_bucket, Key=s3_key)
    # The whole object is read into memory before it is parsed.
    input_bytes = o.get('Body').read()
    return get_size_from_filestream(io.BytesIO(input_bytes), ext)
39 | 
40 | 
def get_filename_from_document(input_document: str):
    """Split a document location into base file name and suffix.

    :param input_document: local path or ``s3://bucket/key`` URI
    :return: ``(file_name, suffix)`` as produced by ``os.path.splitext`` on the
        last path component
    :raises ValueError: for an S3 URI with no ``/`` after the bucket name
    """
    looks_like_s3 = len(input_document) > 7 and input_document.lower().startswith('s3://')
    if not looks_like_s3:
        return os.path.splitext(os.path.basename(input_document))
    # Remove the scheme, then discard the bucket (everything up to the first '/').
    without_scheme = input_document.replace('s3://', '')
    _, s3_key = without_scheme.split('/', 1)
    return os.path.splitext(os.path.basename(s3_key))
50 | 
51 | 
def get_size_from_document(input_document: str) -> DocumentDimensions:
    """Return the dimensions of a document given as S3 URI or local path.

    :param input_document: ``s3://bucket/key`` URI (scheme matched
        case-insensitively) or a path on the local file system
    :return: ``DocumentDimensions`` from ``get_size_from_s3`` or
        ``get_width_height_from_file``
    :raises ValueError: for an S3 URI with no ``/`` after the bucket name, or
        when the delegate rejects the file suffix
    """
    scheme = 's3://'
    if len(input_document) > 7 and input_document.lower().startswith(scheme):
        # Slice the prefix off instead of str.replace(): replace() leaves an
        # upper-case 'S3://' in place and would also delete the text from
        # inside the key.
        s3_bucket, s3_key = input_document[len(scheme):].split('/', 1)
        return get_size_from_s3(s3_bucket=s3_bucket, s3_key=s3_key)
    # Local file: hand over to the file-based reader. Calling this function
    # again here would recurse without end.
    return get_width_height_from_file(input_document)
59 | 
60 | 
def get_width_height_from_s3_object(s3_bucket, s3_key) -> DocumentDimensions:
    """Return the dimensions of an S3 object; thin alias for ``get_size_from_s3``."""
    dimensions = get_size_from_s3(s3_bucket, s3_key)
    return dimensions
63 | 
64 | 
def get_width_height_from_file(filepath) -> DocumentDimensions:
    """Return the dimensions of a document stored on the local file system.

    :param filepath: path to the file; its suffix (matched case-insensitively)
        must be one of ``supported_suffixes``
    :return: ``DocumentDimensions`` as computed by ``get_size_from_filestream``
    :raises ValueError: when the suffix is not supported
    """
    _, ext = os.path.splitext(filepath)
    # Normalise so that names such as 'SCAN.PDF' are accepted, and pass the
    # normalised suffix on so the image/PDF decision sees the same value.
    ext = ext.lower()
    if ext not in supported_suffixes:
        raise ValueError(f'{filepath} not in {supported_suffixes}')
    with open(filepath, 'rb') as input_fs:
        return get_size_from_filestream(input_fs, ext)
72 | 
73 | 
if __name__ == '__main__':
    # Small command line entry point: print the dimensions of an S3 document.
    import argparse

    cli = argparse.ArgumentParser()
    for flag in ('--s3-bucket', '--s3-key'):
        cli.add_argument(flag, required=True)
    args = cli.parse_args()

    print(get_width_height_from_s3_object(args.s3_bucket, args.s3_key))
85 | 


--------------------------------------------------------------------------------
/prettyprinter/Manifest.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | 
3 | recursive-exclude * __pycache__
4 | recursive-exclude * *.py[co] test_*.py
5 | 
6 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif


--------------------------------------------------------------------------------
/prettyprinter/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 0.1.9
 3 | commit = False
 4 | tag = False
 5 | 
 6 | [bumpversion:file:setup.py]
 7 | search = version='{current_version}'
 8 | replace = version='{new_version}'
 9 | 
10 | [bumpversion:file:textractprettyprinter/_version.py]
11 | search = __version__ = '{current_version}'
12 | replace = __version__ = '{new_version}'
13 | 
14 | [bdist_wheel]
15 | universal = 1
16 | 
17 | [flake8]
18 | exclude = docs
19 | 
20 | [aliases]
21 | 


--------------------------------------------------------------------------------
/prettyprinter/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from setuptools import setup
 4 | 
 5 | 
 6 | def read(fname):
 7 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 8 | 
 9 | 
requirements = ['boto3>=1,<2', 'botocore', 'amazon-textract-response-parser>=0.1,<0.2', 'tabulate>=0.9,<0.10']


def _publish(repository):
    """Rebuild the sdist and wheel, check them and upload them to ``repository`` via twine."""
    # os.system('cd ...') only changes directory inside its own short-lived
    # shell, so the later commands still ran in the caller's working
    # directory. Change the directory of this process instead.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    os.system('rm -rf dist/ build/ amazon_textract_prettyprinter.egg-info/')
    os.system('python setup.py sdist bdist_wheel')
    os.system('twine check dist/*')
    os.system(f'twine upload --repository {repository} dist/*')


# Convenience commands: ``python setup.py publish-test`` / ``python setup.py publish``.
if sys.argv[-1] == 'publish-test':
    _publish('pypitest')
    sys.exit()

if sys.argv[-1] == 'publish':
    _publish('pypi')
    sys.exit()

# NOTE(review): setup.cfg (bumpversion current_version) and
# textractprettyprinter/_version.py both say 0.1.9 while this file says 0.1.10;
# bumpversion's search pattern will not match here until they agree.
setup(name='amazon-textract-prettyprinter',
      packages=['textractprettyprinter'],
      version='0.1.10',
      description='Amazon Textract Helper tools for pretty printing',
      install_requires=requirements,
      long_description_content_type='text/markdown',
      long_description=read('README.md'),
      author='Amazon Rekognition Textract Demoes',
      author_email='rekognition-textract-demos@amazon.com',
      url='https://github.com/aws-samples/amazon-textract-textractor/tree/master/prettyprinter',
      keywords='amazon-textract-textractor amazon textract textractor helper pretty-print',
      license="Apache License Version 2.0",
      classifiers=[
          "Development Status :: 4 - Beta",
          "Topic :: Utilities",
          'License :: OSI Approved :: Apache Software License',
          'Programming Language :: Python :: 3.6',
          'Programming Language :: Python :: 3.7',
          'Programming Language :: Python :: 3.8',
          'Programming Language :: Python :: 3.9',
          'Programming Language :: Python :: 3.10',
          'Programming Language :: Python :: 3.11',
          'Programming Language :: Python :: 3.12',
      ],
      python_requires='>=3.6')
53 | 


--------------------------------------------------------------------------------
/prettyprinter/textractprettyprinter/__init__.py:
--------------------------------------------------------------------------------
1 | from ._version import __version__
2 | 
3 | from .t_pretty_print import Pretty_Print_Table_Format as Pretty_Print_Table_Format
4 | from .t_pretty_print_layout import get_layout_csv_from_trp2 as get_layout_csv_from_trp2
5 | import logging
6 | from logging import NullHandler
7 | 
8 | logging.getLogger(__name__).addHandler(NullHandler())
9 | 


--------------------------------------------------------------------------------
/prettyprinter/textractprettyprinter/_version.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.1.9'
2 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | amazon-textract-caller>=0.2.4,<1
2 | Pillow
3 | tabulate>=0.9,<0.10
4 | XlsxWriter>=3.0,<4
5 | editdistance>=0.6.2,<0.9
6 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | long_description = file:
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import subprocess
 4 | import setuptools
 5 | from setuptools import find_packages, setup
 6 | from os import path
 7 | 
 8 | here = path.abspath(path.dirname(__file__))
 9 | 
10 | with open(path.join(here, "README.md"), encoding="utf-8") as f:
11 |     long_description = f.read()
12 | 
13 | 
def read_requirements(path):
    """Return the requirement specifiers listed in the file at ``path``.

    Each line is stripped of surrounding whitespace (including the trailing
    newline); blank lines and ``#`` comment lines are left out, so the result
    contains requirement strings only.
    """
    with open(path, "r") as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line and not line.startswith("#")]
18 | 
class TestCommand(setuptools.Command):
    """Custom ``test`` command; ``run`` is currently a no-op because the pytest call is commented out."""

    description = 'run linters, tests and create a coverage report'
    user_options = []

    def initialize_options(self):
        """Nothing to initialise; this command declares no options."""
        return None

    def finalize_options(self):
        """Nothing to finalise; this command declares no options."""
        return None

    def run(self):
        """Do nothing (the test invocation below is disabled)."""
        #self._run(['pytest', 'tests/'])
        return None

    def _run(self, command):
        """Run ``command`` as a subprocess and exit with its return code if it fails."""
        try:
            subprocess.check_call(command)
        except subprocess.CalledProcessError as failure:
            exit_code = failure.returncode
            print('Command failed with exit code', exit_code)
            sys.exit(exit_code)
40 | 
# Runtime dependencies come from requirements.txt next to this file.
install_requirements = read_requirements(os.path.join(here, "requirements.txt"))

# Every file in extras/ defines one optional dependency group, named after the
# part of its file name before the first dot.
extras_dir = os.path.join(here, "extras")
extras_requirements = {}
for extras_file in os.listdir(extras_dir):
    group_name = extras_file.split(".")[0]
    extras_requirements[group_name] = read_requirements(os.path.join(extras_dir, extras_file))

setup(
    name="amazon-textract-textractor",
    version="1.9.2",
    license="Apache 2.0",
    description="A package to use AWS Textract services.",
    url="https://github.com/aws-samples/amazon-textract-textractor",
    long_description=long_description,
    long_description_content_type="text/markdown",
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
    ],
    keywords="amazon textract aws ocr document",
    packages=find_packages(exclude=["docs", "tests"], ),
    # include data files
    include_package_data=True,
    install_requires=install_requirements,
    extras_require=extras_requirements,
    cmdclass={'test': TestCommand},
    test_command="test",
    entry_points={
        "console_scripts": [
            "textractor = textractor.cli.cli:textractor_cli",
        ],
    },
)
73 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/__init__.py


--------------------------------------------------------------------------------
/tests/fixtures/amzn_q2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/amzn_q2.png


--------------------------------------------------------------------------------
/tests/fixtures/fake_id.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/fake_id.png


--------------------------------------------------------------------------------
/tests/fixtures/form.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/form.png


--------------------------------------------------------------------------------
/tests/fixtures/form_1005.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/form_1005.png


--------------------------------------------------------------------------------
/tests/fixtures/in-table-title.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/in-table-title.png


--------------------------------------------------------------------------------
/tests/fixtures/invalid.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/invalid.pdf


--------------------------------------------------------------------------------
/tests/fixtures/invoice.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/invoice.png


--------------------------------------------------------------------------------
/tests/fixtures/matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/matrix.png


--------------------------------------------------------------------------------
/tests/fixtures/multiline_cells.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/multiline_cells.jpeg


--------------------------------------------------------------------------------
/tests/fixtures/patient_intake_form_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/patient_intake_form_sample.png


--------------------------------------------------------------------------------
/tests/fixtures/paystub.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/paystub.jpg


--------------------------------------------------------------------------------
/tests/fixtures/paystub_header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/paystub_header.png


--------------------------------------------------------------------------------
/tests/fixtures/paystub_single_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/paystub_single_table.png


--------------------------------------------------------------------------------
/tests/fixtures/paystub_tables.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/paystub_tables.png


--------------------------------------------------------------------------------
/tests/fixtures/reading_order.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/reading_order.pdf


--------------------------------------------------------------------------------
/tests/fixtures/receipt.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/receipt.jpg


--------------------------------------------------------------------------------
/tests/fixtures/receipt_no_summary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/receipt_no_summary.png


--------------------------------------------------------------------------------
/tests/fixtures/resume.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/resume.png


--------------------------------------------------------------------------------
/tests/fixtures/sample-invoice.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/sample-invoice.pdf


--------------------------------------------------------------------------------
/tests/fixtures/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/screenshot.png


--------------------------------------------------------------------------------
/tests/fixtures/signature.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/signature.jpg


--------------------------------------------------------------------------------
/tests/fixtures/single-page-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/single-page-1.png


--------------------------------------------------------------------------------
/tests/fixtures/single-page-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/single-page-2.png


--------------------------------------------------------------------------------
/tests/fixtures/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/test.png


--------------------------------------------------------------------------------
/tests/fixtures/textractor-multipage-doc.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/textractor-multipage-doc.pdf


--------------------------------------------------------------------------------
/tests/fixtures/textractor-singlepage-doc.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/textractor-singlepage-doc.pdf


--------------------------------------------------------------------------------
/tests/fixtures/titanic.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/titanic.webp


--------------------------------------------------------------------------------
/tests/fixtures/tutorial.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/tutorial.pdf


--------------------------------------------------------------------------------
/tests/fixtures/vbat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/vbat.png


--------------------------------------------------------------------------------
/tests/fixtures/vbat2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/fixtures/vbat2.png


--------------------------------------------------------------------------------
/tests/invoice_sample.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tests/invoice_sample.pdf


--------------------------------------------------------------------------------
/tests/test_analyze_expense.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import PIL
 4 | import unittest
 5 | from textractor import Textractor
 6 | from textractor.entities.document import Document
 7 | from textractor.data.constants import TextractFeatures
 8 | from textractor.exceptions import InvalidProfileNameError, NoImageException, S3FilePathMissing
 9 | 
10 | from .utils import get_fixture_path
11 | 
12 | class TestTextractorAnalyzeExpense(unittest.TestCase):
13 |     def setUp(self):
14 |         # insert credentials and filepaths here to run test
15 |         self.profile_name = "default"
16 |         self.current_directory = os.path.abspath(os.path.dirname(__file__))
17 |         self.image_path = os.path.join(self.current_directory, "fixtures/receipt.jpg")
18 |         self.image = PIL.Image.open(self.image_path)
19 | 
20 |         if self.profile_name is None:
21 |             raise InvalidProfileNameError(
22 |                 "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_textractor.py."
23 |             )
24 |         if os.environ.get("CALL_TEXTRACT"):
25 |             self.extractor = Textractor(
26 |                 profile_name=self.profile_name, kms_key_id=""
27 |             )
28 | 
29 |     def test_analyze_expense_from_path(self):
30 |         # Testing local single image input
31 |         if os.environ.get("CALL_TEXTRACT"):
32 |             document = self.extractor.analyze_expense(file_source=self.image_path)
33 |             with open(get_fixture_path(), "w") as f:
34 |                 json.dump(document.response, f)
35 |         else:
36 |             document = Document.open(get_fixture_path())
37 | 
38 |         self.assertIsInstance(document, Document)
39 |         self.assertEqual(len(document.pages), 1)
40 |         self.assertEqual(document.expense_documents[0].summary_fields.TOTAL[0].value.text, "$1810.46")
41 |         self.assertEqual(len(document.expense_documents[0].summary_groups.VENDOR), 2)
42 |         self.assertEqual(len(document.expense_documents[0].line_items_groups[0].to_pandas()), 4,
43 |                          "There are 4 line item in the receipts")
44 | 
45 |     def test_analyze_expense_from_image(self):
46 |         # Testing local single image input
47 |         if os.environ.get("CALL_TEXTRACT"):
48 |             document = self.extractor.analyze_expense(file_source=self.image)
49 |             with open(get_fixture_path(), "w") as f:
50 |                 json.dump(document.response, f)
51 |         else:
52 |             document = Document.open(get_fixture_path())
53 | 
54 |         self.assertIsInstance(document, Document)
55 |         self.assertEqual(len(document.pages), 1)
56 |         self.assertEqual(document.expense_documents[0].summary_fields.TOTAL[0].value.text, "$1810.46")
57 |         self.assertEqual(len(document.expense_documents[0].summary_groups.VENDOR), 2)
58 |         self.assertEqual(len(document.expense_documents[0].line_items_groups[0].to_pandas()), 4,
59 |                          "There are 4 line item in the receipts")
60 | 
61 | 
62 | class TestTextractorAnalyzeExpenseNoSummary(unittest.TestCase):
63 |     def setUp(self):
64 |         # insert credentials and filepaths here to run test
65 |         self.profile_name = "default"
66 |         self.current_directory = os.path.abspath(os.path.dirname(__file__))
67 |         self.image_path = os.path.join(self.current_directory, "fixtures/receipt_no_summary.png")
68 | 
69 |         if self.profile_name is None:
70 |             raise InvalidProfileNameError(
71 |                 "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_textractor.py."
72 |             )
73 |         if os.environ.get("CALL_TEXTRACT"):
74 |             self.extractor = Textractor(
75 |                 profile_name=self.profile_name, kms_key_id=""
76 |             )
77 | 
78 |     def test_analyze_expense_no_summary_fields(self):
79 |         """Correctly load expense line items where no summary fields were recognized
80 | 
81 |         Per: https://github.com/aws-samples/amazon-textract-textractor/issues/370
82 |         """
83 |         if os.environ.get("CALL_TEXTRACT"):
84 |             document = self.extractor.analyze_expense(file_source=self.image_path)
85 |             with open(get_fixture_path(), "w") as f:
86 |                 json.dump(document.response, f)
87 |         else:
88 |             document = Document.open(get_fixture_path())
89 | 
90 |         self.assertIsInstance(document, Document)
91 |         self.assertEqual(len(document.expense_documents), 1)
92 |         self.assertGreater(len(document.expense_documents[0].line_items_groups), 0)
93 | 
94 | if __name__ == "__main__":
95 |     test = TestTextractorAnalyzeExpense()
96 |     test.setUp()
97 |     test.test_analyze_expense_from_path()


--------------------------------------------------------------------------------
/tests/test_analyze_id.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import PIL
 4 | import unittest
 5 | from tests.utils import get_fixture_path
 6 | from textractor import Textractor
 7 | from textractor.entities.document import Document
 8 | from textractor.data.constants import TextractFeatures, AnalyzeIDFields
 9 | from textractor.exceptions import InvalidProfileNameError, NoImageException, S3FilePathMissing
10 | 
11 | from .utils import save_document_to_fixture_path
12 | 
13 | class TestTextractorAnalyzeID(unittest.TestCase):
14 |     def setUp(self):
15 |         # insert credentials and filepaths here to run test
16 |         self.profile_name = "default"
17 |         self.current_directory = os.path.abspath(os.path.dirname(__file__))
18 |         self.image_path = os.path.join(self.current_directory, "fixtures/fake_id.png")
19 |         self.image = PIL.Image.open(os.path.join(self.current_directory, "fixtures/fake_id.png"))
20 | 
21 |         if self.profile_name is None:
22 |             raise InvalidProfileNameError(
23 |                 "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_textractor.py."
24 |             )
25 |         if os.environ.get("CALL_TEXTRACT"):
26 |             self.extractor = Textractor(
27 |                 profile_name=self.profile_name, kms_key_id=""
28 |             )
29 | 
30 |     def test_analyze_id_from_path(self):
31 |         # Testing local single image input
32 |         if os.environ.get("CALL_TEXTRACT"):
33 |             document = self.extractor.analyze_id(
34 |                 file_source=self.image_path,
35 |             )
36 |             with open(get_fixture_path(), "w") as f:
37 |                 json.dump(document.response, f)
38 |         else:
39 |             document = Document.open(get_fixture_path())
40 | 
41 |         self.assertIsInstance(document, Document)
42 |         self.assertEqual(len(document.identity_documents), 1)
43 |         self.assertEqual(len(document.identity_documents[0].fields), 21)
44 |         self.assertEqual(document.identity_documents[0].get(AnalyzeIDFields.FIRST_NAME), "GARCIA")
45 |         self.assertEqual(document.identity_documents[0][AnalyzeIDFields.FIRST_NAME], "GARCIA")
46 |     
47 |     def test_analyze_id_from_image(self):
48 |         # Testing local single image input
49 |         if os.environ.get("CALL_TEXTRACT"):
50 |             document = self.extractor.analyze_id(
51 |                 file_source=self.image,
52 |             )
53 |             with open(get_fixture_path(), "w") as f:
54 |                 json.dump(document.response, f)
55 |         else:
56 |             document = Document.open(get_fixture_path())
57 | 
58 |         self.assertIsInstance(document, Document)
59 |         self.assertEqual(len(document.identity_documents), 1)
60 |         self.assertEqual(len(document.identity_documents[0].fields), 21)
61 |         self.assertEqual(document.identity_documents[0].get("FIRST_NAME"), "GARCIA")
62 |         self.assertEqual(document.identity_documents[0]["FIRST_NAME"], "GARCIA")
63 | 
64 | if __name__ == "__main__":
65 |     test = TestTextractorAnalyzeID()
66 |     test.setUp()
67 |     test.test_analyze_id_from_path()


--------------------------------------------------------------------------------
/tests/test_bbox.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import numpy
 3 | from textractor.entities.bbox import BoundingBox
 4 | 
 5 | class TestBoundingBox(unittest.TestCase):
 6 |     def test_bbox(self):
 7 |         dims = {"Width": 3, "Height": 4, "Left": 1, "Top": 2}
 8 |         bbox = BoundingBox.from_normalized_dict(dims, spatial_object=None)
 9 | 
10 |         self.assertTrue(isinstance(bbox.as_denormalized_numpy(), numpy.ndarray))
11 |         self.assertEqual(bbox.__repr__(), "x: 1, y: 2, width: 3, height: 4")
12 | 


--------------------------------------------------------------------------------
/tests/test_layout.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import unittest
 4 | import PIL
 5 | from tests.utils import get_fixture_path
 6 | from textractor import Textractor
 7 | from textractor.entities.document import Document
 8 | from textractor.entities.word import Word
 9 | from textractor.entities.line import Line
10 | from textractor.entities.page import Page
11 | from textractor.entities.table import Table
12 | from textractor.entities.value import Value
13 | from textractor.data.constants import TableFormat
14 | from textractor.entities.key_value import KeyValue
15 | from textractor.visualizers.entitylist import EntityList
16 | from textractor.exceptions import InvalidProfileNameError
17 | from textractor.entities.selection_element import SelectionElement
18 | from textractor.data.constants import TextTypes, SimilarityMetric, TextractFeatures, Direction, DirectionalFinderType
19 | 
20 | from .utils import save_document_to_fixture_path
21 | 
22 | class TestLayout(unittest.TestCase):
23 |     def test_layout(self):
24 |         profile_name = "default"
25 |         current_directory = os.path.abspath(os.path.dirname(__file__))
26 | 
27 |         if profile_name is None:
28 |             raise InvalidProfileNameError(
29 |                 "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_table.py."
30 |             )
31 | 
32 |         if os.environ.get("CALL_TEXTRACT"):
33 |             extractor = Textractor(profile_name=profile_name, kms_key_id="")
34 |             document = extractor.analyze_document(
35 |                 file_source=os.path.join(current_directory, "fixtures/paystub.jpg"),
36 |                 features=[TextractFeatures.LAYOUT, TextractFeatures.TABLES, TextractFeatures.FORMS],
37 |             )
38 |             with open(get_fixture_path(), "w") as f:
39 |                 json.dump(document.response, f)
40 |         else:
41 |             document = Document.open(get_fixture_path())
42 | 
43 |         print(document.text)
44 | 


--------------------------------------------------------------------------------
/tests/test_line.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from textractor.entities.line import Line
 4 | from textractor.entities.word import Word
 5 | from textractor.data.constants import TextTypes
 6 | from textractor.entities.bbox import BoundingBox
 7 | from textractor.visualizers.entitylist import EntityList
 8 | 
 9 | class TestLine(unittest.TestCase):
10 |     def setUp(self):
11 |         self.word_bb_1 = {
12 |             "Width": 0.10809839516878128,
13 |             "Height": 0.01363567914813757,
14 |             "Left": 0.036161474883556366,
15 |             "Top": 0.03439946100115776,
16 |         }
17 |         self.word_bb_2 = {
18 |             "Width": 0.18033172190189362,
19 |             "Height": 0.01742148958146572,
20 |             "Left": 0.22032427787780762,
21 |             "Top": 0.03645794093608856,
22 |         }
23 |         self.word_bb_3 = {
24 |             "Width": 0.03744738921523094,
25 |             "Height": 0.016524378210306168,
26 |             "Left": 0.4087739884853363,
27 |             "Top": 0.0368686243891716,
28 |         }
29 |         self.line_bb = {
30 |             "Width": 0.3,
31 |             "Height": 0.01742148958146572,
32 |             "Left": 0.036161474883556366,
33 |             "Top": 0.03439946100115776,
34 |         }
35 | 
36 |         self.word_1 = Word(
37 |             entity_id="word-id-1",
38 |             bbox=BoundingBox.from_normalized_dict(self.word_bb_1, spatial_object=None),
39 |             text="TEST",
40 |             text_type=TextTypes.PRINTED,
41 |         )
42 |         self.word_2 = Word(
43 |             entity_id="word-id-2",
44 |             bbox=BoundingBox.from_normalized_dict(self.word_bb_2, spatial_object=None),
45 |             text="WORDS",
46 |             text_type=TextTypes.HANDWRITING,
47 |         )
48 |         self.word_3 = Word(
49 |             entity_id="word-id-3",
50 |             bbox=BoundingBox.from_normalized_dict(self.word_bb_3, spatial_object=None),
51 |             text="ADDED",
52 |             text_type=TextTypes.PRINTED,
53 |         )
54 | 
55 |         self.line = Line(
56 |             "line-id",
57 |             BoundingBox.from_normalized_dict(self.line_bb, spatial_object=None),
58 |             [self.word_1, self.word_2, self.word_3],
59 |         )
60 | 
61 | 
62 |     def test_get_words_by_type(self):
63 |         """Test case to filter words of the Line by their type"""
64 |         self.assertEqual(self.line.get_words_by_type(TextTypes.PRINTED), EntityList([self.word_1, self.word_3]))
65 |         self.assertEqual(self.line.get_words_by_type(TextTypes.HANDWRITING), EntityList([self.word_2]))
66 | 
67 | 
68 |     def test_get_text(self):
69 |         """Test case setter for the text attribute"""
70 |         self.assertEqual(self.line.text, "TEST WORDS ADDED")
71 | 
72 | 
73 |     def test_set_page(self):
74 |         """Test case setter for the page attribute"""
75 |         self.line.page = 2
76 |         self.assertEqual(self.line.page, 2)
77 | 
78 | 
79 |     def test_set_page_id(self):
80 |         """Test case setter for the page_id attribute"""
81 |         self.line.page_id = "page-id"
82 |         self.assertEqual(self.line.page_id, "page-id")
83 | 
84 | 
85 |     def test_repr(self):
86 |         """Test case setter for the repr function"""
87 |         self.assertEqual(self.line.__repr__(), "TEST WORDS ADDED")
88 | 


--------------------------------------------------------------------------------
/tests/test_parse_no_fail.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import random
 4 | import PIL
 5 | import unittest
 6 | import boto3
 7 | import uuid
 8 | import logging
 9 | from tests.utils import get_fixture_path
10 | 
11 | from textractor import Textractor
12 | from textractor.data.constants import TextractFeatures
13 | from textractor.entities.document import Document
14 | from textractor.exceptions import InvalidProfileNameError, S3FilePathMissing
15 | from textractor.utils.s3_utils import upload_to_s3, delete_from_s3
16 | 
17 | class TestParseNoFail(unittest.TestCase):
18 |     """The tests below are fuzzing tests and are disabled in the CI test suite.
19 |     They are meant to generate random permutations of the input JSON response and
20 |     ensure that the parser does not raise any exception. Their results may be flaky
21 |     due to the randomness
22 | 
23 |     :param unittest: _description_
24 |     :type unittest: _type_
25 |     """
26 |     def setUp(self):
27 |         # insert credentials and filepaths here to run test
28 |         self.profile_name = "default"
29 |         self.current_directory = os.path.abspath(os.path.dirname(__file__))
30 |         self.saved_api_responses_directory = os.path.join(self.current_directory, "fixtures", "saved_api_responses")
31 |         self.deletion_rate = 0.5
32 | 
33 |     def test_parse_no_fail(self):
34 |         for asset in os.listdir(self.saved_api_responses_directory):
35 |             # Testing that no asset causes the output to contain duplicate words
36 |             with open(os.path.join(self.saved_api_responses_directory, asset)) as f:
37 |                 response = json.load(f)
38 | 
39 |             if not "Blocks" in response:
40 |                 continue
41 | 
42 |             index_to_remove = []
43 |             for i in range(len(response["Blocks"])):
44 |                 if response["Blocks"][i]["BlockType"] != "PAGE" and random.random() <= self.deletion_rate:
45 |                     index_to_remove.append(i)
46 | 
47 |             for i in sorted(index_to_remove, reverse=True):
48 |                 response["Blocks"].pop(i)
49 | 
50 |             document = Document.open(response)
51 |             document.get_text_and_words()
52 | 


--------------------------------------------------------------------------------
/tests/test_queries.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | from tests.utils import get_fixture_path
 4 | from textractor import Textractor
 5 | from textractor.entities.document import Document
 6 | from textractor.exceptions import InputError, InvalidProfileNameError
 7 | from textractor.data.constants import TextractFeatures
 8 | 
 9 | from .utils import save_document_to_fixture_path
10 | 
11 | class QueriesTests(unittest.TestCase):
12 |     def test_queries_as_strings(self):
13 |         profile_name = "default"
14 |         current_directory = os.path.abspath(os.path.dirname(__file__))
15 | 
16 |         if profile_name is None:
17 |             raise InvalidProfileNameError(
18 |                 "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_table.py."
19 |             )
20 | 
21 |         if os.environ.get("CALL_TEXTRACT"):
22 |             extractor = Textractor(profile_name=profile_name, kms_key_id="")
23 |             document = extractor.analyze_document(
24 |                 file_source=os.path.join(current_directory, "fixtures/single-page-1.png"),
25 |                 features=[TextractFeatures.QUERIES],
26 |                 queries=[
27 |                     "What is the name of the package?",
28 |                     "What is the title of the document?",
29 |                 ],
30 |             )
31 |         else:
32 |             document = Document.open(get_fixture_path())
33 | 
34 |         self.assertEqual(len(document.queries), 2)
35 |         self.assertEqual(document.queries[0].result.answer, "Textractor")
36 |         self.assertEqual(document.queries[1].result.answer, "Textractor Test Document")
37 | 
38 |     def test_bad_queries_as_strings(self):
39 |         profile_name = "default"
40 |         current_directory = os.path.abspath(os.path.dirname(__file__))
41 | 
42 |         if profile_name is None:
43 |             raise InvalidProfileNameError(
44 |                 "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_table.py."
45 |             )
46 | 
47 |         if os.environ.get("CALL_TEXTRACT"):
48 |             extractor = Textractor(profile_name=profile_name, kms_key_id="")
49 |             document = extractor.analyze_document(
50 |                 file_source=os.path.join(current_directory, "fixtures/single-page-1.png"),
51 |                 features=[TextractFeatures.QUERIES],
52 |                 queries=[
53 |                     "Lorem ipsum?",
54 |                     "The quick brown fox jumps over the lazy dog?",
55 |                 ],
56 |             )
57 |         else:
58 |             document = Document.open(get_fixture_path())
59 | 
60 |         self.assertEqual(len(document.queries), 2)
61 |         self.assertEqual(document.queries[0].result, None)
62 |         self.assertEqual(document.queries[1].result, None)
63 | 
64 |     @unittest.skipIf(not os.environ.get("CALL_TEXTRACT"), "Asynchronous requests can't be processed without calling Textract")
65 |     def test_query_feature_without_queries(self):
66 |         profile_name = "default"
67 |         current_directory = os.path.abspath(os.path.dirname(__file__))
68 | 
69 |         if profile_name is None:
70 |             raise InvalidProfileNameError(
71 |                 "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_table.py."
72 |             )
73 | 
74 |         extractor = Textractor(profile_name=profile_name, kms_key_id="")
75 |         with self.assertRaises(InputError):
76 |             document = extractor.analyze_document(
77 |                 file_source=os.path.join(current_directory, "fixtures/single-page-1.png"),
78 |                 features=[TextractFeatures.TABLES],
79 |                 queries=[
80 |                     "Lorem ipsum?",
81 |                     "The quick brown fox jumps over the lazy dog?",
82 |                 ],
83 |             )
84 | 


--------------------------------------------------------------------------------
/tests/test_selection_element.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from textractor.data.constants import SelectionStatus, SELECTED
 4 | from textractor.entities.bbox import BoundingBox
 5 | from textractor.entities.selection_element import SelectionElement
 6 | 
 7 | 
 8 | class TestSelectionElement(unittest.TestCase):
 9 |     def setUp(self):
10 |         self.checkbox_bb = {
11 |             "Width": 0.09679746627807617,
12 |             "Height": 0.008036591112613678,
13 |             "Left": 0.08719838410615921,
14 |             "Top": 0.5354593992233276,
15 |         }
16 | 
17 | 
18 |     def test_is_selected(self):
19 |         """Test case to return the selection status of the checkbox"""
20 |         checkbox = SelectionElement(
21 |             entity_id="checkbox-id",
22 |             bbox=BoundingBox.from_normalized_dict(self.checkbox_bb, spatial_object=None),
23 |             status=SelectionStatus.SELECTED,
24 |             confidence=100,
25 |         )
26 |         self.assertTrue(checkbox.is_selected())
27 | 
28 | 
29 |     def test_words(self):
30 |         """Test case to return the words of the checkbox"""
31 |         checkbox = SelectionElement(
32 |             entity_id="checkbox-id",
33 |             bbox=BoundingBox.from_normalized_dict(self.checkbox_bb, spatial_object=None),
34 |             status=SelectionStatus.SELECTED,
35 |             confidence=100,
36 |         )
37 |         self.assertEqual(checkbox.words, [])
38 | 
39 | 
40 |     def test_repr(self):
41 |         """Test case to return the selection status of the checkbox as string"""
42 |         checkbox = SelectionElement(
43 |             entity_id="checkbox-id",
44 |             bbox=BoundingBox.from_normalized_dict(self.checkbox_bb, spatial_object=None),
45 |             status=SelectionStatus.SELECTED,
46 |             confidence=100,
47 |         )
48 |         self.assertEqual(checkbox.__repr__(), "[X]")
49 | 
50 | 
51 |     def test_set_page(self):
52 |         """Test case setter for the page attribute"""
53 |         checkbox = SelectionElement(
54 |             entity_id="checkbox-id",
55 |             bbox=BoundingBox.from_normalized_dict(self.checkbox_bb, spatial_object=None),
56 |             status=SelectionStatus.SELECTED,
57 |             confidence=100,
58 |         )
59 |         checkbox.page = 2
60 |         self.assertEqual(checkbox.page, 2)
61 | 
62 | 
63 |     def test_set_page_id(self):
64 |         """Test case setter for the page_id attribute"""
65 |         checkbox = SelectionElement(
66 |             entity_id="checkbox-id",
67 |             bbox=BoundingBox.from_normalized_dict(self.checkbox_bb, spatial_object=None),
68 |             status=SelectionStatus.SELECTED,
69 |             confidence=100,
70 |         )
71 |         checkbox.page_id = "page-id"
72 |         self.assertEqual(checkbox.page_id, "page-id")
73 | 


--------------------------------------------------------------------------------
/tests/test_signature.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | from tests.utils import get_fixture_path
 4 | from textractor import Textractor
 5 | from textractor.entities.document import Document
 6 | from textractor.exceptions import InvalidProfileNameError
 7 | from textractor.data.constants import TextractFeatures
 8 | 
 9 | from .utils import save_document_to_fixture_path
10 | 
11 | class TestSignature(unittest.TestCase):
12 |     def test_signature(self):
13 |         # Insert credentials here to run test
14 |         profile_name = "default"
15 |         current_directory = os.path.abspath(os.path.dirname(__file__))
16 | 
17 |         if profile_name is None:
18 |             raise InvalidProfileNameError(
19 |                 "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_table.py."
20 |             )
21 | 
22 |         if os.environ.get("CALL_TEXTRACT"):
23 |             extractor = Textractor(
24 |                 profile_name=profile_name, kms_key_id=""
25 |             )
26 |             document = extractor.analyze_document(
27 |                 file_source=os.path.join(current_directory, "fixtures/signature.jpg"),
28 |                 features=[TextractFeatures.SIGNATURES],
29 |                 save_image=False,
30 |             )
31 |             save_document_to_fixture_path(document)
32 |         else:
33 |             document = Document.open(get_fixture_path())
34 | 
35 |         self.assertEqual(len(document.signatures), 1)
36 |         self.assertEqual(len(document.pages[0].signatures), 1)
37 | 


--------------------------------------------------------------------------------
/tests/test_value.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | 
  3 | from textractor.data.constants import TextTypes
  4 | from textractor.entities.value import Value
  5 | from textractor.entities.word import Word
  6 | from textractor.entities.bbox import BoundingBox
  7 | from textractor.visualizers.entitylist import EntityList
  8 | 
  9 | class TestValue(unittest.TestCase):
 10 |     def setUp(self):
 11 |         self.word_bb_1 = {
 12 |             "Width": 0.10809839516878128,
 13 |             "Height": 0.01363567914813757,
 14 |             "Left": 0.036161474883556366,
 15 |             "Top": 0.03439946100115776,
 16 |         }
 17 |         self.word_bb_2 = {
 18 |             "Width": 0.18033172190189362,
 19 |             "Height": 0.01742148958146572,
 20 |             "Left": 0.22032427787780762,
 21 |             "Top": 0.03645794093608856,
 22 |         }
 23 |         self.word_bb_3 = {
 24 |             "Width": 0.03744738921523094,
 25 |             "Height": 0.016524378210306168,
 26 |             "Left": 0.4087739884853363,
 27 |             "Top": 0.0368686243891716,
 28 |         }
 29 |         self.value_bb = {
 30 |             "Width": 0.02524515800178051,
 31 |             "Height": 0.01746263913810253,
 32 |             "Left": 0.18779051303863525,
 33 |             "Top": 0.4229613244533539,
 34 |         }
 35 | 
 36 |         self.word_1 = Word(
 37 |             entity_id="word-id-1",
 38 |             bbox=BoundingBox.from_normalized_dict(self.word_bb_1, spatial_object=None),
 39 |             text="TEST",
 40 |             text_type=TextTypes.PRINTED,
 41 |         )
 42 |         self.word_2 = Word(
 43 |             entity_id="word-id-2",
 44 |             bbox=BoundingBox.from_normalized_dict(self.word_bb_2, spatial_object=None),
 45 |             text="WORDS",
 46 |             text_type=TextTypes.HANDWRITING,
 47 |         )
 48 |         self.word_3 = Word(
 49 |             entity_id="word-id-3",
 50 |             bbox=BoundingBox.from_normalized_dict(self.word_bb_3, spatial_object=None),
 51 |             text="ADDED",
 52 |             text_type=TextTypes.PRINTED,
 53 |         )
 54 | 
 55 |         self.value = Value(
 56 |             entity_id="value-id",
 57 |             bbox=BoundingBox.from_normalized_dict(self.value_bb, spatial_object=None),
 58 |         )
 59 |         self.word_objs = [self.word_1, self.word_2, self.word_3]
 60 |         self.value.words = self.word_objs
 61 |         self.value.key_id = "key-id"
 62 |         self.value.contains_checkbox = False
 63 |         self.value.page = 2
 64 |         self.value.page_id = "page-id"
 65 | 
 66 | 
 67 |     def test_words(self):
 68 |         """Test case to add words to the Value field of a key-value pair"""
 69 |         self.assertEqual(self.value.words, EntityList(self.word_objs))
 70 |     
 71 |     
 72 |     def test_key_id(self):
 73 |         """Test case to access Key ID of a key-value pair"""
 74 |         self.assertEqual(self.value.key_id, "key-id")
 75 |     
 76 |     
 77 |     def test_contains_checkbox(self):
 78 |         self.assertFalse(self.value.contains_checkbox)
 79 |     
 80 | 
 81 |     def test_set_page(self):
 82 |         """Test case setter for the page attribute"""
 83 |         self.assertEqual(self.value.page, 2)
 84 |     
 85 |     
 86 |     def test_set_page_id(self):
 87 |         """Test case setter for the page_id attribute"""
 88 |         self.assertEqual(self.value.page_id, "page-id")
 89 |     
 90 |     
 91 |     def test_get_words_by_type(self):
 92 |         """Test case to retrieve words of a specific type in the Value field of a key-value pair"""
 93 |         self.assertEqual(
 94 |             self.value.get_words_by_type(text_type=TextTypes.PRINTED),
 95 |             EntityList([self.word_1, self.word_3])
 96 |         )
 97 |         self.assertEqual(
 98 |             self.value.get_words_by_type(text_type=TextTypes.HANDWRITING),
 99 |             EntityList([self.word_2])
100 |         )
101 | 
102 | 
103 |     def test_repr(self):
104 |         """Test case to retrieve words of the Value field in a key-value pair as text"""
105 |         self.assertEqual(self.value.__repr__(), "TEST WORDS ADDED")
106 | 


--------------------------------------------------------------------------------
/tests/test_visualizer.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import PIL
 3 | import unittest
 4 | import boto3
 5 | import uuid
 6 | from tests.utils import get_fixture_path
 7 | 
 8 | from textractor import Textractor
 9 | from textractor.entities.document import Document
10 | from textractor.entities.lazy_document import LazyDocument
11 | from textractor.data.constants import TextractFeatures
12 | from textractor.exceptions import InvalidProfileNameError, S3FilePathMissing
13 | 
14 | 
class TestTextractor(unittest.TestCase):
    """Smoke tests for the visualize() helpers; they only run against live Textract."""

    def setUp(self):
        """Create the Textractor client when CALL_TEXTRACT is set in the environment."""
        # insert credentials and filepaths here to run test
        if not os.environ.get("CALL_TEXTRACT"):
            return
        self.profile_name = "default"
        self.current_directory = os.path.abspath(os.path.dirname(__file__))
        self.extractor = Textractor(profile_name=self.profile_name, kms_key_id="")

    @unittest.skipUnless(os.environ.get("CALL_TEXTRACT"), "This test only work with CALL_TEXTRACT enabled")
    def test_detect_document_text(self):
        """Visualize words and lines of a locally stored single-page image."""
        image_path = os.path.join(self.current_directory, "fixtures/single-page-1.png")
        document = self.extractor.detect_document_text(file_source=image_path)

        # The results are not inspected; the calls only need to complete.
        document.words.visualize()
        document.words.visualize(with_text=False)
        (document.words + document.lines).visualize()

    @unittest.skipUnless(os.environ.get("CALL_TEXTRACT"), "This test only work with CALL_TEXTRACT enabled")
    def test_textractor_analyze_document(self):
        """Visualize tables and a page from analyze_document() on a local image."""
        image_path = os.path.join(self.current_directory, "fixtures/amzn_q2.png")
        document = self.extractor.analyze_document(
            file_source=image_path,
            features=[TextractFeatures.TABLES, TextractFeatures.FORMS],
        )

        # The results are not inspected; the calls only need to complete.
        document.tables.visualize()
        document.tables[0].visualize(with_text=False)
        document.pages[0].visualize()


--------------------------------------------------------------------------------
/tests/test_word.py:
--------------------------------------------------------------------------------
 1 | """Tests for all Word class methods."""
 2 | 
 3 | import unittest
 4 | 
 5 | from textractor.entities.bbox import BoundingBox
 6 | from textractor.data.constants import TextTypes
 7 | from textractor.entities.word import Word
 8 | 
 9 | class TestWord(unittest.TestCase):
10 |     def setUp(self):
11 |         self.bounding_box = {
12 |             "Width": 0.10809839516878128,
13 |             "Height": 0.01363567914813757,
14 |             "Left": 0.036161474883556366,
15 |             "Top": 0.03439946100115776,
16 |         }
17 | 
18 | 
19 |     def test_set_text(self):
20 |         """Test case setter for the text attribute"""
21 |         word = Word(
22 |             entity_id="word-id",
23 |             bbox=BoundingBox.from_normalized_dict(self.bounding_box, spatial_object=None),
24 |         )
25 |         word.text = "word-test"
26 |         self.assertEqual(word.text, "word-test")
27 | 
28 | 
29 |     def test_set_text_type(self):
30 |         """Test case setter for the text type attribute"""
31 |         word = Word(
32 |             entity_id="word-id",
33 |             bbox=BoundingBox.from_normalized_dict(self.bounding_box, spatial_object=None),
34 |         )
35 |         word.text_type = TextTypes.HANDWRITING
36 |         self.assertEqual(word.text_type, TextTypes.HANDWRITING)
37 | 
38 | 
39 |     def test_set_page(self):
40 |         """Test case setter for the page attribute"""
41 |         word = Word(
42 |             entity_id="word-id",
43 |             bbox=BoundingBox.from_normalized_dict(self.bounding_box, spatial_object=None),
44 |         )
45 |         word.page = 2
46 |         self.assertEqual(word.page, 2)
47 | 
48 | 
49 |     def test_set_page_id(self):
50 |         """Test case setter for the page_id attribute"""
51 |         word = Word(
52 |             entity_id="word-id",
53 |             bbox=BoundingBox.from_normalized_dict(self.bounding_box, spatial_object=None),
54 |         )
55 |         word.page_id = "page-id"
56 |         self.assertEqual(word.page_id, "page-id")
57 | 
58 | 
59 |     def test_repr(self):
60 |         """Test case setter for the repr function"""
61 |         word = Word(
62 |             entity_id="word-id",
63 |             bbox=BoundingBox.from_normalized_dict(self.bounding_box, spatial_object=None),
64 |         )
65 |         word.text = "word-test"
66 |         self.assertEqual(word.__repr__(), "word-test")
67 | 


--------------------------------------------------------------------------------
/tests/test_word_ordering.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | import os
 3 | import PIL
 4 | import unittest
 5 | import json
 6 | from tests.utils import get_fixture_path
 7 | 
 8 | from textractor import Textractor
 9 | from textractor.entities.document import Document
10 | from textractor.entities.lazy_document import LazyDocument
11 | from textractor.visualizers.entitylist import EntityList
12 | from textractor.data.constants import TextractFeatures
13 | from textractor.exceptions import InvalidProfileNameError, S3FilePathMissing
14 | from textractor.utils.s3_utils import upload_to_s3, delete_from_s3
15 | 
class TestWordOrdering(unittest.TestCase):
    """Checks the order of the words returned for a table cell."""

    def setUp(self):
        """Prepare the S3 client and Textractor when CALL_TEXTRACT is set.

        :raises InvalidProfileNameError: if ``profile_name`` has been set to None.
        """
        # insert credentials and filepaths here to run test
        self.profile_name = "default"
        self.bucket_name = os.environ.get("S3_BUCKET", "textractor-tests")
        if os.environ.get("CALL_TEXTRACT"):
            # Validate the profile before it is used to create any AWS client.
            if self.profile_name is None:
                raise InvalidProfileNameError(
                    "Textractor could not be initialized. Populate profile_name with a valid input in tests/test_word_ordering.py."
                )
            self.s3_client = boto3.session.Session(
                profile_name=self.profile_name
            ).client("s3", region_name="us-west-2")
            self.current_directory = os.path.abspath(os.path.dirname(__file__))
            self.extractor = Textractor(
                profile_name=self.profile_name, kms_key_id=""
            )

    def test_word_ordering_in_cell(self):
        """The first cell of the first table reads "Are those Words in order?".

        With CALL_TEXTRACT set, the fixture PDF is analyzed and the response is
        written to the fixture path; otherwise the saved response is loaded.
        """
        if os.environ.get("CALL_TEXTRACT"):
            document = self.extractor.analyze_document(
                file_source=os.path.join(self.current_directory, "fixtures/reading_order.pdf"),
                features=[TextractFeatures.TABLES]
            )
            # get_fixture_path() derives the file name from the calling
            # function, so it has to be called from this method directly.
            with open(get_fixture_path(), "w") as fh:
                json.dump(document.response, fh)
        else:
            document = Document.open(get_fixture_path())

        self.assertEqual(document.tables[0].table_cells[0].text.strip(), "Are those Words in order?")
47 | 
48 | 


--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | import json
 3 | import os
 4 | 
 5 | def get_fixture_path():
 6 |     """Uses reflection to get correct saved response file
 7 | 
 8 |     :return: Path to the saved response file for the calling function
 9 |     :rtype: str
10 |     """
11 |     return os.path.join(
12 |         os.path.abspath(os.path.dirname(__file__)),
13 |         f"fixtures/saved_api_responses/{inspect.currentframe().f_back.f_code.co_name}.json"
14 |     )
15 | 
def save_document_to_fixture_path(document):
    """Write ``document.response`` as JSON to the caller's fixture file.

    The file name is the name of the function that called this helper, so it
    must be invoked directly from the test function whose fixture is saved.

    :param document: Object exposing the API response as ``document.response``
    """
    caller_name = inspect.currentframe().f_back.f_code.co_name
    tests_dir = os.path.abspath(os.path.dirname(__file__))
    fixture_path = os.path.join(tests_dir, f"fixtures/saved_api_responses/{caller_name}.json")
    with open(fixture_path, "w") as f:
        json.dump(document.response, f)


--------------------------------------------------------------------------------
/textractor/__init__.py:
--------------------------------------------------------------------------------
# Version string of the textractor package.
__version__ = "1.9.2"
2 | 
3 | from .textractor import Textractor
4 | 


--------------------------------------------------------------------------------
/textractor/cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/textractor/cli/__init__.py


--------------------------------------------------------------------------------
/textractor/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/textractor/data/__init__.py


--------------------------------------------------------------------------------
/textractor/data/html_linearization_config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from dataclasses import dataclass
 3 | 
 4 | from textractor.data.text_linearization_config import TextLinearizationConfig
 5 | 
 6 | @dataclass
 7 | class HTMLLinearizationConfig(TextLinearizationConfig):
 8 |     """
 9 |     This :class:`HTMLLinearizationConfig` is a convenience configuration for converting a Document or DocumentEntity to HTML.
10 |     For a description of the parameters see :class:`TextLinearizationConfig`.
11 |     """
12 | 
13 |     title_prefix: str = "<h1>"
14 | 
15 |     title_suffix: str = "</h1>"
16 | 
17 |     header_prefix: str = "<h1>"
18 |     
19 |     header_suffix: str = "</h1>"
20 |     
21 |     section_header_prefix: str = "<h2>"
22 | 
23 |     section_header_suffix: str = "</h2>"
24 | 
25 |     text_prefix: str = "<p>"
26 | 
27 |     text_suffix: str = "</p>"
28 | 
29 |     entity_layout_prefix: str = "<p>"
30 | 
31 |     entity_layout_suffix: str = "</p>"
32 | 
33 |     table_prefix: str = "<table>"
34 | 
35 |     table_suffix: str = "</table>"
36 | 
37 |     table_row_prefix: str = "<tr>"
38 | 
39 |     table_row_suffix: str = "</tr>"
40 | 
41 |     table_cell_header_prefix: str = "<th>"
42 | 
43 |     table_cell_header_suffix: str = "</th>"
44 | 
45 |     table_cell_prefix: str = "<td>"
46 | 
47 |     table_cell_suffix: str = "</td>"
48 | 
49 |     table_column_separator: str = ""
50 | 
51 |     table_linearization_format: str = "html"
52 |     
53 |     table_add_title_as_caption: bool = True
54 |     
55 |     table_add_footer_as_paragraph: bool = True
56 | 
57 |     table_column_separator: str = ""
58 | 
59 |     list_layout_prefix: str = "<div>"
60 |     
61 |     list_layout_suffix: str = "</div>"
62 |     
63 |     table_layout_prefix: str = "<div>"
64 | 
65 |     table_layout_suffix: str = "</div>"
66 |     
67 |     key_value_layout_prefix: str = "<div>"
68 | 
69 |     key_value_layout_suffix: str = "</div>"
70 |     
71 |     figure_layout_prefix: str = "<div>"
72 |      
73 |     figure_layout_suffix: str = "</div>"
74 | 
75 |     footer_layout_prefix: str = "<div>"
76 |      
77 |     footer_layout_suffix: str = "</div>"
78 | 
79 |     page_num_prefix: str = "<div>"
80 |     
81 |     page_num_suffix: str = "</div>"
82 |     
83 |     add_ids_to_html_tags: bool = False #: Adds Textract block id to the HTML markup. Only supported for HTML.
84 |     
85 |     add_short_ids_to_html_tags: bool = False #: Adds the truncated (first 8 characters) Textract block id to the HTML markup. Only supported for HTML
86 | 


--------------------------------------------------------------------------------
/textractor/data/markdown_linearization_config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from dataclasses import dataclass
 3 | 
 4 | from textractor.data.text_linearization_config import TextLinearizationConfig
 5 | 
@dataclass
class MarkdownLinearizationConfig(TextLinearizationConfig):
    """
    This :class:`MarkdownLinearizationConfig` is a convenience configuration for converting a Document or DocumentEntity to Markdown.
    For a description of the parameters see :class:`TextLinearizationConfig`.
    """

    # Markdown level-1 heading marker used as the title prefix.
    title_prefix: str = "# "

    # NOTE(review): presumably selects the Markdown table rendering; confirm against TextLinearizationConfig.
    table_linearization_format: str = "markdown"

    # Markdown level-2 heading marker used as the section header prefix.
    section_header_prefix: str = "## "

    # NOTE(review): exact effect is defined by TextLinearizationConfig; not visible here.
    table_remove_column_headers: bool = True


--------------------------------------------------------------------------------
/textractor/entities/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/textractor/entities/__init__.py


--------------------------------------------------------------------------------
/textractor/entities/identity_document.py:
--------------------------------------------------------------------------------
 1 | """The IdentityDocument class is the object representation of an AnalyzeID response. It is similar to a dictionary. Despite its name, it does not inherit from Document, as the AnalyzeID response does not contain position information."""
 2 | 
 3 | import os
 4 | from typing import List, Dict, Union
 5 | from textractor.data.constants import AnalyzeIDFields
 6 | from textractor.entities.bbox import SpatialObject
 7 | from textractor.entities.identity_field import IdentityField
 8 | 
 9 | from textractor.exceptions import InputError
10 | 
11 | 
class IdentityDocument(SpatialObject):
    """
    Represents the description of a single ID document.
    """

    def __init__(self, fields=None):
        """
        Creates a new identity document from its fields.

        :param fields: Either a list of :class:`IdentityField` objects or a dictionary whose
                       values are dictionaries with the "key", "value" and "confidence" entries.
                       ``None`` or an empty container yields a document without fields.
        :raises InputError: If ``fields`` is of any other type.
        """
        super().__init__(width=0, height=0)
        self._fields = IdentityDocument._fields_to_dict(fields)

    @classmethod
    def _fields_to_dict(cls, fields: Union[List[IdentityField], Dict[str, dict]]):
        """
        Normalizes the supported inputs to a dictionary of :class:`IdentityField` indexed by key.

        :param fields: List of IdentityField or dictionary of field dictionaries.
        :return: Dictionary mapping each field key to its IdentityField.
        :rtype: Dict[str, IdentityField]
        :raises InputError: If ``fields`` is neither of the supported types.
        """
        if not fields:
            return {}
        if isinstance(fields, list) and isinstance(fields[0], IdentityField):
            return {id_field.key: id_field for id_field in fields}
        if isinstance(fields, dict):
            # The outer keys are ignored, each inner dictionary carries its own key.
            return {
                id_field["key"]: IdentityField(
                    id_field["key"],
                    id_field["value"],
                    id_field["confidence"],
                )
                for id_field in fields.values()
            }
        raise InputError(
            f"fields needs to be a list of IdentityFields or a dictionary of dictionaries, not {type(fields)}"
        )

    @property
    def fields(self) -> Dict[str, IdentityField]:
        """
        :return: Dictionary mapping each field key to its IdentityField.
        :rtype: Dict[str, IdentityField]
        """
        return self._fields

    @fields.setter
    def fields(self, fields):
        """
        Replaces the fields of the document. The value is stored as is, without conversion.

        :param fields: New fields of the document.
        """
        self._fields = fields

    def keys(self) -> List[str]:
        """
        :return: Keys of all the fields in the document.
        :rtype: List[str]
        """
        return list(self._fields)

    def values(self) -> List[str]:
        """
        :return: Values of all the fields in the document.
        :rtype: List[str]
        """
        return [field.value for field in self._fields.values()]

    def __getitem__(self, key: Union[str, AnalyzeIDFields]) -> str:
        """
        :param key: Field key, as a string or as an AnalyzeIDFields member (its ``.value`` is used).
        :return: Value of the matching field.
        :rtype: str
        :raises KeyError: If the document has no such field.
        """
        return self._fields[key if isinstance(key, str) else key.value].value

    def get(self, key: Union[str, AnalyzeIDFields]) -> Union[str, None]:
        """
        :param key: Field key, as a string or as an AnalyzeIDFields member (its ``.value`` is used).
        :return: Value of the matching field, or None if the document has no such field.
        :rtype: Union[str, None]
        """
        result = self._fields.get(key if isinstance(key, str) else key.value)
        if result is None:
            return None
        return result.value

    def __repr__(self):
        """
        :return: One "key: field" line per field.
        :rtype: str
        """
        return os.linesep.join([f"{str(k)}: {str(v)}" for k, v in self.fields.items()])
73 | 


--------------------------------------------------------------------------------
/textractor/entities/identity_field.py:
--------------------------------------------------------------------------------
 1 | class IdentityField:
 2 |     def __init__(self, key, value, confidence):
 3 |         self._key = key
 4 |         self._value = value
 5 |         self._confidence = confidence
 6 | 
 7 |     @property
 8 |     def key(self) -> str:
 9 |         return self._key
10 | 
11 |     @property
12 |     def value(self) -> str:
13 |         return self._value
14 | 
15 |     @property
16 |     def confidence(self) -> float:
17 |         return self._confidence
18 | 
19 |     def __repr__(self) -> str:
20 |         return self.value
21 | 


--------------------------------------------------------------------------------
/textractor/entities/line.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Represents a single :class:`Line` Entity within the :class:`Document`. 
  3 | The Textract API response returns groups of words as LINE BlockTypes. They contain :class:`Word` entities as children. 
  4 | 
  5 | This class contains the associated metadata with the :class:`Line` entity including the entity ID, 
  6 | bounding box information, child words, page number, Page ID and confidence of detection.
  7 | """
  8 | 
  9 | import logging
 10 | from typing import List
 11 | 
 12 | from textractor.entities.word import Word
 13 | from textractor.data.constants import TextTypes
 14 | from textractor.entities.bbox import BoundingBox
 15 | from textractor.exceptions import InputError
 16 | from textractor.entities.document_entity import DocumentEntity
 17 | from textractor.visualizers.entitylist import EntityList
 18 | from textractor.utils.html_utils import escape_text
 19 | from textractor.data.text_linearization_config import TextLinearizationConfig
 20 | 
 21 | class Line(DocumentEntity):
 22 |     """
 23 |     To create a new :class:`Line` object we need the following:
 24 | 
 25 |     :param entity_id: Unique identifier of the Line entity.
 26 |     :type entity_id: str
 27 |     :param bbox: Bounding box of the line entity.
 28 |     :type bbox: BoundingBox
 29 |     :param words: List of the Word entities present in the line
 30 |     :type words: list, optional
 31 |     :param confidence: confidence with which the entity was detected.
 32 |     :type confidence: float, optional
 33 |     """
 34 | 
 35 |     def __init__(
 36 |         self,
 37 |         entity_id: str,
 38 |         bbox: BoundingBox,
 39 |         words: List[Word] = None,
 40 |         confidence: float = 0,
 41 |     ):
 42 |         super().__init__(entity_id, bbox)
 43 |         if words is not None and len(words) > 0:
 44 |             self._children: List[Word] = sorted(words, key=lambda x: (x.bbox.x, x.bbox.y))
 45 |         else:
 46 |             self._children = []
 47 | 
 48 |         self._confidence = confidence / 100
 49 |         self._page = None
 50 |         self._page_id = None
 51 | 
 52 |     @property
 53 |     def text(self):
 54 |         """
 55 |         :return: Returns the text transcription of the :class:`Line` entity.
 56 |         :rtype: str
 57 |         """
 58 |         return " ".join([word.text for word in self.words])
 59 | 
 60 |     @property
 61 |     def words(self):
 62 |         """
 63 |         :return: Returns the line's children
 64 |         :rtype: List[Word]
 65 |         """
 66 |         return self._children
 67 | 
 68 |     def get_text_and_words(self, config: TextLinearizationConfig = TextLinearizationConfig()):
 69 |         if not self.bbox:
 70 |             self.bbox = BoundingBox.enclosing_bbox(self.words)
 71 |         for w in self.words:
 72 |             w.line_id = self.id
 73 |             w.line_bbox = self.bbox
 74 |         return escape_text(self.text, config), self.words
 75 | 
 76 |     @property
 77 |     def page(self):
 78 |         """
 79 |         :return: Returns the page number of the page the :class:`Line` entity is present in.
 80 |         :rtype: int
 81 |         """
 82 |         return self._page
 83 | 
 84 |     @page.setter
 85 |     def page(self, page_num: int):
 86 |         """
 87 |         Sets the page number attribute of the Line entity.
 88 | 
 89 |         :param page_num: Page number where the Line entity exists.
 90 |         :type page_num: int
 91 |         """
 92 |         self._page = page_num
 93 | 
 94 |     @property
 95 |     def page_id(self) -> str:
 96 |         """
 97 |         :return: Returns the Page ID attribute of the page which the entity belongs to.
 98 |         :rtype: str
 99 |         """
100 |         return self._page_id
101 | 
102 |     @page_id.setter
103 |     def page_id(self, page_id: str):
104 |         """
105 |         Sets the Page ID of the :class:`Line` entity.
106 | 
107 |         :param page_id: Page ID of the page the entity belongs to.
108 |         :type page_id: str
109 |         """
110 |         self._page_id = page_id
111 | 
112 |     def get_words_by_type(self, text_type: TextTypes = TextTypes.PRINTED) -> List[Word]:
113 |         """
114 |         :param text_type: TextTypes.PRINTED or TextTypes.HANDWRITING
115 |         :type text_type: TextTypes
116 |         :return: Returns EntityList of Word entities that match the input text type.
117 |         :rtype: EntityList[Word]
118 |         """
119 |         if not isinstance(text_type, TextTypes):
120 |             raise InputError(
121 |                 "text_type parameter should be of TextTypes type. Find input choices from textractor.data.constants"
122 |             )
123 | 
124 |         if not self.words:
125 |             return []
126 |         return EntityList([word for word in self.words if word.text_type == text_type])
127 | 
128 |     def __repr__(self):
129 |         """
130 |         :return: String representation of the Line entity.
131 |         :rtype: str
132 |         """
133 |         return " ".join([word.text for word in self.words])
134 | 


--------------------------------------------------------------------------------
/textractor/entities/linearizable.py:
--------------------------------------------------------------------------------
 1 | """
 2 | :class:`Linearizable` is a class that defines how a component can be linearized (converted to text)
 3 | """
 4 | 
 5 | from abc import ABC, abstractmethod
 6 | from typing import Dict, List, Tuple
 7 | 
 8 | from textractor.data.text_linearization_config import TextLinearizationConfig
 9 | from textractor.data.html_linearization_config import HTMLLinearizationConfig
10 | from textractor.data.markdown_linearization_config import MarkdownLinearizationConfig
11 | 
class Linearizable(ABC):
    """Interface of the components that can be linearized (converted to text)."""

    def get_text(
        self, config: TextLinearizationConfig = TextLinearizationConfig()
    ) -> str:
        """
        Returns the linearized text of the entity

        :param config: Text linearization configuration
        :type config: TextLinearizationConfig
        :return: Linearized text of the entity
        :rtype: str
        """
        # Only the text is needed here, the matching words are dropped.
        linearized_text, _words = self.get_text_and_words(config=config)
        return linearized_text

    @property
    def text(self) -> str:
        """
        Maps to .get_text()

        :return: Returns the linearized text of the entity
        :rtype: str
        """
        return self.get_text()

    def to_html(
        self,
        config: HTMLLinearizationConfig = HTMLLinearizationConfig()
    ) -> str:
        """
        Returns the HTML representation of the entity

        :param config: HTML linearization configuration
        :type config: HTMLLinearizationConfig
        :return: HTML text of the entity
        :rtype: str
        """
        html_text = self.get_text(config)
        return html_text

    def to_markdown(
        self,
        config: MarkdownLinearizationConfig = MarkdownLinearizationConfig()
    ) -> str:
        """
        Returns the markdown representation of the entity

        :param config: Markdown linearization configuration
        :type config: MarkdownLinearizationConfig
        :return: Markdown text of the entity
        :rtype: str
        """
        markdown_text = self.get_text(config)
        return markdown_text

    @abstractmethod
    def get_text_and_words(
        self, config: TextLinearizationConfig = TextLinearizationConfig()
    ) -> Tuple[str, List]:
        """
        Used for linearization, returns the linearized text of the entity and the matching words

        :param config: Text linearization configuration
        :type config: TextLinearizationConfig
        :return: Tuple of text and word list
        :rtype: Tuple[str, List[Word]]
        """
72 | 


--------------------------------------------------------------------------------
/textractor/entities/page_layout.py:
--------------------------------------------------------------------------------
  1 | from textractor.entities.layout import Layout
  2 | from textractor.visualizers.entitylist import EntityList
  3 | 
  4 | 
  5 | class PageLayout:
  6 |     """
  7 |     Object representation of the layout components detected in the table.
  8 |     """
  9 | 
 10 |     def __init__(
 11 |         self,
 12 |         titles: EntityList[Layout] = EntityList([]),
 13 |         headers: EntityList[Layout] = EntityList([]),
 14 |         footers: EntityList[Layout] = EntityList([]),
 15 |         section_headers: EntityList[Layout] = EntityList([]),
 16 |         page_numbers: EntityList[Layout] = EntityList([]),
 17 |         lists: EntityList[Layout] = EntityList([]),
 18 |         figures: EntityList[Layout] = EntityList([]),
 19 |         tables: EntityList[Layout] = EntityList([]),
 20 |         key_values: EntityList[Layout] = EntityList([]),
 21 |     ):
 22 |         self._titles = titles
 23 |         self._headers = headers
 24 |         self._footers = footers
 25 |         self._section_headers = section_headers
 26 |         self._page_numbers = page_numbers
 27 |         self._lists = lists
 28 |         self._figures = figures
 29 |         self._tables = tables
 30 |         self._key_values = key_values
 31 | 
 32 |     @property
 33 |     def titles(self) -> EntityList[Layout]:
 34 |         """Titles detected in the Page
 35 | 
 36 |         :return: EntityList of titles detected in the page
 37 |         :rtype: EntityList[Layout]
 38 |         """
 39 |         return self._titles
 40 | 
 41 |     @property
 42 |     def headers(self) -> EntityList[Layout]:
 43 |         """Headers detected in the Page
 44 | 
 45 |         :return: EntityList of headers detected in the page
 46 |         :rtype: EntityList[Layout]
 47 |         """
 48 |         return self._headers
 49 | 
 50 |     @property
 51 |     def footers(self) -> EntityList[Layout]:
 52 |         """Footers detected in the Page
 53 | 
 54 |         :return: EntityList of footers detected in the page
 55 |         :rtype: EntityList[Layout]
 56 |         """
 57 |         return self._footers
 58 | 
 59 |     @property
 60 |     def section_headers(self) -> EntityList[Layout]:
 61 |         """Section headers detected in the Page
 62 | 
 63 |         :return: EntityList of section headers detected in the page
 64 |         :rtype: EntityList[Layout]
 65 |         """
 66 |         return self._section_headers
 67 | 
 68 |     @property
 69 |     def page_numbers(self) -> EntityList[Layout]:
 70 |         """Page numbers detected in the Page
 71 | 
 72 |         :return: EntityList of page numbers detected in the page
 73 |         :rtype: EntityList[Layout]
 74 |         """
 75 |         return self._page_numbers
 76 | 
 77 |     @property
    def lists(self) -> EntityList[Layout]:
        """Lists detected in the Page

        Hands back the object stored in ``self._lists`` as-is (no copy is made).

        :return: EntityList of lists detected in the page
        :rtype: EntityList[Layout]
        """
        return self._lists
 85 | 
 86 |     @property
    def figures(self) -> EntityList[Layout]:
        """Figures detected in the Page

        Hands back the object stored in ``self._figures`` as-is (no copy is made).

        :return: EntityList of figures detected in the page
        :rtype: EntityList[Layout]
        """
        return self._figures
 94 | 
 95 |     @property
    def tables(self) -> EntityList[Layout]:
        """Tables detected in the Page. This includes Tables detected by the AnalyzeDocument Tables API if used.

        Hands back the object stored in ``self._tables`` as-is (no copy is made).

        :return: EntityList of tables detected in the page
        :rtype: EntityList[Layout]
        """
        return self._tables
103 | 
104 |     @property
    def key_values(self) -> EntityList[Layout]:
        """KeyValues detected in the Page

        Hands back the object stored in ``self._key_values`` as-is (no copy is made).

        :return: EntityList of keyvalues detected in the page
        :rtype: EntityList[Layout]
        """
        return self._key_values
112 | 


--------------------------------------------------------------------------------
/textractor/entities/query.py:
--------------------------------------------------------------------------------
"""
The :class:`Query` entity is a document entity that merges the QUERY and QUERY_RESULT blocks. A :class:`Query`
holds the query text, its alias and, when a result exists, the associated :class:`QueryResult`.

This class contains the associated metadata with the :class:`Query` entity including the entity ID,
bounding box information of the result, page number and Page ID.
"""
  8 | 
  9 | from typing import List, Optional, Tuple
 10 | from textractor.data.text_linearization_config import TextLinearizationConfig
 11 | 
 12 | from textractor.entities.query_result import QueryResult
 13 | from textractor.entities.bbox import BoundingBox
 14 | from textractor.entities.document_entity import DocumentEntity
 15 | 
 16 | 
 17 | class Query(DocumentEntity):
 18 |     """
 19 |     The Query object merges QUERY and QUERY_RESULT blocks.
 20 |     To create a new :class:`Query` object we require the following:
 21 | 
 22 |     :param entity_id: Unique identifier of the Query entity.
 23 |     :type entity_id: str
 24 |     :param bbox: Bounding box of the KeyValue entity.
 25 |     :type bbox: BoundingBox
 26 |     :param contains_checkbox: True/False to indicate if the value is a checkbox.
 27 |     :type contains_checkbox: bool
 28 |     :param value: Value object that maps to the KeyValue entity.
 29 |     :type value: Value
 30 |     :param confidence: confidence with which the entity was detected.
 31 |     :type confidence: float
 32 |     """
 33 | 
 34 |     def __init__(
 35 |         self,
 36 |         entity_id: str,
 37 |         query: str,
 38 |         alias: str,
 39 |         query_result: Optional[QueryResult],
 40 |         result_bbox: Optional[BoundingBox],
 41 |     ):
 42 |         super().__init__(entity_id, result_bbox)
 43 | 
 44 |         self.query = query
 45 |         self.alias = alias
 46 |         self.result = query_result
 47 |         self._page = None
 48 |         self._page_id = None
 49 | 
 50 |     @property
 51 |     def page(self) -> int:
 52 |         """
 53 |         :return: Returns the page number of the page the :class:`Table` entity is present in.
 54 |         :rtype: int
 55 |         """
 56 |         return self._page
 57 | 
 58 |     @page.setter
 59 |     def page(self, page_num: int):
 60 |         """
 61 |         Sets the page number attribute of the :class:`Table` entity.
 62 | 
 63 |         :param page_num: Page number where the Table entity exists.
 64 |         :type page_num: int
 65 |         """
 66 |         self._page = page_num
 67 | 
 68 |     @property
 69 |     def page_id(self) -> str:
 70 |         """
 71 |         :return: Returns the Page ID attribute of the page which the entity belongs to.
 72 |         :rtype: str
 73 |         """
 74 |         return self._page_id
 75 | 
 76 |     @page_id.setter
 77 |     def page_id(self, page_id: str):
 78 |         """
 79 |         Sets the Page ID of the :class:`Table` entity.
 80 | 
 81 |         :param page_id: Page ID of the page the entity belongs to.
 82 |         :type page_id: str
 83 |         """
 84 |         self._page_id = page_id
 85 | 
 86 |     @property
 87 |     def has_result(self) -> bool:
 88 |         """
 89 |         :return: Returns whether there was a result associated with the query
 90 |         :rtype: bool
 91 |         """
 92 |         return self.result is not None
 93 | 
 94 |     def __repr__(self) -> str:
 95 |         """
 96 |         :return: Returns Query object as a formatted string.
 97 |         :rtype: str
 98 |         """
 99 | 
100 |         if self.result:
101 |             return f"{self.query} {self.result.answer}"
102 |         else:
103 |             return f"{self.query}"
104 | 
105 |     def get_text_and_words(
106 |         self, config: TextLinearizationConfig = TextLinearizationConfig()
107 |     ) -> Tuple[str, List]:
108 |         """
109 |         Used for linearization, returns the linearized text of the Query and the matching words
110 | 
111 |         :return: Tuple of text and word list
112 |         :rtype: Tuple[str, List[Word]]
113 |         """
114 |         return f"{self.query} {self.result.answer}", []
115 | 


--------------------------------------------------------------------------------
/textractor/entities/query_result.py:
--------------------------------------------------------------------------------
"""
The :class:`QueryResult` entity is a document entity representing a QUERY_RESULT block, i.e. the answer
that was found for a :class:`Query`.

This class contains the associated metadata with the :class:`QueryResult` entity including the entity ID,
bounding box information, answer, page number, Page ID and confidence of detection.
"""
  8 | 
  9 | from typing import List, Tuple
 10 | from textractor.data.text_linearization_config import TextLinearizationConfig
 11 | from textractor.entities.bbox import BoundingBox
 12 | from textractor.entities.document_entity import DocumentEntity
 13 | 
 14 | 
 15 | class QueryResult(DocumentEntity):
 16 |     """
 17 |     The QueryResult object represents QUERY_RESULT blocks.
 18 |     To create a new :class:`QueryResult` object we require the following:
 19 | 
 20 |     :param entity_id: Unique identifier of the Query entity.
 21 |     :type entity_id: str
 22 |     :param bbox: Bounding box of the QueryResult entity.
 23 |     :type bbox: BoundingBox
 24 |     :param contains_checkbox: True/False to indicate if the value is a checkbox.
 25 |     :type contains_checkbox: bool
 26 |     :param value: Value object that maps to the QueryResult entity.
 27 |     :type value: Value
 28 |     :param confidence: confidence with which the entity was detected.
 29 |     :type confidence: float
 30 |     """
 31 | 
 32 |     def __init__(
 33 |         self,
 34 |         entity_id: str,
 35 |         confidence: float,
 36 |         result_bbox: BoundingBox,
 37 |         answer: str,
 38 |     ):
 39 |         super().__init__(entity_id, result_bbox)
 40 | 
 41 |         self.answer = answer
 42 |         self._confidence = confidence / 100
 43 |         self._page = None
 44 |         self._page_id = None
 45 | 
 46 |     @property
 47 |     def page(self) -> int:
 48 |         """
 49 |         :return: Returns the page number of the page the :class:`Table` entity is present in.
 50 |         :rtype: int
 51 |         """
 52 |         return self._page
 53 | 
 54 |     @page.setter
 55 |     def page(self, page_num: int):
 56 |         """
 57 |         Sets the page number attribute of the :class:`Table` entity.
 58 | 
 59 |         :param page_num: Page number where the Table entity exists.
 60 |         :type page_num: int
 61 |         """
 62 |         self._page = page_num
 63 | 
 64 |     @property
 65 |     def page_id(self) -> str:
 66 |         """
 67 |         :return: Returns the Page ID attribute of the page which the entity belongs to.
 68 |         :rtype: str
 69 |         """
 70 |         return self._page_id
 71 | 
 72 |     @page_id.setter
 73 |     def page_id(self, page_id: str):
 74 |         """
 75 |         Sets the Page ID of the :class:`Table` entity.
 76 | 
 77 |         :param page_id: Page ID of the page the entity belongs to.
 78 |         :type page_id: str
 79 |         """
 80 |         self._page_id = page_id
 81 | 
 82 |     def __repr__(self) -> str:
 83 |         """
 84 |         :return: Returns Query object as a formatted string.
 85 |         :rtype: str
 86 |         """
 87 | 
 88 |         return f"{self.answer}"
 89 | 
 90 |     def get_text_and_words(
 91 |         self, config: TextLinearizationConfig = TextLinearizationConfig()
 92 |     ) -> Tuple[str, List]:
 93 |         """
 94 |         Used for linearization, returns the linearized text of the QueryResult and the matching words
 95 | 
 96 |         :return: Tuple of text and word list
 97 |         :rtype: Tuple[str, List[Word]]
 98 |         """
 99 |         return self.answer, []
100 | 


--------------------------------------------------------------------------------
/textractor/entities/selection_element.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Represents a single :class:`SelectionElement`/Checkbox/Clickable Entity within the :class:`Document`.
  3 | 
  4 | This class contains the associated metadata with the :class:`SelectionElement` entity including the entity ID, 
  5 | bounding box information, selection status, page number, Page ID and confidence of detection.
  6 | """
  7 | 
  8 | import uuid
  9 | 
 10 | from typing import List
 11 | from textractor.data.text_linearization_config import TextLinearizationConfig
 12 | from textractor.entities.word import Word
 13 | from textractor.entities.line import Line
 14 | from textractor.entities.bbox import BoundingBox
 15 | from textractor.data.constants import SELECTED, NOT_SELECTED, SelectionStatus
 16 | from textractor.entities.document_entity import DocumentEntity
 17 | 
 18 | 
 19 | class SelectionElement(DocumentEntity):
 20 |     """
 21 |     To create a new :class:`SelectionElement` object we need the following:
 22 | 
 23 |     :param entity_id: Unique identifier of the SelectionElement entity.
 24 |     :type entity_id: str
 25 |     :param bbox: Bounding box of the SelectionElement
 26 |     :type bbox: BoundingBox
 27 |     :param status: SelectionStatus.SELECTED / SelectionStatus.NOT_SELECTED
 28 |     :type status: SelectionStatus
 29 |     :param confidence: Confidence with which this entity is detected.
 30 |     :type confidence: float
 31 |     """
 32 | 
 33 |     def __init__(
 34 |         self,
 35 |         entity_id: str,
 36 |         bbox: BoundingBox,
 37 |         status: SelectionStatus,
 38 |         confidence: float = 0,
 39 |     ):
 40 |         super().__init__(entity_id, bbox)
 41 |         self.key_id = None
 42 |         self.value_id = None
 43 |         self.status = status
 44 |         self._confidence = confidence / 100
 45 |         self._page = None
 46 |         self._page_id = None
 47 | 
 48 |     def is_selected(self) -> bool:
 49 |         """
 50 |         :return: Returns True / False depending on selection status of the SelectionElement.
 51 |         :rtype: bool
 52 |         """
 53 |         return self.status == SelectionStatus.SELECTED
 54 | 
 55 |     @property
 56 |     def words(self) -> List[Word]:
 57 |         """
 58 |         :return: Empty Word list as SelectionElement do not have words
 59 |         :rtype: EntityList[Word]
 60 |         """
 61 |         return []
 62 | 
 63 |     def get_text_and_words(
 64 |         self, config: TextLinearizationConfig = TextLinearizationConfig()
 65 |     ):
 66 |         w = Word(
 67 |             entity_id=str(uuid.uuid4()),
 68 |             bbox=self.bbox,
 69 |             text=config.selection_element_selected
 70 |             if self.status == SelectionStatus.SELECTED
 71 |             else config.selection_element_not_selected,
 72 |         )
 73 |         w.is_clickable = True
 74 |         w.line = Line(entity_id=str(uuid.uuid4()), bbox=self.bbox, words=[w])
 75 | 
 76 |         words = [w]
 77 | 
 78 |         text = w.text
 79 |             
 80 |         for w in words:
 81 |             w.value_id = str(self.id)
 82 |             w.value_bbox = self.bbox
 83 |         return text, words
 84 | 
 85 |     @property
 86 |     def page(self):
 87 |         """
 88 |         :return: Returns the page number of the page the SelectionElement entity is present in.
 89 |         :rtype: int
 90 |         """
 91 |         return self._page
 92 | 
 93 |     @page.setter
 94 |     def page(self, page_num: int):
 95 |         """
 96 |         Sets the page number attribute of the SelectionElement entity.
 97 | 
 98 |         :param page_num: Page number where the SelectionElement entity exists.
 99 |         :type page_num: int
100 |         """
101 |         self._page = page_num
102 | 
103 |     @property
104 |     def page_id(self) -> str:
105 |         """
106 |         :return: Returns the Page ID attribute of the page which the entity belongs to.
107 |         :rtype: str
108 |         """
109 |         return self._page_id
110 | 
111 |     @page_id.setter
112 |     def page_id(self, page_id: str):
113 |         """
114 |         Sets the Page ID of the SelectionElement entity.
115 | 
116 |         :param page_id: Page ID of the page the entity belongs to.
117 |         :type page_id: str
118 |         """
119 |         self._page_id = page_id
120 | 
121 |     def __repr__(self) -> str:
122 |         """
123 |         Returns string representation of SelectionElement.
124 |         """
125 |         if self.status == SelectionStatus.SELECTED:
126 |             return "[X]"
127 |         else:
128 |             return "[ ]"
129 | 


--------------------------------------------------------------------------------
/textractor/entities/signature.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Represents a single :class:`Signature` Entity within the :class:`Document`. 
 3 | The Textract API response returns signatures as SIGNATURE BlockTypes.
 4 | 
 5 | This class contains the associated metadata with the :class:`Signature` entity including the entity ID, 
 6 | bounding box information, page number, Page ID and confidence of detection.
 7 | """
 8 | 
 9 | import uuid
10 | from textractor.data.text_linearization_config import TextLinearizationConfig
11 | 
12 | from textractor.entities.bbox import BoundingBox
13 | from textractor.entities.document_entity import DocumentEntity
14 | from textractor.entities.line import Line
15 | from textractor.entities.word import Word
16 | 
17 | 
class Signature(DocumentEntity):
    """
    A signature detected in the document. Construction requires:

    :param entity_id: Unique identifier of the signature entity.
    :type entity_id: str
    :param bbox: Bounding box of the signature entity.
    :type bbox: BoundingBox
    :param confidence: confidence with which the entity was detected; divided by 100 before being stored.
    :type confidence: float, optional
    """

    def __init__(
        self,
        entity_id: str,
        bbox: BoundingBox,
        confidence: float = 0,
    ):
        super().__init__(entity_id, bbox)
        # Page information starts out unset and is assigned later through the setters.
        self._page_id = None
        self._page = None
        self._confidence = confidence / 100

    @property
    def page(self):
        """
        :return: Page number of the page the :class:`Signature` entity is present in.
        :rtype: int
        """
        return self._page

    @page.setter
    def page(self, page_num: int):
        """
        Stores the page number of the :class:`Signature` entity.

        :param page_num: Page number where the :class:`Signature` entity exists.
        :type page_num: int
        """
        self._page = page_num

    @property
    def words(self):
        """
        :return: Always an empty list
        :rtype: list
        """
        return []

    @property
    def page_id(self) -> str:
        """
        :return: Page ID of the page which the entity belongs to.
        :rtype: str
        """
        return self._page_id

    @page_id.setter
    def page_id(self, page_id: str):
        """
        Stores the Page ID of the :class:`Signature` entity.

        :param page_id: Page ID of the page the entity belongs to.
        :type page_id: str
        """
        self._page_id = page_id

    def get_text_and_words(
        self, config: TextLinearizationConfig = TextLinearizationConfig()
    ):
        """
        Linearization hook: builds one synthetic Word carrying the configured signature
        token, wraps it in a single-word Line and returns the token with that word.

        :param config: Linearization configuration providing ``signature_token``.
        :type config: TextLinearizationConfig
        :return: Tuple of the signature token and a one-element list holding the synthetic word
        """
        placeholder = Word(
            entity_id=str(uuid.uuid4()),
            bbox=self.bbox,
            text=config.signature_token,
        )
        placeholder.line = Line(
            entity_id=str(uuid.uuid4()),
            bbox=self.bbox,
            words=[placeholder],
        )
        return config.signature_token, [placeholder]
95 | 


--------------------------------------------------------------------------------
/textractor/entities/table_footer.py:
--------------------------------------------------------------------------------
"""
Represents a single :class:`TableFooter` object. The :class:`TableFooter` object contains information such as:

* The position of the footer within the Document
* The words that it contains
* Confidence of entity detection
"""
  8 | 
  9 | from typing import List
 10 | from textractor.data.text_linearization_config import TextLinearizationConfig
 11 | from textractor.entities.bbox import BoundingBox
 12 | from textractor.entities.document_entity import DocumentEntity
 13 | from textractor.entities.word import Word
 14 | from textractor.visualizers.entitylist import EntityList
 15 | 
 16 | 
 17 | class TableFooter(DocumentEntity):
 18 |     """
 19 |     Represents a footer that is either in-table or floating
 20 |     """
 21 | 
 22 |     def __init__(
 23 |         self,
 24 |         entity_id: str,
 25 |         bbox: BoundingBox,
 26 |     ):
 27 |         super().__init__(entity_id, bbox)
 28 |         self._words: List[Word] = []
 29 |         self._is_floating: bool = False
 30 |         self._page = None
 31 |         self._page_id = None
 32 | 
 33 |     @property
 34 |     def words(self):
 35 |         """
 36 |         Returns all the Word objects present in the :class:`TableFooter`.
 37 | 
 38 |         :return words: List of Word objects, each representing a word within the TableFooter.
 39 |         :rtype: list
 40 |         """
 41 |         return EntityList(self._words)
 42 | 
 43 |     @words.setter
 44 |     def words(self, words: List[Word]):
 45 |         """
 46 |         Add Word objects to the :class:`TableFooter`.
 47 | 
 48 |         :param words: List of Word objects, each representing a word within the TableFooter. No specific ordering is assumed as it is ordered internally.
 49 |         :type words: list
 50 |         """
 51 |         self._words = words
 52 | 
 53 |     @property
 54 |     def text(self) -> str:
 55 |         """Returns the text in the footer as one space-separated string
 56 | 
 57 |         :return: Text in the footer
 58 |         :rtype: str
 59 |         """
 60 |         return " ".join([w.text for w in self.words])
 61 | 
 62 |     @property
 63 |     def page(self):
 64 |         """
 65 |         :return: Returns the page number of the page the TableFooter entity is present in.
 66 |         :rtype: int
 67 |         """
 68 | 
 69 |         return self._page
 70 | 
 71 |     @page.setter
 72 |     def page(self, page_num: int):
 73 |         """
 74 |         Sets the page number attribute of the TableFooter entity.
 75 | 
 76 |         :param page_num: Page number where the TableFooter entity exists.
 77 |         :type page_num: int
 78 |         """
 79 | 
 80 |         self._page = page_num
 81 | 
 82 |     @property
 83 |     def page_id(self) -> str:
 84 |         """
 85 |         :return: Returns the Page ID attribute of the page which the entity belongs to.
 86 |         :rtype: str
 87 |         """
 88 | 
 89 |         return self._page_id
 90 | 
 91 |     @page_id.setter
 92 |     def page_id(self, page_id: str):
 93 |         """
 94 |         Sets the Page ID of the TableFooter entity.
 95 | 
 96 |         :param page_id: Page ID of the page the entity belongs to.
 97 |         :type page_id: str
 98 |         """
 99 | 
100 |         self._page_id = page_id
101 | 
102 |     def get_text_and_words(
103 |         self, config: TextLinearizationConfig = TextLinearizationConfig()
104 |     ):
105 |         return " ".join(self.words), self.words
106 | 


--------------------------------------------------------------------------------
/textractor/entities/table_title.py:
--------------------------------------------------------------------------------
"""
Represents a single :class:`TableTitle` object. The :class:`TableTitle` object contains information such as:

* The position of the title within the Document
* The words that it contains
* Confidence of entity detection
"""
  8 | 
  9 | from typing import List
 10 | from textractor.data.text_linearization_config import TextLinearizationConfig
 11 | from textractor.entities.bbox import BoundingBox
 12 | from textractor.entities.document_entity import DocumentEntity
 13 | from textractor.entities.word import Word
 14 | from textractor.utils.text_utils import linearize_children
 15 | from textractor.visualizers.entitylist import EntityList
 16 | 
 17 | 
 18 | class TableTitle(DocumentEntity):
 19 |     """
 20 |     Represents a title that is either in-table or floating
 21 |     """
 22 | 
 23 |     def __init__(
 24 |         self,
 25 |         entity_id: str,
 26 |         bbox: BoundingBox,
 27 |     ):
 28 |         super().__init__(entity_id, bbox)
 29 |         self._words: List[Word] = []
 30 |         self._is_floating: bool = False
 31 |         self._page = None
 32 |         self._page_id = None
 33 | 
 34 |     @property
 35 |     def words(self):
 36 |         """
 37 |         Returns all the Word objects present in the :class:`TableTitle`.
 38 | 
 39 |         :return words: List of Word objects, each representing a word within the TableTitle.
 40 |         :rtype: list
 41 |         """
 42 |         return EntityList(self._words)
 43 | 
 44 |     @words.setter
 45 |     def words(self, words: List[Word]):
 46 |         """
 47 |         Add Word objects to the :class:`TableTitle`.
 48 | 
 49 |         :param words: List of Word objects, each representing a word within the TableTitle. No specific ordering is assumed as it is ordered internally.
 50 |         :type words: list
 51 |         """
 52 |         self._words = words
 53 | 
 54 |     @property
 55 |     def text(self) -> str:
 56 |         """Returns the text in the title as one space-separated string
 57 | 
 58 |         :return: Text in the title
 59 |         :rtype: str
 60 |         """
 61 |         return " ".join([w.text for w in self.words])
 62 | 
 63 |     @property
 64 |     def page(self):
 65 |         """
 66 |         :return: Returns the page number of the page the TableTitle entity is present in.
 67 |         :rtype: int
 68 |         """
 69 | 
 70 |         return self._page
 71 | 
 72 |     @page.setter
 73 |     def page(self, page_num: int):
 74 |         """
 75 |         Sets the page number attribute of the TableTitle entity.
 76 | 
 77 |         :param page_num: Page number where the TableTitle entity exists.
 78 |         :type page_num: int
 79 |         """
 80 | 
 81 |         self._page = page_num
 82 | 
 83 |     @property
 84 |     def page_id(self) -> str:
 85 |         """
 86 |         :return: Returns the Page ID attribute of the page which the entity belongs to.
 87 |         :rtype: str
 88 |         """
 89 | 
 90 |         return self._page_id
 91 | 
 92 |     @page_id.setter
 93 |     def page_id(self, page_id: str):
 94 |         """
 95 |         Sets the Page ID of the TableTitle entity.
 96 | 
 97 |         :param page_id: Page ID of the page the entity belongs to.
 98 |         :type page_id: str
 99 |         """
100 | 
101 |         self._page_id = page_id
102 | 
103 |     @property
104 |     def is_floating(self) -> bool:
105 |         """
106 |         :return: Returns whether the TableTitle entity is floating or not.
107 |         :rtype: bool
108 |         """
109 | 
110 |         return self._is_floating
111 | 
112 |     @is_floating.setter
113 |     def is_floating(self, is_floating: bool):
114 |         """
115 |         Sets the is_floating attribute of the TableTitle entity.
116 | 
117 |         :param is_floating: Whether the title is floating (not in-table) or not (in-table).
118 |         :type is_floating: bool
119 |         """
120 | 
121 |         self._is_floating = is_floating
122 | 
123 |     def get_text_and_words(
124 |         self, config: TextLinearizationConfig = TextLinearizationConfig()
125 |     ):
126 |         text, words = linearize_children(self.words, config=config)
127 |         return text, words
128 | 


--------------------------------------------------------------------------------
/textractor/exceptions.py:
--------------------------------------------------------------------------------
 1 | """
 2 |     Define exceptions specific to textractor.
 3 | """
 4 | 
 5 | 
 6 | class RegionMismatchError(Exception):
 7 |     """Raised when region on the profile_name does not match the region of the S3 bucket being accessed."""
 8 | 
 9 |     pass
10 | 
11 | 
class NoImageException(Exception):
    """Signals that visualize() was called although no image was saved during the Textract API call."""
16 | 
17 | 
class InputError(Exception):
    """Signals that the inputs given to a function are incorrect."""
20 | 
21 | 
class EntityListCreationError(Exception):
    """Signals that an EntityList was created without passing any object or list of objects."""
26 | 
27 | 
class InvalidProfileNameError(Exception):
    """Signals that the profile_name passed to Textractor is invalid."""
32 | 
33 | 
class S3FilePathMissing(Exception):
    """Signals that the s3 file path is missing."""
38 | 
39 | 
class MissingDependencyException(Exception):
    """Signals that a dependency needed by a specific code path is missing."""
44 | 
45 | 
class IncorrectMethodException(Exception):
    """Signals that the wrong endpoint was called."""
50 | 
51 | 
class UnhandledCaseException(Exception):
    """Signals that no statement matched the condition."""
56 | 
class UnsupportedDocumentException(Exception):
    """Signals that the Textract API could not process the document."""
61 | 
class InvalidS3ObjectException(Exception):
    """Signals that the Textract API could not access an S3 object."""
66 | 


--------------------------------------------------------------------------------
/textractor/parsers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/textractor/parsers/__init__.py


--------------------------------------------------------------------------------
/textractor/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/textractor/utils/__init__.py


--------------------------------------------------------------------------------
/textractor/utils/geometry_util.py:
--------------------------------------------------------------------------------
 1 | import statistics
 2 | from typing import List
 3 | from copy import deepcopy
 4 | from collections.abc import Iterable
 5 | 
 6 | 
 7 | def flatten(list_of_lists):
 8 |     """
 9 |     Utility function to flatten a list of lists recursively.
10 | 
11 |     :param list_of_lists: List containing any depth of lists recursively to be flattened into a single list.
12 |     :type list_of_lists: list
13 |     :return: Flattened list of input list
14 |     :rtype: list
15 |     """
16 |     for x in list_of_lists:
17 |         if isinstance(x, Iterable):
18 |             yield from flatten(x)
19 |         else:
20 |             yield x
21 | 
22 | 
def get_indices(numpy_indexing: str = ":", max_val: int = 10) -> List[int]:
    """
    Function to convert numpy indexing format to list of indices to access cells within the Table.

    Accepted forms are a single non-negative integer (``"3"``) or ``start:stop[:step]`` where each
    part may be empty or ``None``. Negative ``start``/``stop`` values are offset by ``max_val``.

    :param numpy_indexing: string containing start:stop:step format
    :type numpy_indexing: str
    :param max_val: maximum rows or columns on the table depending on input.
    :type max_val: int
    :return: Returns the indices of table rows and columns following the numpy indexing format, in ascending order.
    :rtype: list
    :raises IndexError: if a single integer index is greater than ``max_val``.
    """
    assert isinstance(numpy_indexing, str)
    assert ":" in numpy_indexing or numpy_indexing.isdigit()

    # "None" only gets this far when assertions are disabled; it means "everything", like ":".
    if numpy_indexing in (":", "None"):
        numpy_indexing = "None:None"

    if ":" not in numpy_indexing:
        index = int(numpy_indexing)
        # NOTE(review): an index equal to max_val is accepted here - confirm against the callers' index base.
        if index > max_val:
            raise IndexError(f"index {index} is out of range for size {max_val}")
        return [index]

    parts = numpy_indexing.split(":")

    def _bound(token: str, default: int) -> int:
        # Empty / "None" falls back to the default; negative values count from max_val.
        value = default if token in ("", "None") else int(token)
        return max_val + value if value < 0 else value

    index_range = range(_bound(parts[0], 0), _bound(parts[1], max_val))

    if len(parts) == 3:
        step = int(parts[2]) if parts[2] not in ("", "None") else 1
        # Keep every element whose position in the range is a multiple of step.
        indices = [value for pos, value in enumerate(index_range) if pos % step == 0]
    else:
        indices = list(index_range)

    # The values are already unique; sorting gives a deterministic ascending order, which
    # converting through a set does not.
    return sorted(indices)
84 | 
85 | 
def sort_by_position(entities: List) -> List:
    """
    Return a new list of the entities ordered by the bottom edge of their bounding box
    (``bbox.y + bbox.height``), ties being broken by ``bbox.x``.

    :param entities: Objects exposing a ``bbox`` with ``x``, ``y`` and ``height`` attributes.
    :type entities: list
    :return: New sorted list, the input is left untouched
    :rtype: list
    """

    def _bottom_then_left(entity):
        box = entity.bbox
        return (box.y + box.height, box.x)

    return sorted(entities, key=_bottom_then_left)
88 | 


--------------------------------------------------------------------------------
/textractor/utils/html_utils.py:
--------------------------------------------------------------------------------
 1 | import html
 2 | from textractor.data.html_linearization_config import HTMLLinearizationConfig
 3 | 
 4 | def add_id_to_html_tag(prefix, id, config):
 5 |     if not isinstance(config, HTMLLinearizationConfig) or not prefix:
 6 |         return prefix
 7 |     if config.add_ids_to_html_tags:
 8 |         return prefix[:-1] + f' id="{id}"' + prefix[-1]
 9 |     elif config.add_short_ids_to_html_tags:
10 |         return prefix[:-1] + f' id="{id[:8]}"' + prefix[-1]
11 |     else:
12 |         return prefix
13 | 
def escape_text(text, config):
    """HTML-escape ``text`` only when an HTML linearization config is in use.

    :param text: Text to emit.
    :param config: Linearization configuration.
    :return: ``html.escape(text)`` if ``config`` is an
        ``HTMLLinearizationConfig``, otherwise ``text`` unchanged.
    """
    if isinstance(config, HTMLLinearizationConfig):
        return html.escape(text)
    return text
19 | 


--------------------------------------------------------------------------------
/textractor/utils/legacy_utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from textractor.data.constants import (
 3 |     LAYOUT_FIGURE,
 4 |     LAYOUT_LIST,
 5 |     LAYOUT_TABLE,
 6 |     LAYOUT_KEY_VALUE,
 7 |     LAYOUT_TEXT,
 8 |     LAYOUT_TITLE,
 9 |     LAYOUT_HEADER,
10 |     LAYOUT_FOOTER,
11 |     LAYOUT_SECTION_HEADER,
12 |     LAYOUT_PAGE_NUMBER,
13 | )
14 | 
15 | logger = logging.getLogger(__name__)
16 | 
def converter(response):
    """Rewrite layout blocks of a response in place and return it.

    Per the warning logged on failure, this is a backward-compatibility
    conversion. For every entry of ``response["Blocks"]``:

    * ``PAGE`` blocks are collected for the relationship clean-up below;
    * types starting with ``LAYOUT_FIGURE_`` are relabelled ``LAYOUT_TEXT``;
    * any other ``LAYOUT_*`` type outside the known set is relabelled
      ``LAYOUT_FIGURE``;
    * ``LAYOUT_FIGURE`` blocks whose ``EntityTypes`` contain ``"CONTAINER"``
      are removed, and their ids are dropped from the first ``CHILD``
      relationship of each page.

    Any exception is logged as a warning and the (possibly partially
    modified) response is still returned.

    :param response: Response dictionary holding a ``"Blocks"`` list.
    :return: The same ``response`` object.
    """
    doomed = []  # (index, block) pairs scheduled for removal
    pages = []
    try:
        known_layout_types = [
            LAYOUT_TEXT,
            LAYOUT_TITLE,
            LAYOUT_HEADER,
            LAYOUT_FOOTER,
            LAYOUT_SECTION_HEADER,
            LAYOUT_PAGE_NUMBER,
            LAYOUT_LIST,
            LAYOUT_FIGURE,
            LAYOUT_TABLE,
            LAYOUT_KEY_VALUE,
        ]
        for position, block in enumerate(response["Blocks"]):
            block_type = block.get("BlockType", "")
            if block_type == "PAGE":
                pages.append(block)
                continue
            if block_type.startswith("LAYOUT_FIGURE_"):
                block["BlockType"] = LAYOUT_TEXT
                continue
            if block_type.startswith("LAYOUT_") and block_type not in known_layout_types:
                block["BlockType"] = LAYOUT_FIGURE
                continue
            if block_type == LAYOUT_FIGURE and "CONTAINER" in block.get("EntityTypes", []):
                doomed.append((position, block))

        doomed_ids = {entry["Id"] for _, entry in doomed}
        for page in pages:
            for relationship in page.get("Relationships", []):
                if relationship["Type"] != "CHILD":
                    continue
                relationship["Ids"] = [
                    child_id
                    for child_id in relationship["Ids"]
                    if child_id not in doomed_ids
                ]
                # Only the first CHILD relationship of a page is filtered.
                break

        # Delete from the highest index down so earlier indices stay valid.
        for position, _ in reversed(doomed):
            del response["Blocks"][position]
    except Exception as ex:
        logger.warning(f"Failed to convert the response for backward compatibility. {str(ex)}")

    return response
62 | 


--------------------------------------------------------------------------------
/textractor/utils/pdf_utils.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from typing import List, Union
 3 | from PIL import Image
 4 | 
 5 | try:
 6 |     import pypdfium2
 7 |     PYPDFIUM2_IS_INSTALLED = True
 8 | except ImportError:
 9 |     PYPDFIUM2_IS_INSTALLED = False
10 | 
11 | try:
12 |     from pdf2image import convert_from_bytes, convert_from_path, pdfinfo_from_bytes, pdfinfo_from_path
13 |     PDF2IMAGE_IS_INSTALLED = True
14 | except ImportError:
15 |     PDF2IMAGE_IS_INSTALLED = False
16 |     
17 | 
def rasterize_pdf(pdf: Union[str, bytes]) -> List[Image.Image]:
    """
    Convert a pdf into a list of images, one per page.

    pypdfium2 is used when it is installed (rendering at a scale of 250/72);
    otherwise pdf2image is used at 250 dpi with JPEG as the intermediate format.

    :param pdf: Path to a PDF file, or the PDF content as bytes.
    :return: The rendered pages as PIL images.
    :raises Exception: If neither backend is installed, or (pdf2image path
        only) if ``pdf`` is neither ``str`` nor ``bytes``.
    """
    if not PYPDFIUM2_IS_INSTALLED and not PDF2IMAGE_IS_INSTALLED:
        raise Exception("PDF rasterization is not possible if neither pypdfium2 nor pdf2image are installed")

    if PYPDFIUM2_IS_INSTALLED:
        document = pypdfium2.PdfDocument(pdf)
        return [page.render(scale=250 / 72).to_pil() for page in document]

    # pdf2image fallback: pick the entry point matching the input type.
    if isinstance(pdf, str):
        return convert_from_path(pdf, dpi=250, fmt="jpeg")
    if isinstance(pdf, bytes):
        return convert_from_bytes(pdf, dpi=250, fmt="jpeg")
    raise Exception(f"{type(pdf)} is not a supported type, should be str or bytes")


--------------------------------------------------------------------------------
/textractor/utils/results_utils.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import boto3
 3 | import os
 4 | import json
 5 | import datetime
 6 | from textractcaller.t_call import get_s3_output_config_keys, OutputConfig, remove_none
 7 | 
 8 | 
 9 | def results_exist(job_id: str, s3_bucket: str, s3_prefix: str, s3_client=None) -> bool:
10 |     if not s3_client:
11 |         s3_client = boto3.client("s3")
12 |     response = s3_client.list_objects(
13 |         Bucket=s3_bucket,
14 |         Prefix=os.path.join(s3_prefix, job_id + "/"),
15 |         Delimiter="/",
16 |         MaxKeys=2,
17 |     )
18 |     # The directory will have at least one file because of the S3 access check
19 |     return "Contents" in response and len(response["Contents"]) > 1
20 | 
def get_full_json_from_output_config(
    output_config: OutputConfig, job_id: str, s3_client=None
) -> dict:
    """Download and merge all JSON result files of a job from S3.

    The output location is listed repeatedly. Each key is fetched once; the
    first parsed file becomes the result and the ``"Blocks"`` of later files
    are appended to it. Polling stops once the most recently modified object
    fetched so far is at least 5 seconds old.

    NOTE(review): if no key ever shows up under the output location the loop
    never terminates — confirm callers only invoke this for jobs that
    produce output.

    :param output_config: Output location; must provide ``s3_bucket`` and ``s3_prefix``.
    :param job_id: Identifier of the job whose results are read.
    :param s3_client: Optional S3 client; a default ``boto3`` client is
        created when omitted.
    :return: The merged response after ``remove_none`` has been applied.
    :raises ValueError: If ``output_config`` or ``job_id`` is missing, or the
        config lacks a bucket or prefix.
    """
    if not output_config or not job_id:
        raise ValueError("no output_config or job_id")
    if not output_config.s3_bucket or not output_config.s3_prefix:
        # Previously reused the message above, which misreported the cause.
        raise ValueError("output_config is missing s3_bucket or s3_prefix")
    if not s3_client:
        s3_client = boto3.client("s3")

    result_value = dict()
    last_result = None
    parsed_keys = set()
    while True:
        keys = get_s3_output_config_keys(
            output_config=output_config, job_id=job_id, s3_client=s3_client
        )
        for key in keys:
            if key in parsed_keys:
                continue
            parsed_keys.add(key)
            s3_object = s3_client.get_object(Bucket=output_config.s3_bucket, Key=key)
            if last_result is None:
                last_result = s3_object["LastModified"]
            else:
                last_result = max(last_result, s3_object["LastModified"])
            response = dict(json.loads(s3_object["Body"].read().decode("utf-8")))
            if "Blocks" in result_value:
                result_value["Blocks"].extend(response["Blocks"])
            else:
                result_value = response

        if last_result is not None:
            age = (datetime.datetime.now().astimezone() - last_result).total_seconds()
            if age >= 5:
                break
        # Pause between listings instead of hammering S3 in a tight loop.
        time.sleep(0.5)

    result_value = remove_none(result_value)
    return result_value
57 | 


--------------------------------------------------------------------------------
/textractor/utils/s3_utils.py:
--------------------------------------------------------------------------------
 1 | from typing import Union, Tuple
 2 | from io import BytesIO
 3 | from PIL import Image
 4 | 
 5 | from textractor.exceptions import InputError
 6 | 
 7 | 
 8 | def s3_path_to_bucket_and_prefix(s3_path: str) -> Tuple[str, str]:
 9 |     """Converts an S3 URI to a bucket and prefix
10 | 
11 |     :param s3_path: S3 URI
12 |     :type s3_path: str
13 |     :raises InputError: Raised if the given path cannot be parsed
14 |     :return: Tuple of bucket and prefix as string
15 |     :rtype: Tuple[str, str]
16 |     """
17 |     try:
18 |         bucket, prefix = s3_path.replace("s3://", "").split("/", 1)
19 |     except IndexError:
20 |         raise InputError(f"Could not parse {s3_path} as ")
21 |     return bucket, prefix
22 | 
23 | 
def download_from_s3(client, s3_path: str, **extra_args):
    """Downloads a file from S3 and returns it as a BytesIO object

    :param client: S3 client
    :type client: Client
    :param s3_path: S3 path to download
    :type s3_path: str
    :param extra_args: Optional keyword arguments forwarded to the client as
        ``ExtraArgs`` (previously accepted but silently ignored)
    :return: S3 file as a BytesIO object
    :rtype: BytesIO
    """

    bucket, prefix = s3_path_to_bucket_and_prefix(s3_path)

    f = BytesIO()
    # ``or None`` keeps the call identical to the old behaviour when no extra
    # arguments were supplied.
    client.download_fileobj(bucket, prefix, f, ExtraArgs=extra_args or None)
    # Rewind so callers can read the content from the start.
    f.seek(0)
    return f
41 | 
42 | 
def upload_to_s3(
    client, s3_path: str, file_source: Union[str, bytes, Image.Image], **extra_args
):
    """Upload a file to S3

    A ``str`` is treated as a local file path. ``bytes`` are uploaded as-is.
    A PIL image is encoded as PNG in memory before being uploaded.

    :param client: boto3 client
    :type client: Client
    :param s3_path: S3 path to upload to
    :type s3_path: str
    :param file_source: File to upload
    :type file_source: Union[str, bytes, Image.Image]
    :param extra_args: Keyword arguments passed to the client as its extra-args dictionary
    :raises InputError: Raised if the file_source is not of type str, bytes or Image
    """
    bucket, key = s3_path_to_bucket_and_prefix(s3_path)

    if isinstance(file_source, str):
        client.upload_file(file_source, bucket, key, extra_args)
        return

    if isinstance(file_source, bytes):
        buffer = BytesIO(file_source)
    elif isinstance(file_source, Image.Image):
        buffer = BytesIO()
        file_source.save(buffer, format="PNG")
        buffer.seek(0)
    else:
        raise InputError(
            f"{file_source} must be of type str or bytes, not {type(file_source)}"
        )

    client.upload_fileobj(buffer, bucket, key, extra_args)
71 | 
72 | 
def delete_from_s3(client, s3_path: str):
    """Delete a file from S3

    :param client: boto3 client
    :type client: Client
    :param s3_path: S3 path to the object to delete
    :type s3_path: str
    """
    bucket, prefix = s3_path_to_bucket_and_prefix(s3_path)

    # boto3 client operations only accept keyword arguments; the previous
    # positional call raised TypeError before any request was sent.
    client.delete_object(Bucket=bucket, Key=prefix)
84 | 


--------------------------------------------------------------------------------
/textractor/utils/search_utils.py:
--------------------------------------------------------------------------------
  1 | """Utility functions for Document search"""
  2 | 
  3 | try:
  4 |     import numpy as np
  5 | except ImportError:
  6 |     # No need to log it here as numpy is only used if SentenceTransformers is used
  7 |     # The latter has numpy as dependency.
  8 |     pass
  9 | 
 10 | import math
 11 | import editdistance
 12 | from textractor.data.constants import SimilarityMetric
 13 | from textractor.exceptions import MissingDependencyException
 14 | 
 15 | 
 16 | from textractor.data.constants import (
 17 |     IS_COLUMN_HEAD,
 18 |     IS_FOOTER_CELL,
 19 |     IS_TITLE_CELL,
 20 |     IS_SUMMARY_CELL,
 21 |     IS_SECTION_TITLE_CELL,
 22 |     CellTypes,
 23 | )
 24 | 
 25 | 
 26 | class SearchUtils:
 27 |     model = None
 28 |     util = None
 29 |     model_string = "all-MiniLM-L6-v2"
 30 | 
 31 |     @classmethod
 32 |     def get_word_similarity(
 33 |         cls, word_1: str, word_2: str, similarity_metric: SimilarityMetric
 34 |     ) -> float:
 35 |         """
 36 |         Returns the extent of similarity between the input words using the similarity_metric input by the user.
 37 | 
 38 |         :param word_1: First word to check for similarity
 39 |         :type word_1: str
 40 |         :param word_2: Second word to check for similarity
 41 |         :type word_2: str
 42 |         :param similarity_metric: The function supports one of 3 metrics \
 43 |                                 * Levenshtein distance/ edit distance \
 44 |                                 * Euclidean distance \
 45 |                                 * Cosine distance
 46 |         :type similarity_metric: str
 47 | 
 48 |         :return: Returns the similarity measure calculated based on the metric for the 2 input words.
 49 |         :rtype: float
 50 |         """
 51 |         if cls.model is None and similarity_metric != SimilarityMetric.LEVENSHTEIN:
 52 |             try:
 53 |                 from sentence_transformers import SentenceTransformer, util
 54 |             except ImportError:
 55 |                 raise MissingDependencyException(
 56 |                     "sentence_transformers is not installed. Use SimilarityMetric.LEVENSHTEIN."
 57 |                 )
 58 |             cls.model = SentenceTransformer(cls.model_string)
 59 |             cls.util = util
 60 | 
 61 |         if similarity_metric == SimilarityMetric.LEVENSHTEIN:
 62 |             return normalized_edit_distance(word_1.lower(), word_2.lower())
 63 |         elif similarity_metric == SimilarityMetric.EUCLIDEAN:
 64 |             ref_word_emb = cls.model.encode([word_1])
 65 |             word_emb = cls.model.encode([word_2])
 66 |             dist = np.linalg.norm(ref_word_emb - word_emb)
 67 |             return dist
 68 |         else:
 69 |             ref_word_emb = cls.model.encode([word_1])
 70 |             word_emb = cls.model.encode([word_2])
 71 |             similarity = cls.util.cos_sim(ref_word_emb, word_emb)
 72 |             return similarity.item()
 73 | 
 74 | 
 75 | def jaccard_similarity(list_1: list, list_2: list) -> float:
 76 |     """
 77 |     Calculates Jaccard similarity between the 2 input lists.
 78 | 
 79 |     :param list_1: First list to check for similarity
 80 |     :type list_1: list
 81 |     :param list_2: Second list to check for similarity
 82 |     :type list_2: list
 83 | 
 84 |     :return: Returns the similarity measure calculated for the 2 input lists.
 85 |     :rtype: float
 86 |     """
 87 | 
 88 |     set_1 = set(list_1)
 89 |     set_2 = set(list_2)
 90 |     return float(len(set_1.intersection(set_2)) / len(set_1.union(set_2)))
 91 | 
 92 | 
 93 | def get_metadata_attr_name(cell_atr):
 94 |     """
 95 |     Returns metadata attribute mapping to the input CellType.
 96 | 
 97 |     :param cell_atr: Input cell type
 98 |     :type: enum
 99 |     :return: Returns metadata attribute mapping to the input CellType.
100 |     :rtype: str
101 |     """
102 |     cell_map = {
103 |         CellTypes.COLUMN_HEADER: IS_COLUMN_HEAD,
104 |         CellTypes.SECTION_TITLE: IS_SECTION_TITLE_CELL,
105 |         CellTypes.SUMMARY_CELL: IS_SUMMARY_CELL,
106 |         CellTypes.FLOATING_TITLE: IS_TITLE_CELL,
107 |         CellTypes.FLOATING_FOOTER: IS_FOOTER_CELL,
108 |     }
109 |     try:
110 |         return cell_map[cell_atr]
111 |     except:
112 |         return ""
113 | 
114 | 
def normalized_edit_distance(s1: str, s2: str):
    """
    Returns the normalized edit distance

    The value is ``(longest - distance) / longest`` where ``longest`` is the
    length of the longer string, so identical strings score 1.0. When the
    numerator is zero (including two empty strings) 0.0 is returned.

    :param s1: First string
    :type s1: str
    :param s2: Second string
    :type s2: str
    """

    distance = editdistance.eval(s1, s2)
    longest = max(len(s1), len(s2))
    matching = longest - distance
    # Also guards the division below when both strings are empty.
    if matching == 0:
        return 0.0
    return matching / longest
130 | 


--------------------------------------------------------------------------------
/textractor/visualizers/__init__.py:
--------------------------------------------------------------------------------
1 | from .entitylist import EntityList
2 | 


--------------------------------------------------------------------------------
/textractor/visualizers/arial.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/textractor/visualizers/arial.ttf


--------------------------------------------------------------------------------
/tpipelinegeofinder/Manifest.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | 
3 | recursive-exclude * __pycache__
4 | recursive-exclude * *.py[co] test_*.py
5 | 
6 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif


--------------------------------------------------------------------------------
/tpipelinegeofinder/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 0.0.8
 3 | commit = False
 4 | tag = False
 5 | 
 6 | [bumpversion:file:setup.py]
 7 | search = version='{current_version}'
 8 | replace = version='{new_version}'
 9 | 
10 | [bumpversion:file:textractgeofinder/_version.py]
11 | search = __version__ = '{current_version}'
12 | replace = __version__ = '{new_version}'
13 | 
14 | [bdist_wheel]
15 | universal = 1
16 | 
17 | [flake8]
18 | exclude = docs
19 | 
20 | [aliases]
21 | 


--------------------------------------------------------------------------------
/tpipelinegeofinder/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from setuptools import setup, find_packages
 4 | 
 5 | 
 6 | def read(fname):
 7 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 8 | 
 9 | 
# Runtime dependencies handed to ``install_requires`` below.
requirements = ['amazon-textract-response-parser>=0.1.17']

# ``python setup.py publish-test``: rebuild the distributions, check them with
# twine and upload them to the ``pypitest`` repository, then exit.
if sys.argv[-1] == 'publish-test':
    # NOTE(review): os.system runs each command in its own subshell, so this
    # ``cd`` does not affect the commands below — confirm the script is always
    # started from this directory.
    os.system(f"cd {os.path.dirname(__file__)}")
    os.system('rm -rf dist/ build/ amazon_textract_geofinder.egg-info')
    os.system('python setup.py sdist bdist_wheel')
    os.system('twine check dist/*')
    os.system('twine upload --repository pypitest dist/*')
    sys.exit()

# ``python setup.py publish``: same steps, uploading to the ``pypi`` repository.
if sys.argv[-1] == 'publish':
    # NOTE(review): same ineffective ``cd`` as in the publish-test branch.
    os.system(f"cd {os.path.dirname(__file__)}")
    os.system('rm -rf dist/ build/ amazon_textract_geofinder.egg-info/')
    os.system('python setup.py sdist bdist_wheel')
    os.system('twine check dist/*')
    os.system('twine upload --repository pypi dist/*')
    sys.exit()

# NOTE(review): the version below is 0.0.9 while setup.cfg (current_version)
# and textractgeofinder/_version.py in this tree say 0.0.8 — confirm which
# one is intended.
setup(name='amazon-textract-geofinder',
      packages=find_packages(exclude=['tests']),
      include_package_data=True,
      exclude_package_data={"": ["test_*.py", "__pycache__"]},
      version='0.0.9',
      description='Amazon Textract package to easier access data through geometric information',
      install_requires=requirements,
      scripts=['bin/amazon-textract-geofinder'],
      long_description_content_type='text/markdown',
      long_description=read('README.md'),
      author='Amazon Rekognition Textract Demoes',
      author_email='rekognition-textract-demos@amazon.com',
      url='https://github.com/aws-samples/amazon-textract-textractor/tpipelinegeofinder',
      keywords='amazon-textract-textractor amazon textract finder geometry geo',
      license="Apache License Version 2.0",
      classifiers=[
          "Development Status :: 4 - Beta",
          "Topic :: Utilities",
          'License :: OSI Approved :: Apache Software License',
          'Programming Language :: Python :: 3.6',
          'Programming Language :: Python :: 3.7',
          'Programming Language :: Python :: 3.8',
          'Programming Language :: Python :: 3.9',
          'Programming Language :: Python :: 3.10',
          'Programming Language :: Python :: 3.11',
          'Programming Language :: Python :: 3.12',
      ],
      python_requires='>=3.6')
56 | 


--------------------------------------------------------------------------------
/tpipelinegeofinder/tests/data/multi_page_example_file.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tpipelinegeofinder/tests/data/multi_page_example_file.pdf


--------------------------------------------------------------------------------
/tpipelinegeofinder/tests/data/patient_intake_form_sample.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tpipelinegeofinder/tests/data/patient_intake_form_sample.jpg


--------------------------------------------------------------------------------
/tpipelinegeofinder/tests/test_ocrdb.py:
--------------------------------------------------------------------------------
 1 | from textractgeofinder.ocrdb import OCRDB
 2 | from textractgeofinder.tword import TWord
 3 | import logging
 4 | 
 5 | logger = logging.getLogger(__name__)
 6 | 
 7 | 
 8 | def test_creation(caplog):
 9 |     caplog.set_level(logging.DEBUG)
10 |     ocrdb = OCRDB.getInstance()
11 |     tword: TWord = TWord(text='sometext',
12 |                          original_text='SomeText',
13 |                          text_type='word',
14 |                          confidence=71.7424087524414,
15 |                          id='e5d9a27b-483c-4c8b-9d09-4092d050e2e4',
16 |                          xmin=100,
17 |                          ymin=0,
18 |                          xmax=263,
19 |                          ymax=22,
20 |                          page_number=1,
21 |                          doc_width=1080,
22 |                          doc_height=1920,
23 |                          child_relationships='',
24 |                          reference=None,
25 |                          resolver=None)
26 |     ocrdb.insert(textract_doc_uuid='bla', x=tword)
27 |     logger.debug(f"tword: {tword}")
28 | 


--------------------------------------------------------------------------------
/tpipelinegeofinder/tests/test_tword.py:
--------------------------------------------------------------------------------
 1 | from textractgeofinder.tword import TWord
 2 | import logging
 3 | 
 4 | logger = logging.getLogger(__name__)
 5 | 
 6 | 
 7 | def test_creation(caplog):
 8 |     caplog.set_level(logging.DEBUG)
 9 |     tword: TWord = TWord(text="test",
10 |                          text_type="text_type",
11 |                          confidence=99,
12 |                          id="test-id",
13 |                          page_number=1,
14 |                          ymin=1,
15 |                          ymax=1,
16 |                          xmin=10,
17 |                          xmax=10,
18 |                          original_text="original-text",
19 |                          doc_width=100,
20 |                          doc_height=100,
21 |                          reference="test")
22 |     logger.debug(f"tword: {tword}")
23 | 


--------------------------------------------------------------------------------
/tpipelinegeofinder/textractgeofinder/__init__.py:
--------------------------------------------------------------------------------
1 | from ._version import __version__
2 | 
3 | import logging
4 | from logging import NullHandler
5 | 
6 | logging.getLogger(__name__).addHandler(NullHandler())
7 | 


--------------------------------------------------------------------------------
/tpipelinegeofinder/textractgeofinder/_version.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.0.8'
2 | 


--------------------------------------------------------------------------------
/tpipelinegeofinder/textractgeofinder/tinterface.py:
--------------------------------------------------------------------------------
 1 | from textractquery.tword import TWord
 2 | from typing import Optional
 3 | from enum import Enum
 4 | 
 5 | 
class Direction(Enum):
    """Enumeration of four directions, numbered 1 to 4 in the order UP, RIGHT, DOWN, LEFT."""
    UP = 1
    RIGHT = 2
    DOWN = 3
    LEFT = 4
11 | 
12 | 
class TInterface:
    """Empty placeholder class; it currently defines no attributes or methods."""
    pass
15 | 


--------------------------------------------------------------------------------
/tpipelinepagedimensions/Manifest.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | 
3 | recursive-exclude * __pycache__
4 | recursive-exclude * *.py[co] test_*.py
5 | 
6 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif


--------------------------------------------------------------------------------
/tpipelinepagedimensions/README.md:
--------------------------------------------------------------------------------
 1 | # Textract-Pipeline-PageDimensions
 2 | 
 3 | Provides functions that add the page dimensions (`doc_width` and `doc_height`) to the PAGE blocks of the Textract JSON. The values are stored under the custom attribute (`Custom` in the JSON) in the following form:
 4 | 
 5 | e. g.
 6 | 
 7 | ```
 8 | {'PageDimension': {'doc_width': 1549.0, 'doc_height': 370.0} }
 9 | ```
10 | 
11 | # Install
12 | 
13 | ```bash
14 | > python -m pip install amazon-textract-pipeline-pagedimensions
15 | ```
16 | 
17 | Make sure your environment is setup with AWS credentials through configuration files or environment variables or an attached role. (https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
18 | 
19 | # Samples
20 | 
21 | ## Add Page dimensions for a local file
22 | 
23 | This sample uses amazon-textract-caller together with amazon-textract-pipeline-pagedimensions.
24 | 
25 | ```bash
26 | python -m pip install amazon-textract-caller
27 | ```
28 | 
29 | ```python
30 | from textractpagedimensions.t_pagedimensions import add_page_dimensions
31 | from textractcaller.t_call import call_textract
32 | from trp.trp2 import TDocument, TDocumentSchema
33 | 
34 | j = call_textract(input_document='<path to some image file>')
35 | t_document: TDocument = TDocumentSchema().load(j)
36 | add_page_dimensions(t_document=t_document, input_document='<path to some image file>')
37 | print(t_document.pages[0].custom['PageDimension']) 
38 | # output will be something like this:
39 | # {
40 | #     'doc_width': 1544,
41 | #     'doc_height': 1065
42 | # }
43 | ```
44 | 
45 | ## Using the Amazon Textract Helper command line tool with PageDimensions
46 | 
47 | Together with the Amazon Textract Helper and Amazon Textract Response Parser, we can build a pipeline that includes information about PageDimension and Orientation of pages
48 | as a short demonstration on the information that is added to the Textract JSON.
49 | 
50 | ```bash
51 | > python -m pip install amazon-textract-helper amazon-textract-response-parser amazon-textract-pipeline-pagedimensions
52 | > amazon-textract --input-document "s3://amazon-textract-public-content/blogs/2-pager-different-dimensions.pdf" | amazon-textract-pipeline-pagedimensions --input-document "s3://amazon-textract-public-content/blogs/2-pager-different-dimensions.pdf"  | amazon-textract-pipeline --components add_page_orientation | jq '.Blocks[] | select(.BlockType=="PAGE") | .Custom'
53 | 
54 | {
55 |   "PageDimension": {
56 |     "doc_width": 1549,
57 |     "doc_height": 370
58 |   },
59 |   "Orientation": 0
60 | }
61 | {
62 |   "PageDimension": {
63 |     "doc_width": 1079,
64 |     "doc_height": 505
65 |   },
66 |   "Orientation": 0
67 | }
68 | ```
69 | 


--------------------------------------------------------------------------------
/tpipelinepagedimensions/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 0.0.9
 3 | commit = False
 4 | tag = False
 5 | 
 6 | [bumpversion:file:setup.py]
 7 | search = version='{current_version}'
 8 | replace = version='{new_version}'
 9 | 
10 | [bumpversion:file:textractpagedimensions/_version.py]
11 | search = __version__ = '{current_version}'
12 | replace = __version__ = '{new_version}'
13 | 
14 | [bdist_wheel]
15 | universal = 1
16 | 
17 | [flake8]
18 | exclude = docs
19 | 
20 | [aliases]
21 | 


--------------------------------------------------------------------------------
/tpipelinepagedimensions/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from setuptools import setup, find_packages
 4 | 
 5 | 
 6 | def read(fname):
 7 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 8 | 
 9 | 
# Runtime dependencies handed to ``install_requires`` below.
requirements = ['boto3', 'botocore', 'Pillow', 'pypdf>=3.1,<5.0']

# ``python setup.py publish-test``: rebuild the distributions, check them with
# twine and upload them to the ``pypitest`` repository, then exit.
if sys.argv[-1] == 'publish-test':
    # NOTE(review): os.system runs each command in its own subshell, so this
    # ``cd`` does not affect the commands below — confirm the script is always
    # started from this directory.
    os.system(f"cd {os.path.dirname(__file__)}")
    os.system('rm -rf dist/ build/ amazon_textract_pipeline_pagedimensions.egg-info/')
    os.system('python setup.py sdist bdist_wheel')
    os.system('twine check dist/*')
    os.system('twine upload --repository pypitest dist/*')
    sys.exit()

# ``python setup.py publish``: same steps, uploading to the ``pypi`` repository.
if sys.argv[-1] == 'publish':
    # NOTE(review): same ineffective ``cd`` as in the publish-test branch.
    os.system(f"cd {os.path.dirname(__file__)}")
    os.system('rm -rf dist/ build/ amazon_textract_pipeline_pagedimensions.egg-info/')
    os.system('python setup.py sdist bdist_wheel')
    os.system('twine check dist/*')
    os.system('twine upload --repository pypi dist/*')
    sys.exit()

# NOTE(review): the version below is 0.0.10 while setup.cfg in this tree has
# current_version = 0.0.9 — confirm which one is intended.
setup(name='amazon-textract-pipeline-pagedimensions',
      packages=find_packages(exclude=['tests']),
      include_package_data=True,
      exclude_package_data={"": ["test_*.py", "__pycache__"]},
      version='0.0.10',
      description='Amazon Textract Pipeline Component to add page dimensions to page block types',
      install_requires=requirements,
      scripts=['bin/amazon-textract-pipeline-pagedimensions'],
      long_description_content_type='text/markdown',
      long_description=read('README.md'),
      author='Amazon Rekognition Textract Demoes',
      author_email='rekognition-textract-demos@amazon.com',
      url='https://github.com/aws-samples/amazon-textract-textractor/tree/master/tpipelinepagedimensions',
      keywords='amazon-textract-textractor amazon textract textractor pipeline page dimensions',
      license="Apache License Version 2.0",
      classifiers=[
          "Development Status :: 4 - Beta",
          "Topic :: Utilities",
          'License :: OSI Approved :: Apache Software License',
          'Programming Language :: Python :: 3.6',
          'Programming Language :: Python :: 3.7',
          'Programming Language :: Python :: 3.8',
          'Programming Language :: Python :: 3.9',
          'Programming Language :: Python :: 3.10',
          'Programming Language :: Python :: 3.11',
          'Programming Language :: Python :: 3.12',
      ],
      python_requires='>=3.6')
56 | 


--------------------------------------------------------------------------------
/tpipelinepagedimensions/tests/data/Textract-orginal-2021-05-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-textractor/f34c34a6c75dd4675642e52c1bad5a6c0eaf8ee4/tpipelinepagedimensions/tests/data/Textract-orginal-2021-05-10.png


--------------------------------------------------------------------------------
/tpipelinepagedimensions/tests/test_pagedimensions.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import boto3
 3 | import logging
 4 | 
 5 | from typing import List
 6 | from textractpagedimensions.t_pagedimensions import add_page_dimensions
 7 | from textractcaller.t_call import call_textract
 8 | from trp.trp2 import TDocument, TDocumentSchema, TBlock
 9 | 
10 | 
def test_dimensions_from_file():
    """Page dimensions are added for a local PNG passed by file path."""
    here = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(here, "data/Textract-orginal-2021-05-10.png")
    textract_json = call_textract(input_document=input_file)
    t_document: TDocument = TDocumentSchema().load(textract_json)
    add_page_dimensions(t_document=t_document, input_document=input_file)
    expected = {'doc_width': 1544, 'doc_height': 1065}
    assert t_document.pages[0].custom['PageDimension'] == expected
18 | 
19 | 
def test_dimensions_from_tiff(caplog):
    """Each page of a multi-page TIFF stored on S3 gets its own dimensions."""
    caplog.set_level(logging.DEBUG, logger="textractcaller")
    client = boto3.client('textract', region_name='us-east-2')
    tiff_uri = "s3://amazon-textract-public-content/blogs/multipage_tiff_example_small.tiff"

    textract_json = call_textract(input_document=tiff_uri, force_async_api=True, boto3_textract_client=client)
    t_document: TDocument = TDocumentSchema().load(textract_json)
    add_page_dimensions(t_document=t_document, input_document=tiff_uri)

    first_page, second_page = t_document.pages[0], t_document.pages[1]
    assert first_page.custom['PageDimension'] == {'doc_width': 1333.0, 'doc_height': 1000.0}
    assert second_page.custom['PageDimension'] == {'doc_width': 1362.0, 'doc_height': 1038.0}
29 | 
30 | 
def test_s3():
    """Each page of a two-page PDF stored on S3 gets its own dimensions."""
    textract_client = boto3.client('textract', region_name='us-east-2')
    input_file = "s3://amazon-textract-public-content/blogs/2-pager-different-dimensions.pdf"
    j = call_textract(input_document=input_file, boto3_textract_client=textract_client)
    t_document: TDocument = TDocumentSchema().load(j)
    add_page_dimensions(t_document=t_document, input_document=input_file)
    pages: List[TBlock] = t_document.pages
    # These comparisons previously lacked `assert`, so their results were
    # discarded and the test could never fail on wrong dimensions.
    assert pages[0].custom['PageDimension'] == {'doc_width': 1549.0, 'doc_height': 370.0}
    assert pages[1].custom['PageDimension'] == {'doc_width': 1079.0, 'doc_height': 505.0}
40 | 
41 | 
def test_dimensions_from_bytes():
    """Page dimensions are attached when the input document is passed as raw bytes."""
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(SCRIPT_DIR, "data/Textract-orginal-2021-05-10.png")
    # Read the image once and release the handle before the (slow) Textract call;
    # the same immutable bytes object serves both the OCR call and the size lookup.
    with open(input_file, 'rb') as input_document_file:
        input_document = input_document_file.read()

    j = call_textract(input_document=input_document)
    t_document: TDocument = TDocumentSchema().load(j)

    add_page_dimensions(t_document=t_document, input_document=input_document)
    assert t_document.pages[0].custom['PageDimension'] == {'doc_width': 1544, 'doc_height': 1065}
55 | 


--------------------------------------------------------------------------------
/tpipelinepagedimensions/textractpagedimensions/__init__.py:
--------------------------------------------------------------------------------
1 | from ._version import __version__
2 | 
3 | import logging
4 | from logging import NullHandler
5 | 
6 | logging.getLogger(__name__).addHandler(NullHandler())
7 | 


--------------------------------------------------------------------------------
/tpipelinepagedimensions/textractpagedimensions/_version.py:
--------------------------------------------------------------------------------
# Version string of the textractpagedimensions package; re-exported from the
# package's __init__ via `from ._version import __version__`.
__version__ = '0.0.9'
2 | 


--------------------------------------------------------------------------------
/tpipelinepagedimensions/textractpagedimensions/t_pagedimensions.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import trp.trp2 as t2
  3 | import os
  4 | from typing import List, Union
  5 | from dataclasses import dataclass, asdict
  6 | from PIL import Image, ImageSequence
  7 | from pypdf import PdfReader
  8 | import boto3
  9 | import io
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | 
# File extensions (lower-case, including the leading dot) recognised by this module.
# Extensions in `only_async_suffixes` are parsed as PDF by get_size_from_filestream;
# every other input is opened as an image.
# NOTE(review): the "async"/"sync" naming presumably mirrors which Textract API
# accepts each format — confirm against the caller package.
only_async_suffixes = ['.pdf']
tiff_suffixes = ['.tiff', '.tif']
sync_suffixes = ['.png', '.jpg', '.jpeg'] + tiff_suffixes
# Every accepted extension; used to validate S3 keys and local file paths.
supported_suffixes = only_async_suffixes + sync_suffixes
 17 | 
 18 | 
 19 | @dataclass
 20 | class DocumentDimensions():
 21 |     doc_width: float
 22 |     doc_height: float
 23 | 
 24 | 
 25 | def get_size_from_filestream(fs, ext) -> List[DocumentDimensions]:
 26 |     return_value: List[DocumentDimensions] = list()
 27 |     if ext in only_async_suffixes:
 28 |         # TODO: assumes the order of pages in blocks is correct, when calling Textract with bytes the block.page is empty
 29 |         input1 = PdfReader(fs)
 30 |         for page in input1.pages:
 31 |             pdf_page = page.mediabox
 32 |             return_value.append(DocumentDimensions(doc_width=float(pdf_page[2]), doc_height=float(pdf_page[3])))
 33 |     else:
 34 |         img = Image.open(fs)
 35 |         for _, page in enumerate(ImageSequence.Iterator(img)):
 36 |             return_value.append(DocumentDimensions(doc_width=float(page.width), doc_height=float(page.height)))
 37 |     return return_value
 38 | 
 39 | 
 40 | def get_size_from_s3(s3_bucket, s3_key):
 41 |     _, ext = os.path.splitext(s3_key)
 42 |     if ext in supported_suffixes:
 43 |         s3 = boto3.client('s3')
 44 |         o = s3.get_object(Bucket=s3_bucket, Key=s3_key)
 45 |         input_bytes = o.get('Body').read()
 46 |         f = io.BytesIO(input_bytes)
 47 |         return get_size_from_filestream(f, ext)
 48 |     else:
 49 |         raise ValueError(f"{s3_key} not in {supported_suffixes}")
 50 | 
 51 | 
 52 | def get_width_height_from_s3_object(s3_bucket, s3_key):
 53 |     return get_size_from_s3(s3_bucket, s3_key)
 54 | 
 55 | 
 56 | def get_width_height_from_file(filepath):
 57 |     _, ext = os.path.splitext(filepath)
 58 |     if ext in supported_suffixes:
 59 |         with open(filepath, 'rb') as input_fs:
 60 |             return get_size_from_filestream(input_fs, ext)
 61 |     else:
 62 |         raise ValueError(f"{filepath} not in {supported_suffixes}")
 63 | 
 64 | 
 65 | def add_page_dimensions(t_document: t2.TDocument, input_document: Union[str, bytes]) -> t2.TDocument:
 66 |     """
 67 |     adds Page Dimensions to each page of the document in the form of a custom property on the Block
 68 |     e. g. {'PageDimension': {'doc_width': 1549.0, 'doc_height': 370.0} }
 69 | 
 70 |     """
 71 |     page_dimensions: List[DocumentDimensions] = list()
 72 | 
 73 |     if isinstance(input_document, str):
 74 |         if len(input_document) > 7 and input_document.lower().startswith("s3://"):
 75 |             input_document = input_document.replace("s3://", "")
 76 |             s3_bucket, s3_key = input_document.split("/", 1)
 77 |             page_dimensions = get_width_height_from_s3_object(s3_bucket=s3_bucket, s3_key=s3_key)
 78 |         else:
 79 |             page_dimensions = get_width_height_from_file(filepath=input_document)
 80 | 
 81 |     elif isinstance(input_document, (bytes, bytearray)):
 82 |         page_dimensions = get_size_from_filestream(io.BytesIO(input_document), ext=None)
 83 |     # bytes do not return a page for the Block, cannot use the mapping logic as above
 84 |     if len(t_document.pages) != len(page_dimensions):
 85 |         raise AssertionError(
 86 |             f"number of pages in document did not match number of dimensions received: document-pages: {len(t_document.pages)}, dimension-pages: {len(page_dimensions)}"
 87 |         )
 88 |     for idx, block in enumerate(t_document.pages):
 89 |         if block.custom:
 90 |             if block.page:
 91 |                 block.custom['PageDimension'] = asdict(page_dimensions[block.page - 1])
 92 |             else:
 93 |                 block.custom['PageDimension'] = asdict(page_dimensions[idx])
 94 |         else:
 95 |             if block.page:
 96 |                 block.custom = {'PageDimension': asdict(page_dimensions[block.page - 1])}
 97 |             else:
 98 |                 block.custom = {'PageDimension': asdict(page_dimensions[idx])}
 99 | 
100 |     return t_document
101 | 


--------------------------------------------------------------------------------