├── .DS_Store
├── .github
    └── PULL_REQUEST_TEMPLATE.md
├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── python
    ├── .DS_Store
    ├── 01-detect-text-local.py
    ├── 02-detect-text-s3.py
    ├── 03-reading-order.py
    ├── 04-nlp-comprehend.py
    ├── 05-nlp-medical.py
    ├── 06-translate.py
    ├── 07-search.py
    ├── 08-forms.py
    ├── 09-forms-redaction.py
    ├── 10-tables.py
    ├── 11-tables-expense.py
    ├── 12-pdf-text.py
    ├── 13-signature.py
    ├── Amazon-Textract-Pdf.pdf
    ├── Analyze_Lending_Sample.ipynb
    ├── OneKeyValue.png
    ├── OneLine.png
    ├── Textract-Analyze-ID.ipynb
    ├── Textract-MergeCell-Statement.pdf
    ├── Textract-Table-Merged-Cells-And-Headers.ipynb
    ├── Textract.ipynb
    ├── custom-queries
    │   ├── custom-queries-checks-blog.ipynb
    │   ├── samples
    │   │   ├── checks-annotations.zip
    │   │   └── checks-samples.zip
    │   └── screenshots
    │   │   ├── checks-notebook-step1.png
    │   │   ├── checks-notebook-step2.png
    │   │   ├── checks-notebook-step5_1.png
    │   │   ├── checks-notebook-step6.png
    │   │   ├── checks-notebook-step7.png
    │   │   └── checks-notebook-step8.png
    ├── employmentapp.png
    ├── expense.png
    ├── extraction-parsers
    │   ├── cms1500-parser.ipynb
    │   ├── samples
    │   │   ├── CMS1500-sample.png
    │   │   └── ub-04-Form-sample.png
    │   └── ub04-parser.ipynb
    ├── medical-notes.png
    ├── patient_intake_form_sample.jpg
    ├── queries
    │   ├── insurance-card.ipynb
    │   ├── insurance-card.png
    │   ├── mortgage-note.ipynb
    │   ├── mortgage-note.jpg
    │   ├── paystub-questions_full.csv
    │   ├── paystub-questions_subset.csv
    │   ├── paystub.ipynb
    │   ├── paystub.jpg
    │   ├── vaccination-card-s3-object.ipynb
    │   ├── vaccination-card.ipynb
    │   └── vaccination.png
    ├── simple-document-image.jpg
    ├── textract-textractor-tools.ipynb
    ├── two-column-image.jpg
    └── verification-of-employment.png
└── src-csharp
    ├── .gitignore
    ├── ArgHandlers
        ├── DetectTextHandler.cs
        ├── DetectTextS3Handler.cs
        ├── FormsHandler.cs
        ├── FormsRedactionHandler.cs
        ├── NlpComprehendHandler.cs
        ├── NlpComprehendMedicalHandler.cs
        ├── PdfTextHandler.cs
        ├── ReadingOrderHandler.cs
        ├── SearchHandler.cs
        ├── TablesExpenseHandler.cs
        ├── TablesHandler.cs
        └── TranslateHandler.cs
    ├── Program.cs
    ├── Readme.md
    ├── Services
        ├── Column.cs
        ├── ComprehendService.cs
        ├── ElasticSearchService.cs
        ├── IndexedText.cs
        ├── TextractTextAnalysisService.cs
        ├── TextractTextDetectionService.cs
        └── TranslateService.cs
    ├── TextractExtensions
        ├── Cell.cs
        ├── Field.cs
        ├── FieldKey.cs
        ├── FieldValue.cs
        ├── Form.cs
        ├── Line.cs
        ├── NewBoundingBox.cs
        ├── NewGeometry.cs
        ├── Page.cs
        ├── Row.cs
        ├── SelectionElement.cs
        ├── Table.cs
        ├── TextractDocument.cs
        └── Word.cs
    ├── appsettings.json
    ├── dotnet-core.csproj
    └── test-files
        ├── Amazon-Textract-Pdf.pdf
        ├── employmentapp.png
        ├── expense.png
        ├── medical-notes.png
        ├── redacted-employmentapp.png
        ├── simple-document-image.jpg
        └── two-column-image.jpg


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/.DS_Store


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | *Issue #, if available:*
2 | 
3 | *Description of changes:*
4 | 
5 | 
6 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | .python-version
3 | 
4 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing Guidelines
 2 | 
 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
 4 | documentation, we greatly value feedback and contributions from our community.
 5 | 
 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
 7 | information to effectively respond to your bug report or contribution.
 8 | 
 9 | 
10 | ## Reporting Bugs/Feature Requests
11 | 
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 | 
14 | When filing an issue, please check [existing open](https://github.com/aws-samples/amazon-textract-code-samples/issues), or [recently closed](https://github.com/aws-samples/amazon-textract-code-samples/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 | 
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 | 
22 | 
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 | 
26 | 1. You are working against the latest source on the *master* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 | 
30 | To send us a pull request, please:
31 | 
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 | 
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 | 
42 | 
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws-samples/amazon-textract-code-samples/labels/help%20wanted) issues is a great place to start.
45 | 
46 | 
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 | 
52 | 
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.
55 | 
56 | 
57 | ## Licensing
58 | 
59 | See the [LICENSE](https://github.com/aws-samples/amazon-textract-code-samples/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 | 
61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
62 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | this software and associated documentation files (the "Software"), to deal in
 5 | the Software without restriction, including without limitation the rights to
 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 7 | the Software, and to permit persons to whom the Software is furnished to do so.
 8 | 
 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## Amazon Textract Code Samples
 2 | 
 3 | This repository contains example code snippets showing how Amazon Textract and other AWS services can be used to get insights from documents.
 4 | 
 5 | ## Usage
 6 | 
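 7 | Run a sample from the `python` folder (the samples open their input files by relative name), for example: `python3 01-detect-text-local.py`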
 8 | 
 9 | For examples that use an S3 bucket, upload the sample images to your own S3 bucket and update the bucket name variable in the example (`s3BucketName`, or `s3_bucket_name` in `12-pdf-text.py`) before running it.
10 | 
11 | ## Python Samples
12 | 
13 | | Argument                                                    | Description                                                |
14 | | ----------------------------------------------------------- | ---------------------------------------------------------- |
15 | | [01-detect-text-local.py](./python/01-detect-text-local.py) | Example showing processing a document on local machine.    |
16 | | [02-detect-text-s3.py](./python/02-detect-text-s3.py)       | Example showing processing a document in Amazon S3 bucket. |
17 | | [03-reading-order.py](./python/03-reading-order.py)         | Example showing printing document in reading order.        |
18 | | [04-nlp-comprehend.py](./python/04-nlp-comprehend.py)       | Example showing detecting entities and sentiment.          |
19 | | [05-nlp-medical.py](./python/05-nlp-medical.py)             | Example showing detecting medical entities.                |
20 | | [06-translate.py](./python/06-translate.py)                 | Example showing translation of documents.                  |
21 | | [07-search.py](./python/07-search.py)                       | Example showing document indexing in Elasticsearch.        |
22 | | [08-forms.py](./python/08-forms.py)                         | Example showing form (key/value) processing.               |
23 | | [09-forms-redaction.py](./python/09-forms-redaction.py)     | Example showing redacting information in document.         |
24 | | [10-tables.py](./python/10-tables.py)                       | Example showing table processing.                          |
25 | | [11-tables-expense.py](./python/11-tables-expense.py)       | Example showing validation of table data.                  |
26 | | [12-pdf-text.py](./python/12-pdf-text.py)                   | Example showing PDF document processing.                   |
27 | 
28 | ## .NET Usage
29 | 
30 | ```
31 | Usage: dotnet run [--switch]
32 | To run this console app, use the following valid switches one at a time:
33 |                      --detect-text-local
34 |                      --detect-text-s3
35 |                      --pdf-text
36 |                      --forms
37 |                      --forms-redaction
38 |                      --tables
39 |                      --tables-expense
40 |                      --reading-order
41 |                      --nlp-comprehend
42 |                      --nlp-medical
43 |                      --translate
44 |                      --search
45 |       e.g. dotnet run --detect-text-s3
46 | ```
47 | 
48 | ## .NET Samples
49 | 
50 | Go to the `src-csharp` folder for the .NET samples.
51 | 
52 | | Argument            | Description                                                |
53 | | ------------------- | ---------------------------------------------------------- |
54 | | --detect-text-local | Example showing processing a document on local machine.    |
55 | | --detect-text-s3    | Example showing processing a document in Amazon S3 bucket. |
56 | | --pdf-text          | Example showing PDF document processing.                   |
57 | | --forms             | Example showing form (key/value) processing.               |
58 | | --forms-redaction   | Example showing redacting information in document.         |
59 | | --tables            | Example showing table processing.                          |
60 | | --tables-expense    | Example showing validation of table data.                  |
61 | | --reading-order     | Example showing printing document in reading order.        |
62 | | --nlp-comprehend    | Example showing detecting entities and sentiment.          |
63 | | --nlp-medical       | Example showing detecting medical entities.                |
64 | | --translate         | Example showing translation of documents.                  |
65 | | --search            | Example showing document indexing in Elasticsearch.        |
66 | 
67 | ## Other Resources
68 | 
69 | - [Large scale document processing with Amazon Textract - Reference Architecture](https://github.com/aws-samples/amazon-textract-serverless-large-scale-document-processing)
70 | - [Batch processing tool](https://github.com/aws-samples/amazon-textract-textractor)
71 | - [JSON response parser](https://github.com/aws-samples/amazon-textract-response-parser)
72 | 
73 | ## License Summary
74 | 
75 | This sample code is made available under the MIT-0 license. See the LICENSE file.
76 | 


--------------------------------------------------------------------------------
/python/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/.DS_Store


--------------------------------------------------------------------------------
/python/01-detect-text-local.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | 
 3 | # Document
 4 | documentName = "simple-document-image.jpg"
 5 | 
 6 | # Read document content
 7 | with open(documentName, 'rb') as document:
 8 |     imageBytes = bytearray(document.read())
 9 | 
10 | # Amazon Textract client
11 | textract = boto3.client('textract')
12 | 
13 | # Call Amazon Textract
14 | response = textract.detect_document_text(Document={'Bytes': imageBytes})
15 | 
16 | #print(response)
17 | 
18 | # Print detected text
19 | for item in response["Blocks"]:
20 |     if item["BlockType"] == "LINE":
21 |         print ('\033[94m' +  item["Text"] + '\033[0m')
22 | 


--------------------------------------------------------------------------------
/python/02-detect-text-s3.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | 
 3 | # Document
 4 | s3BucketName = "ki-textract-demo-docs"
 5 | documentName = "simple-document-image.jpg"
 6 | 
 7 | # Amazon Textract client
 8 | textract = boto3.client('textract')
 9 | 
10 | # Call Amazon Textract
11 | response = textract.detect_document_text(
12 |     Document={
13 |         'S3Object': {
14 |             'Bucket': s3BucketName,
15 |             'Name': documentName
16 |         }
17 |     })
18 | 
19 | #print(response)
20 | 
21 | # Print detected text
22 | for item in response["Blocks"]:
23 |     if item["BlockType"] == "LINE":
24 |         print ('\033[94m' +  item["Text"] + '\033[0m')
25 | 


--------------------------------------------------------------------------------
/python/03-reading-order.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | 
 3 | # Document
 4 | documentName = "two-column-image.jpg"
 5 | 
 6 | # Amazon Textract client
 7 | textract = boto3.client('textract')
 8 | 
 9 | # Call Amazon Textract
10 | with open(documentName, "rb") as document:
11 |     response = textract.detect_document_text(
12 |         Document={
13 |             'Bytes': document.read(),
14 |         }
15 |     )
16 | 
17 | #print(response)
18 | 
19 | # Detect columns and print lines
20 | columns = []
21 | lines = []
22 | for item in response["Blocks"]:
23 |       if item["BlockType"] == "LINE":
24 |         column_found=False
25 |         for index, column in enumerate(columns):
26 |             bbox_left = item["Geometry"]["BoundingBox"]["Left"]
27 |             bbox_right = item["Geometry"]["BoundingBox"]["Left"] + item["Geometry"]["BoundingBox"]["Width"]
28 |             bbox_centre = item["Geometry"]["BoundingBox"]["Left"] + item["Geometry"]["BoundingBox"]["Width"]/2
29 |             column_centre = column['left'] + column['right']/2
30 | 
31 |             if (bbox_centre > column['left'] and bbox_centre < column['right']) or (column_centre > bbox_left and column_centre < bbox_right):
32 |                 #Bbox appears inside the column
33 |                 lines.append([index, item["Text"]])
34 |                 column_found=True
35 |                 break
36 |         if not column_found:
37 |             columns.append({'left':item["Geometry"]["BoundingBox"]["Left"], 'right':item["Geometry"]["BoundingBox"]["Left"] + item["Geometry"]["BoundingBox"]["Width"]})
38 |             lines.append([len(columns)-1, item["Text"]])
39 | 
40 | lines.sort(key=lambda x: x[0])
41 | for line in lines:
42 |     print (line[1])
43 | 


--------------------------------------------------------------------------------
/python/04-nlp-comprehend.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | 
 3 | # Document
 4 | documentName = "simple-document-image.jpg"
 5 | 
 6 | # Amazon Textract client
 7 | textract = boto3.client('textract')
 8 | 
 9 | # Call Amazon Textract
10 | with open(documentName, "rb") as document:
11 |     response = textract.detect_document_text(
12 |         Document={
13 |             'Bytes': document.read(),
14 |         }
15 |     )
16 | 
17 | #print(response)
18 | 
19 | # Print text
20 | print("\nText\n========")
21 | text = ""
22 | for item in response["Blocks"]:
23 |     if item["BlockType"] == "LINE":
24 |         print ('\033[94m' +  item["Text"] + '\033[0m')
25 |         text = text + " " + item["Text"]
26 | 
27 | # Amazon Comprehend client
28 | comprehend = boto3.client('comprehend')
29 | 
30 | # Detect sentiment
31 | sentiment =  comprehend.detect_sentiment(LanguageCode="en", Text=text)
32 | print ("\nSentiment\n========\n{}".format(sentiment.get('Sentiment')))
33 | 
34 | # Detect entities
35 | entities =  comprehend.detect_entities(LanguageCode="en", Text=text)
36 | print("\nEntities\n========")
37 | for entity in entities["Entities"]:
38 |     print ("{}\t=>\t{}".format(entity["Type"], entity["Text"]))
39 | 


--------------------------------------------------------------------------------
/python/05-nlp-medical.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | 
 3 | # Document
 4 | documentName = "medical-notes.png"
 5 | 
 6 | # Amazon Textract client
 7 | textract = boto3.client('textract')
 8 | 
 9 | # Call Amazon Textract
10 | with open(documentName, "rb") as document:
11 |     response = textract.detect_document_text(
12 |         Document={
13 |             'Bytes': document.read(),
14 |         }
15 |     )
16 | 
17 | #print(response)
18 | 
19 | # Print text
20 | print("\nText\n========")
21 | text = ""
22 | for item in response["Blocks"]:
23 |     if item["BlockType"] == "LINE":
24 |         print ('\033[94m' +  item["Text"] + '\033[0m')
25 |         text = text + " " + item["Text"]
26 | 
27 | # Amazon Comprehend client
28 | comprehend = boto3.client('comprehendmedical')
29 | 
30 | # Detect medical entities
31 | entities =  comprehend.detect_entities(Text=text)
32 | print("\nMedical Entities\n========")
33 | for entity in entities["Entities"]:
34 |     print("- {}".format(entity["Text"]))
35 |     print ("   Type: {}".format(entity["Type"]))
36 |     print ("   Category: {}".format(entity["Category"]))
37 |     if(entity["Traits"]):
38 |         print("   Traits:")
39 |         for trait in entity["Traits"]:
40 |             print ("    - {}".format(trait["Name"]))
41 |     print("\n")
42 | 


--------------------------------------------------------------------------------
/python/06-translate.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | 
 3 | # Document
 4 | documentName = "simple-document-image.jpg"
 5 | 
 6 | # Amazon Textract client
 7 | textract = boto3.client('textract')
 8 | 
 9 | # Call Amazon Textract
10 | with open(documentName, "rb") as document:
11 |     response = textract.detect_document_text(
12 |         Document={
13 |             'Bytes': document.read(),
14 |         }
15 |     )
16 | 
17 | #print(response)
18 | 
19 | # Amazon Translate client
20 | translate = boto3.client('translate')
21 | 
22 | print ('')
23 | for item in response["Blocks"]:
24 |     if item["BlockType"] == "LINE":
25 |         print ('\033[94m' +  item["Text"] + '\033[0m')
26 |         result = translate.translate_text(Text=item["Text"], SourceLanguageCode="en", TargetLanguageCode="de")
27 |         print ('\033[92m' + result.get('TranslatedText') + '\033[0m')
28 |     print ('')
29 | 


--------------------------------------------------------------------------------
/python/07-search.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | from elasticsearch import Elasticsearch, RequestsHttpConnection
 3 | from requests_aws4auth import AWS4Auth
 4 | 
 5 | def indexDocument(bucketName, objectName, text):
 6 | 
 7 |     # Update host with endpoint of your Elasticsearch cluster
 8 |     #host = "search--xxxxxxxxxxxxxx.us-east-1.es.amazonaws.com
 9 |     host = "searchxxxxxxxxxxxxxxxx.us-east-1.es.amazonaws.com"
10 |     region = 'us-east-1'
11 | 
12 |     if(text):
13 |         service = 'es'
14 |         ss = boto3.Session()
15 |         credentials = ss.get_credentials()
16 |         region = ss.region_name
17 | 
18 |         awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token)
19 | 
20 |         es = Elasticsearch(
21 |             hosts = [{'host': host, 'port': 443}],
22 |             http_auth = awsauth,
23 |             use_ssl = True,
24 |             verify_certs = True,
25 |             connection_class = RequestsHttpConnection
26 |         )
27 | 
28 |         document = {
29 |             "name": "{}".format(objectName),
30 |             "bucket" : "{}".format(bucketName),
31 |             "content" : text
32 |         }
33 | 
34 |         es.index(index="textract", doc_type="document", id=objectName, body=document)
35 | 
36 |         print("Indexed document: {}".format(objectName))
37 | 
38 | # Document
39 | s3BucketName = "ki-textract-demo-docs"
40 | documentName = "simple-document-image.jpg"
41 | 
42 | # Amazon Textract client
43 | textract = boto3.client('textract')
44 | 
45 | # Call Amazon Textract
46 | response = textract.detect_document_text(
47 |     Document={
48 |         'S3Object': {
49 |             'Bucket': s3BucketName,
50 |             'Name': documentName
51 |         }
52 |     })
53 | 
54 | #print(response)
55 | 
56 | # Print detected text
57 | text = ""
58 | for item in response["Blocks"]:
59 |     if item["BlockType"] == "LINE":
60 |         print ('\033[94m' +  item["Text"] + '\033[0m')
61 |         text += item["Text"]
62 | 
63 | indexDocument(s3BucketName, documentName, text)
64 | 
65 | # You can view index documents in Kibana Dashboard


--------------------------------------------------------------------------------
/python/08-forms.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | from trp import Document
 3 | 
 4 | # Document
 5 | documentName = "employmentapp.png"
 6 | 
 7 | # Amazon Textract client
 8 | textract = boto3.client('textract')
 9 | 
10 | # Call Amazon Textract
11 | with open(documentName, "rb") as document:
12 |     response = textract.analyze_document(
13 |         Document={
14 |             'Bytes': document.read(),
15 |         },
16 |         FeatureTypes=["FORMS"])
17 | 
18 | #print(response)
19 | 
20 | doc = Document(response)
21 | 
22 | for page in doc.pages:
23 |     # Print fields
24 |     print("Fields:")
25 |     for field in page.form.fields:
26 |         print("Key: {}, Value: {}".format(field.key, field.value))
27 | 
28 |     # Get field by key
29 |     print("\nGet Field by Key:")
30 |     key = "Phone Number:"
31 |     field = page.form.getFieldByKey(key)
32 |     if(field):
33 |         print("Key: {}, Value: {}".format(field.key, field.value))
34 | 
35 |     # Search fields by key
36 |     print("\nSearch Fields:")
37 |     key = "address"
38 |     fields = page.form.searchFieldsByKey(key)
39 |     for field in fields:
40 |         print("Key: {}, Value: {}".format(field.key, field.value))
41 | 


--------------------------------------------------------------------------------
/python/09-forms-redaction.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | from trp import Document
 3 | from PIL import Image, ImageDraw
 4 | 
 5 | # Document
 6 | documentName = "employmentapp.png"
 7 | 
 8 | # Amazon Textract client
 9 | textract = boto3.client('textract')
10 | 
11 | # Call Amazon Textract
12 | with open(documentName, "rb") as document:
13 |     response = textract.analyze_document(
14 |         Document={
15 |             'Bytes': document.read(),
16 |         },
17 |         FeatureTypes=["FORMS"])
18 | 
19 | #print(response)
20 | 
21 | doc = Document(response)
22 | 
23 | # Redact document
24 | img = Image.open(documentName)
25 | 
26 | width, height = img.size
27 | 
28 | if(doc.pages):
29 |     page = doc.pages[0]
30 |     for field in page.form.fields:
31 |         if(field.key and field.value and "address" in field.key.text.lower()):
32 |         #if(field.key and field.value):
33 |             print("Redacting => Key: {}, Value: {}".format(field.key.text, field.value.text))
34 |             
35 |             x1 = field.value.geometry.boundingBox.left*width
36 |             y1 = field.value.geometry.boundingBox.top*height-2
37 |             x2 = x1 + (field.value.geometry.boundingBox.width*width)+5
38 |             y2 = y1 + (field.value.geometry.boundingBox.height*height)+2
39 | 
40 |             draw = ImageDraw.Draw(img)
41 |             draw.rectangle([x1, y1, x2, y2], fill="Black")
42 | 
43 | img.save("redacted-{}".format(documentName))
44 | 


--------------------------------------------------------------------------------
/python/10-tables.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | from trp import Document
 3 | 
 4 | # Document
 5 | documentName = "employmentapp.png"
 6 | 
 7 | # Amazon Textract client
 8 | textract = boto3.client('textract')
 9 | 
10 | # Call Amazon Textract
11 | with open(documentName, "rb") as document:
12 |     response = textract.analyze_document(
13 |         Document={
14 |             'Bytes': document.read(),
15 |         },
16 |         FeatureTypes=["TABLES"])
17 | 
18 | #print(response)
19 | 
20 | doc = Document(response)
21 | 
22 | for page in doc.pages:
23 |      # Print tables
24 |     for table in page.tables:
25 |         for r, row in enumerate(table.rows):
26 |             for c, cell in enumerate(row.cells):
27 |                 print("Table[{}][{}] = {}".format(r, c, cell.text))
28 | 


--------------------------------------------------------------------------------
/python/11-tables-expense.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | from trp import Document
 3 | 
 4 | # Document
 5 | documentName = "expense.png"
 6 | 
 7 | # Amazon Textract client
 8 | textract = boto3.client('textract')
 9 | 
10 | # Call Amazon Textract
11 | with open(documentName, "rb") as document:
12 |     response = textract.analyze_document(
13 |         Document={
14 |             'Bytes': document.read(),
15 |         },
16 |         FeatureTypes=["TABLES"])
17 | 
18 | #print(response)
19 | 
20 | doc = Document(response)
21 | 
22 | def isFloat(input):
23 |   try:
24 |     float(input)
25 |   except ValueError:
26 |     return False
27 |   return True
28 | 
29 | warning = ""
30 | for page in doc.pages:
31 |      # Print tables
32 |     for table in page.tables:
33 |         for r, row in enumerate(table.rows):
34 |             itemName  = ""
35 |             for c, cell in enumerate(row.cells):
36 |                 print("Table[{}][{}] = {}".format(r, c, cell.text))
37 |                 if(c == 0):
38 |                     itemName = cell.text
39 |                 elif(c == 4 and isFloat(cell.text)):
40 |                     value = float(cell.text)
41 |                     if(value > 1000):
42 |                         warning += "{} is greater than $1000.".format(itemName)
43 | if(warning):
44 |     print("\nReview needed:\n====================\n" + warning)
45 | 


--------------------------------------------------------------------------------
/python/12-pdf-text.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | import time
 3 | 
 4 | 
 5 | def start_job(client, s3_bucket_name, object_name):
 6 |     response = None
 7 |     response = client.start_document_text_detection(
 8 |         DocumentLocation={
 9 |             'S3Object': {
10 |                 'Bucket': s3_bucket_name,
11 |                 'Name': object_name
12 |             }})
13 | 
14 |     return response["JobId"]
15 | 
16 | 
17 | def is_job_complete(client, job_id):
18 |     time.sleep(1)
19 |     response = client.get_document_text_detection(JobId=job_id)
20 |     status = response["JobStatus"]
21 |     print("Job status: {}".format(status))
22 | 
23 |     while(status == "IN_PROGRESS"):
24 |         time.sleep(1)
25 |         response = client.get_document_text_detection(JobId=job_id)
26 |         status = response["JobStatus"]
27 |         print("Job status: {}".format(status))
28 | 
29 |     return status
30 | 
31 | 
32 | def get_job_results(client, job_id):
33 |     pages = []
34 |     time.sleep(1)
35 |     response = client.get_document_text_detection(JobId=job_id)
36 |     pages.append(response)
37 |     print("Resultset page received: {}".format(len(pages)))
38 |     next_token = None
39 |     if 'NextToken' in response:
40 |         next_token = response['NextToken']
41 | 
42 |     while next_token:
43 |         time.sleep(1)
44 |         response = client.\
45 |             get_document_text_detection(JobId=job_id, NextToken=next_token)
46 |         pages.append(response)
47 |         print("Resultset page received: {}".format(len(pages)))
48 |         next_token = None
49 |         if 'NextToken' in response:
50 |             next_token = response['NextToken']
51 | 
52 |     return pages
53 | 
54 | 
55 | if __name__ == "__main__":
56 |     # Document
57 |     s3_bucket_name = "ki-textract-demo-docs"
58 |     document_name = "Amazon-Textract-Pdf.pdf"
59 |     region = "us-east-1"
60 |     client = boto3.client('textract', region_name=region)
61 | 
62 |     job_id = start_job(client, s3_bucket_name, document_name)
63 |     print("Started job with id: {}".format(job_id))
64 |     if is_job_complete(client, job_id):
65 |         response = get_job_results(client, job_id)
66 | 
67 |     # print(response)
68 | 
69 |     # Print detected text
70 |     for result_page in response:
71 |         for item in result_page["Blocks"]:
72 |             if item["BlockType"] == "LINE":
73 |                 print('\033[94m' + item["Text"] + '\033[0m')
74 | 


--------------------------------------------------------------------------------
/python/13-signature.py:
--------------------------------------------------------------------------------
 1 | import boto3
 2 | import json
 3 | from trp import Document
 4 | from tabulate import tabulate
 5 | 
 6 | #create a Textract Client
 7 | textract = boto3.client('textract')
 8 | #Document
 9 | documentName = image_filename
10 | 
11 | response = None
12 | with open(image_filename, 'rb') as document:
13 |     imageBytes = bytearray(document.read())
14 | 
15 | # Call Textract AnalyzeDocument by passing a document from local disk
16 | response = textract.analyze_document(
17 |     Document={'Bytes': imageBytes},
18 |     FeatureTypes=["FORMS",'SIGNATURES']
19 | )
20 | 
21 | #print detected text
22 | d = []
23 | for item in response["Blocks"]:
24 |     if item["BlockType"] == "SIGNATURE":
25 |         d.append([item["Id"],item["Geometry"]])
26 | 
27 | print(tabulate(d, headers=["Id", "Geometry"],tablefmt="grid", maxcolwidths= [None,100]))
28 | 
29 | 
30 | doc = Document(response)
31 | d = []
32 | 
33 | for page in doc.pages:
34 |     # Search fields by key
35 |     print("\nSearch Fields:")
36 |     key = "Signature"
37 |     fields = page.form.searchFieldsByKey(key)
38 |     for field in fields:
39 |         d.append([field.key, field.value])        
40 | 
41 | print(tabulate(d, headers=["Key", "Value"]))
42 | 


--------------------------------------------------------------------------------
/python/Amazon-Textract-Pdf.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/Amazon-Textract-Pdf.pdf


--------------------------------------------------------------------------------
/python/OneKeyValue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/OneKeyValue.png


--------------------------------------------------------------------------------
/python/OneLine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/OneLine.png


--------------------------------------------------------------------------------
/python/Textract-Analyze-ID.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "602739d2",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "### Amazon Textract Analyze ID\n",
  9 |     "\n",
 10 |     "Amazon Textract Analyze ID will help you automatically extract information from identification documents, such as driver’s licenses and passports. Amazon Textract uses AI and ML technologies to extract information from identity documents, such as U.S. passports and driver’s licenses, without the need for templates or configuration. You can automatically extract specific information, such as date of expiry and date of birth, as well as intelligently identify and extract implied information, such as name and address."
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "id": "f1cc2940",
 16 |    "metadata": {},
 17 |    "source": [
 18 |     "Installing the caller to simplify calling Analyze ID"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 1,
 24 |    "id": "107b34fb",
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "!python -m pip install -q amazon-textract-caller --upgrade"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "id": "10c0b980",
 34 |    "metadata": {},
 35 |    "source": [
 36 |     "Also upgrade boto3 to make sure we are on the latest boto3 that includes Analyze ID"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 2,
 42 |    "id": "cc280ce2",
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "!python -m pip install -q boto3 botocore --upgrade"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 3,
 52 |    "id": "cd3d8238",
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "import boto3\n",
 57 |     "import botocore\n",
 58 |     "from textractcaller import call_textract_analyzeid"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "id": "5cb62607",
 64 |    "metadata": {},
 65 |    "source": [
 66 |     "The sample driver's license image is located in an S3 bucket in us-east-2, so we pass in that region to the boto3 client"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": 5,
 72 |    "id": "f85fc212",
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "textract_client = boto3.client('textract', region_name='us-east-2')\n",
 77 |     "j = call_textract_analyzeid(document_pages=[\"s3://amazon-textract-public-content/analyzeid/driverlicense.png\"], \n",
 78 |     "                            boto3_textract_client=textract_client)"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "markdown",
 83 |    "id": "ee7dc4e8",
 84 |    "metadata": {},
 85 |    "source": [
 86 |     "printing out the JSON response"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": 6,
 92 |    "id": "d5417d43",
 93 |    "metadata": {},
 94 |    "outputs": [
 95 |     {
 96 |      "name": "stdout",
 97 |      "output_type": "stream",
 98 |      "text": [
 99 |       "{\n",
100 |       "  \"IdentityDocuments\": [\n",
101 |       "    {\n",
102 |       "      \"DocumentIndex\": 1,\n",
103 |       "      \"IdentityDocumentFields\": [\n",
104 |       "        {\n",
105 |       "          \"Type\": {\n",
106 |       "            \"Text\": \"FIRST_NAME\"\n",
107 |       "          },\n",
108 |       "          \"ValueDetection\": {\n",
109 |       "            \"Text\": \"JORGE\",\n",
110 |       "            \"Confidence\": 98.78211975097656\n",
111 |       "          }\n",
112 |       "        },\n",
113 |       "        {\n",
114 |       "          \"Type\": {\n",
115 |       "            \"Text\": \"LAST_NAME\"\n",
116 |       "          },\n",
117 |       "          \"ValueDetection\": {\n",
118 |       "            \"Text\": \"SOUZA\",\n",
119 |       "            \"Confidence\": 98.82009887695312\n",
120 |       "          }\n",
121 |       "        },\n",
122 |       "        {\n",
123 |       "          \"Type\": {\n",
124 |       "            \"Text\": \"MIDDLE_NAME\"\n",
125 |       "          },\n",
126 |       "          \"ValueDetection\": {\n",
127 |       "            \"Text\": \"\",\n",
128 |       "            \"Confidence\": 99.39620208740234\n",
129 |       "          }\n",
130 |       "        },\n",
131 |       "        {\n",
132 |       "          \"Type\": {\n",
133 |       "            \"Text\": \"SUFFIX\"\n",
134 |       "          },\n",
135 |       "          \"ValueDetection\": {\n",
136 |       "            \"Text\": \"\",\n",
137 |       "            \"Confidence\": 99.65946960449219\n",
138 |       "          }\n",
139 |       "        },\n",
140 |       "        {\n",
141 |       "          \"Type\": {\n",
142 |       "            \"Text\": \"CITY_IN_ADDRESS\"\n",
143 |       "          },\n",
144 |       "          \"ValueDetection\": {\n",
145 |       "            \"Text\": \"ANYTOWN\",\n",
146 |       "            \"Confidence\": 98.8210220336914\n",
147 |       "          }\n",
148 |       "        },\n",
149 |       "        {\n",
150 |       "          \"Type\": {\n",
151 |       "            \"Text\": \"ZIP_CODE_IN_ADDRESS\"\n",
152 |       "          },\n",
153 |       "          \"ValueDetection\": {\n",
154 |       "            \"Text\": \"02127\",\n",
155 |       "            \"Confidence\": 99.0246353149414\n",
156 |       "          }\n",
157 |       "        },\n",
158 |       "        {\n",
159 |       "          \"Type\": {\n",
160 |       "            \"Text\": \"STATE_IN_ADDRESS\"\n",
161 |       "          },\n",
162 |       "          \"ValueDetection\": {\n",
163 |       "            \"Text\": \"MA\",\n",
164 |       "            \"Confidence\": 99.53130340576172\n",
165 |       "          }\n",
166 |       "        },\n",
167 |       "        {\n",
168 |       "          \"Type\": {\n",
169 |       "            \"Text\": \"STATE_NAME\"\n",
170 |       "          },\n",
171 |       "          \"ValueDetection\": {\n",
172 |       "            \"Text\": \"MASSACHUSETTS\",\n",
173 |       "            \"Confidence\": 98.22105407714844\n",
174 |       "          }\n",
175 |       "        },\n",
176 |       "        {\n",
177 |       "          \"Type\": {\n",
178 |       "            \"Text\": \"DOCUMENT_NUMBER\"\n",
179 |       "          },\n",
180 |       "          \"ValueDetection\": {\n",
181 |       "            \"Text\": \"820BAC729CBAC\",\n",
182 |       "            \"Confidence\": 96.05117797851562\n",
183 |       "          }\n",
184 |       "        },\n",
185 |       "        {\n",
186 |       "          \"Type\": {\n",
187 |       "            \"Text\": \"EXPIRATION_DATE\"\n",
188 |       "          },\n",
189 |       "          \"ValueDetection\": {\n",
190 |       "            \"Text\": \"01/20/2020\",\n",
191 |       "            \"NormalizedValue\": {\n",
192 |       "              \"Value\": \"2020-01-20T00:00:00\",\n",
193 |       "              \"ValueType\": \"Date\"\n",
194 |       "            },\n",
195 |       "            \"Confidence\": 98.38336944580078\n",
196 |       "          }\n",
197 |       "        },\n",
198 |       "        {\n",
199 |       "          \"Type\": {\n",
200 |       "            \"Text\": \"DATE_OF_BIRTH\"\n",
201 |       "          },\n",
202 |       "          \"ValueDetection\": {\n",
203 |       "            \"Text\": \"03/18/1978\",\n",
204 |       "            \"NormalizedValue\": {\n",
205 |       "              \"Value\": \"1978-03-18T00:00:00\",\n",
206 |       "              \"ValueType\": \"Date\"\n",
207 |       "            },\n",
208 |       "            \"Confidence\": 98.17178344726562\n",
209 |       "          }\n",
210 |       "        },\n",
211 |       "        {\n",
212 |       "          \"Type\": {\n",
213 |       "            \"Text\": \"DATE_OF_ISSUE\"\n",
214 |       "          },\n",
215 |       "          \"ValueDetection\": {\n",
216 |       "            \"Text\": \"\",\n",
217 |       "            \"Confidence\": 89.29450988769531\n",
218 |       "          }\n",
219 |       "        },\n",
220 |       "        {\n",
221 |       "          \"Type\": {\n",
222 |       "            \"Text\": \"ID_TYPE\"\n",
223 |       "          },\n",
224 |       "          \"ValueDetection\": {\n",
225 |       "            \"Text\": \"DRIVER LICENSE FRONT\",\n",
226 |       "            \"Confidence\": 98.81443786621094\n",
227 |       "          }\n",
228 |       "        },\n",
229 |       "        {\n",
230 |       "          \"Type\": {\n",
231 |       "            \"Text\": \"ENDORSEMENTS\"\n",
232 |       "          },\n",
233 |       "          \"ValueDetection\": {\n",
234 |       "            \"Text\": \"NONE\",\n",
235 |       "            \"Confidence\": 99.27168273925781\n",
236 |       "          }\n",
237 |       "        },\n",
238 |       "        {\n",
239 |       "          \"Type\": {\n",
240 |       "            \"Text\": \"VETERAN\"\n",
241 |       "          },\n",
242 |       "          \"ValueDetection\": {\n",
243 |       "            \"Text\": \"\",\n",
244 |       "            \"Confidence\": 99.62979125976562\n",
245 |       "          }\n",
246 |       "        },\n",
247 |       "        {\n",
248 |       "          \"Type\": {\n",
249 |       "            \"Text\": \"RESTRICTIONS\"\n",
250 |       "          },\n",
251 |       "          \"ValueDetection\": {\n",
252 |       "            \"Text\": \"NONE\",\n",
253 |       "            \"Confidence\": 99.41033935546875\n",
254 |       "          }\n",
255 |       "        },\n",
256 |       "        {\n",
257 |       "          \"Type\": {\n",
258 |       "            \"Text\": \"CLASS\"\n",
259 |       "          },\n",
260 |       "          \"ValueDetection\": {\n",
261 |       "            \"Text\": \"D\",\n",
262 |       "            \"Confidence\": 99.05763244628906\n",
263 |       "          }\n",
264 |       "        },\n",
265 |       "        {\n",
266 |       "          \"Type\": {\n",
267 |       "            \"Text\": \"ADDRESS\"\n",
268 |       "          },\n",
269 |       "          \"ValueDetection\": {\n",
270 |       "            \"Text\": \"100 MAIN STREET\",\n",
271 |       "            \"Confidence\": 99.24053192138672\n",
272 |       "          }\n",
273 |       "        },\n",
274 |       "        {\n",
275 |       "          \"Type\": {\n",
276 |       "            \"Text\": \"COUNTY\"\n",
277 |       "          },\n",
278 |       "          \"ValueDetection\": {\n",
279 |       "            \"Text\": \"\",\n",
280 |       "            \"Confidence\": 99.59503173828125\n",
281 |       "          }\n",
282 |       "        },\n",
283 |       "        {\n",
284 |       "          \"Type\": {\n",
285 |       "            \"Text\": \"PLACE_OF_BIRTH\"\n",
286 |       "          },\n",
287 |       "          \"ValueDetection\": {\n",
288 |       "            \"Text\": \"\",\n",
289 |       "            \"Confidence\": 99.64707946777344\n",
290 |       "          }\n",
291 |       "        }\n",
292 |       "      ]\n",
293 |       "    }\n",
294 |       "  ],\n",
295 |       "  \"DocumentMetadata\": {\n",
296 |       "    \"Pages\": 1\n",
297 |       "  },\n",
298 |       "  \"AnalyzeIDModelVersion\": \"1.0\",\n",
299 |       "  \"ResponseMetadata\": {\n",
300 |       "    \"RequestId\": \"e7437df8-5c35-47a3-a24d-ee8436f18d1d\",\n",
301 |       "    \"HTTPStatusCode\": 200,\n",
302 |       "    \"HTTPHeaders\": {\n",
303 |       "      \"x-amzn-requestid\": \"e7437df8-5c35-47a3-a24d-ee8436f18d1d\",\n",
304 |       "      \"content-type\": \"application/x-amz-json-1.1\",\n",
305 |       "      \"content-length\": \"2223\",\n",
306 |       "      \"date\": \"Fri, 03 Dec 2021 18:56:24 GMT\"\n",
307 |       "    },\n",
308 |       "    \"RetryAttempts\": 0\n",
309 |       "  }\n",
310 |       "}\n"
311 |      ]
312 |     }
313 |    ],
314 |    "source": [
315 |     "import json\n",
316 |     "print(json.dumps(j, indent=2))"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "markdown",
321 |    "id": "c2a00b42",
322 |    "metadata": {},
323 |    "source": [
324 |     "Textract Response Parser makes it easier to get values from the JSON response"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "code",
329 |    "execution_count": 7,
330 |    "id": "e8947a56",
331 |    "metadata": {},
332 |    "outputs": [],
333 |    "source": [
334 |     "!python -m pip install -q amazon-textract-response-parser tabulate --upgrade"
335 |    ]
336 |   },
337 |   {
338 |    "cell_type": "markdown",
339 |    "id": "a2f8e820",
340 |    "metadata": {},
341 |    "source": [
342 |     "The get_values_as_list() function returns the values as a list of lists of str in the following format:\n",
343 |     "[[\"doc_number\", \"type\", \"value\", \"confidence\", \"normalized_value\", \"normalized_value_type\"]]\n"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "code",
348 |    "execution_count": 11,
349 |    "id": "e4b8c205",
350 |    "metadata": {},
351 |    "outputs": [
352 |     {
353 |      "data": {
354 |       "text/plain": [
355 |        "[['1', 'FIRST_NAME', 'JORGE', '98.78211975097656', '', ''],\n",
356 |        " ['1', 'LAST_NAME', 'SOUZA', '98.82009887695312', '', ''],\n",
357 |        " ['1', 'MIDDLE_NAME', '', '99.39620208740234', '', ''],\n",
358 |        " ['1', 'SUFFIX', '', '99.65946960449219', '', ''],\n",
359 |        " ['1', 'CITY_IN_ADDRESS', 'ANYTOWN', '98.8210220336914', '', ''],\n",
360 |        " ['1', 'ZIP_CODE_IN_ADDRESS', '02127', '99.0246353149414', '', ''],\n",
361 |        " ['1', 'STATE_IN_ADDRESS', 'MA', '99.53130340576172', '', ''],\n",
362 |        " ['1', 'STATE_NAME', 'MASSACHUSETTS', '98.22105407714844', '', ''],\n",
363 |        " ['1', 'DOCUMENT_NUMBER', '820BAC729CBAC', '96.05117797851562', '', ''],\n",
364 |        " ['1',\n",
365 |        "  'EXPIRATION_DATE',\n",
366 |        "  '01/20/2020',\n",
367 |        "  '98.38336944580078',\n",
368 |        "  '2020-01-20T00:00:00',\n",
369 |        "  'Date'],\n",
370 |        " ['1',\n",
371 |        "  'DATE_OF_BIRTH',\n",
372 |        "  '03/18/1978',\n",
373 |        "  '98.17178344726562',\n",
374 |        "  '1978-03-18T00:00:00',\n",
375 |        "  'Date'],\n",
376 |        " ['1', 'DATE_OF_ISSUE', '', '89.29450988769531', '', ''],\n",
377 |        " ['1', 'ID_TYPE', 'DRIVER LICENSE FRONT', '98.81443786621094', '', ''],\n",
378 |        " ['1', 'ENDORSEMENTS', 'NONE', '99.27168273925781', '', ''],\n",
379 |        " ['1', 'VETERAN', '', '99.62979125976562', '', ''],\n",
380 |        " ['1', 'RESTRICTIONS', 'NONE', '99.41033935546875', '', ''],\n",
381 |        " ['1', 'CLASS', 'D', '99.05763244628906', '', ''],\n",
382 |        " ['1', 'ADDRESS', '100 MAIN STREET', '99.24053192138672', '', ''],\n",
383 |        " ['1', 'COUNTY', '', '99.59503173828125', '', ''],\n",
384 |        " ['1', 'PLACE_OF_BIRTH', '', '99.64707946777344', '', '']]"
385 |       ]
386 |      },
387 |      "execution_count": 11,
388 |      "metadata": {},
389 |      "output_type": "execute_result"
390 |     }
391 |    ],
392 |    "source": [
393 |     "import trp.trp2_analyzeid as t2id\n",
394 |     "\n",
395 |     "doc: t2id.TAnalyzeIdDocument = t2id.TAnalyzeIdDocumentSchema().load(j)\n",
396 |     "result = doc.get_values_as_list()\n",
397 |     "result"
398 |    ]
399 |   },
400 |   {
401 |    "cell_type": "markdown",
402 |    "id": "cb963302",
403 |    "metadata": {},
404 |    "source": [
405 |     "Using tabulate we get pretty-printed output"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "code",
410 |    "execution_count": 13,
411 |    "id": "6c4fdfef",
412 |    "metadata": {},
413 |    "outputs": [
414 |     {
415 |      "name": "stdout",
416 |      "output_type": "stream",
417 |      "text": [
418 |       "-------------------  --------------------\n",
419 |       "FIRST_NAME           JORGE\n",
420 |       "LAST_NAME            SOUZA\n",
421 |       "MIDDLE_NAME\n",
422 |       "SUFFIX\n",
423 |       "CITY_IN_ADDRESS      ANYTOWN\n",
424 |       "ZIP_CODE_IN_ADDRESS  02127\n",
425 |       "STATE_IN_ADDRESS     MA\n",
426 |       "STATE_NAME           MASSACHUSETTS\n",
427 |       "DOCUMENT_NUMBER      820BAC729CBAC\n",
428 |       "EXPIRATION_DATE      01/20/2020\n",
429 |       "DATE_OF_BIRTH        03/18/1978\n",
430 |       "DATE_OF_ISSUE\n",
431 |       "ID_TYPE              DRIVER LICENSE FRONT\n",
432 |       "ENDORSEMENTS         NONE\n",
433 |       "VETERAN\n",
434 |       "RESTRICTIONS         NONE\n",
435 |       "CLASS                D\n",
436 |       "ADDRESS              100 MAIN STREET\n",
437 |       "COUNTY\n",
438 |       "PLACE_OF_BIRTH\n",
439 |       "-------------------  --------------------\n"
440 |      ]
441 |     }
442 |    ],
443 |    "source": [
444 |     "from tabulate import tabulate\n",
445 |     "print(tabulate([x[1:3] for x in result]))"
446 |    ]
447 |   },
448 |   {
449 |    "cell_type": "markdown",
450 |    "id": "2d09fc61",
451 |    "metadata": {},
452 |    "source": [
453 |     "Just getting the FIRST_NAME"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "code",
458 |    "execution_count": 14,
459 |    "id": "3730f49e",
460 |    "metadata": {},
461 |    "outputs": [
462 |     {
463 |      "data": {
464 |       "text/plain": [
465 |        "['JORGE']"
466 |       ]
467 |      },
468 |      "execution_count": 14,
469 |      "metadata": {},
470 |      "output_type": "execute_result"
471 |     }
472 |    ],
473 |    "source": [
474 |     "[x[2] for x in result if x[1]=='FIRST_NAME']"
475 |    ]
476 |   }
477 |  ],
478 |  "metadata": {
479 |   "kernelspec": {
480 |    "display_name": "Python 3 (ipykernel)",
481 |    "language": "python",
482 |    "name": "python3"
483 |   },
484 |   "language_info": {
485 |    "codemirror_mode": {
486 |     "name": "ipython",
487 |     "version": 3
488 |    },
489 |    "file_extension": ".py",
490 |    "mimetype": "text/x-python",
491 |    "name": "python",
492 |    "nbconvert_exporter": "python",
493 |    "pygments_lexer": "ipython3",
494 |    "version": "3.9.6"
495 |   }
496 |  },
497 |  "nbformat": 4,
498 |  "nbformat_minor": 5
499 | }
500 | 


--------------------------------------------------------------------------------
/python/Textract-MergeCell-Statement.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/Textract-MergeCell-Statement.pdf


--------------------------------------------------------------------------------
/python/Textract-Table-Merged-Cells-And-Headers.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "4c2f249b",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "## Detecting Merged Cells And Headers on fictitious bank statement"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "5b16cdb3",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "We will be using the modules below:\n",
 17 |     "* amazon-textract-caller (https://pypi.org/project/amazon-textract-caller/) to invoke Amazon Textract API on our behalf\n",
 18 |     "* amazon-textract-response-parser (https://pypi.org/project/amazon-textract-response-parser/) to parse the response payload\n",
 19 |     "* amazon-textract-prettyprinter (https://pypi.org/project/amazon-textract-prettyprinter/) to \"pretty-print\" tables"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": null,
 25 |    "id": "a6e6c072",
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "!pip install boto3\n",
 30 |     "!pip install amazon-textract-caller\n",
 31 |     "!pip install amazon-textract-prettyprinter\n",
 32 |     "!pip install trp"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "id": "2bd23c9d",
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "import boto3\n",
 43 |     "import json\n",
 44 |     "import pandas as pd\n",
 45 |     "from textractcaller import call_textract, Textract_Features\n",
 46 |     "from textractprettyprinter.t_pretty_print import Pretty_Print_Table_Format, Textract_Pretty_Print, get_string, get_tables_string\n",
 47 |     "from trp import Document\n",
 48 |     "from trp.trp2 import TDocument, TDocumentSchema\n",
 49 |     "from trp.t_pipeline import order_blocks_by_geo\n",
 50 |     "from IPython.display import display"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "markdown",
 55 |    "id": "f40c07e6",
 56 |    "metadata": {},
 57 |    "source": [
 58 |     "Let's initialize the boto3 session and then invoke textract_caller to perform the document processing API call and collect the response back on our behalf."
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": null,
 64 |    "id": "34479535",
 65 |    "metadata": {},
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "session = boto3.Session(profile_name='<your_profile_name>')\n",
 69 |     "documentName = \"s3://textract-table-merged-cells-data-sample/Textract-MergeCell-Statement.pdf\"\n",
 70 |     "textract_json = call_textract(input_document=documentName, features = [Textract_Features.TABLES])"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "id": "4dd698d3",
 76 |    "metadata": {},
 77 |    "source": [
 78 |     "Let's pretty-print the response payload. As you can see, by default the date is not populated across all rows."
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": null,
 84 |    "id": "3e0fb7fb",
 85 |    "metadata": {},
 86 |    "outputs": [],
 87 |    "source": [
 88 |     "print(get_string(textract_json=textract_json, output_type=[Textract_Pretty_Print.TABLES]))"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "markdown",
 93 |    "id": "36cedb05",
 94 |    "metadata": {},
 95 |    "source": [
 96 |     "Now let's load the response into an ordered document and scan the statement's table."
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": null,
102 |    "id": "50846117",
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "t_doc = TDocumentSchema().load(textract_json)\n",
107 |     "ordered_doc = order_blocks_by_geo(t_doc)\n",
108 |     "trp_doc = Document(TDocumentSchema().dump(ordered_doc))\n",
109 |     "\n",
110 |     "table_index = 1\n",
111 |     "dataframes = []\n",
112 |     "\n",
113 |     "def combine_headers(top_h, bottom_h):\n",
114 |     "    bottom_h[3] = top_h[2] + \" \" + bottom_h[3]\n",
115 |     "    bottom_h[4] = top_h[2] + \" \" + bottom_h[4]\n",
116 |     "\n",
117 |     "for page in trp_doc.pages:\n",
118 |     "    for table in page.tables:\n",
119 |     "        table_data = []\n",
120 |     "        headers = table.get_header_field_names()\n",
121 |     "        if(len(headers)>0):                                      #Let's retain the only table with headers\n",
122 |     "            print(\"Statememt headers: \"+ repr(headers))\n",
123 |     "            top_header= headers[0]\n",
124 |     "            bottom_header = headers[1]\n",
125 |     "            combine_headers(top_header, bottom_header)           #The statement has two headers. let's combine them\n",
126 |     "            for r, row in enumerate(table.rows_without_header):  #New Table attribute returning rows without headers\n",
127 |     "                table_data.append([])\n",
128 |     "                for c, cell in enumerate(row.cells):\n",
129 |     "                    table_data[r].append(cell.mergedText)        #New Cell attribute returning merged cells common values\n",
130 |     "            \n",
131 |     "            if len(table_data)>0:\n",
132 |     "                df = pd.DataFrame(table_data, columns=bottom_header)\n",
133 |     "\n",
134 |     "df"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "markdown",
139 |    "id": "77d05bcd",
140 |    "metadata": {},
141 |    "source": [
142 |     "Now we can even use multi level indexing and reproduce the table's initial structure."
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": null,
148 |    "id": "e21b6df1",
149 |    "metadata": {},
150 |    "outputs": [],
151 |    "source": [
152 |     "multi = df.set_index(['Date', 'Details'])\n",
153 |     "display(multi)"
154 |    ]
155 |   }
156 |  ],
157 |  "metadata": {
158 |   "kernelspec": {
159 |    "display_name": "Python 3 (ipykernel)",
160 |    "language": "python",
161 |    "name": "python3"
162 |   },
163 |   "language_info": {
164 |    "codemirror_mode": {
165 |     "name": "ipython",
166 |     "version": 3
167 |    },
168 |    "file_extension": ".py",
169 |    "mimetype": "text/x-python",
170 |    "name": "python",
171 |    "nbconvert_exporter": "python",
172 |    "pygments_lexer": "ipython3",
173 |    "version": "3.9.7"
174 |   }
175 |  },
176 |  "nbformat": 4,
177 |  "nbformat_minor": 5
178 | }
179 | 


--------------------------------------------------------------------------------
/python/custom-queries/custom-queries-checks-blog.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "id": "80051b34-8b9b-4b00-845c-c67ed070dc91",
   6 |    "metadata": {},
   7 |    "source": [
   8 |     "# Custom Queries Launch Blogpost: Checks processing\n",
   9 |     "This notebook will walk you through how to annotate and train Custom Queries.\n",
  10 |     "1. Option 1: Creating an adapter via the console  \n",
  11 |     "   This walkthrough covers the process of creating an adapter and then copying pre-annotated check samples to fast-track your testing.\n",
  12 |     "2. Option 2: Creating an adapter programmatically via the API  \n",
  13 |     "   This is identical to option 1 but uses Python boto3 to programmatically create a Custom Adapter and use it for testing."
  14 |    ]
  15 |   },
  16 |   {
  17 |    "cell_type": "markdown",
  18 |    "id": "0a4076bf-f2f9-40de-83d8-2b2b46d60eb8",
  19 |    "metadata": {
  20 |     "tags": []
  21 |    },
  22 |    "source": [
  23 |     "## Option 1. Create an adapter via the console and copy pre-annotated check samples\n",
  24 |     "Refer to the [Custom Queries Tutorial](https://docs.aws.amazon.com/textract/latest/dg/textract-adapters-tutorial.html) if you want to upload your own documents and annotate them."
  25 |    ]
  26 |   },
  27 |   {
  28 |    "cell_type": "markdown",
  29 |    "id": "aa029227-ae38-4ffc-b43b-6597929e0355",
  30 |    "metadata": {},
  31 |    "source": [
  32 |     "### Step 1.1: Create an adapter via console\n",
  33 |     "Navigate to the Textract console &rarr; Click on the Custom Queries button located in the sidebar &rarr; Click the Create Adapter button\n",
  34 |     "<img src=\"./screenshots/checks-notebook-step1.png\"/>"
  35 |    ]
  36 |   },
  37 |   {
  38 |    "cell_type": "markdown",
  39 |    "id": "69db73b8-deea-4a69-8812-f91a6186da54",
  40 |    "metadata": {},
  41 |    "source": [
  42 |     "### Step 1.2: Copy the adapter ID and dataset S3 bucket location from Adapter Details page.\n",
  43 |     "<img src=\"./screenshots/checks-notebook-step2.png\"/>"
  44 |    ]
  45 |   },
  46 |   {
  47 |    "cell_type": "code",
  48 |    "execution_count": null,
  49 |    "id": "9c6a8f6d-6c4d-404c-9ba5-764f90bed7f3",
  50 |    "metadata": {
  51 |     "tags": []
  52 |    },
  53 |    "outputs": [],
  54 |    "source": [
  55 |     "adapter_id=\"111111111111\"\n",
  56 |     "dataset_s3_bucket=\"textract-adapters-us-east-1-1111\""
  57 |    ]
  58 |   },
  59 |   {
  60 |    "cell_type": "markdown",
  61 |    "id": "504bf278-28dc-4899-bfc2-68618a54a96e",
  62 |    "metadata": {},
  63 |    "source": [
  64 |     "### Step 1.3: Update the manifest file with the adapter details\n",
  65 |     "Run the below cell to programmatically extract the pre-annotations and update the manifest file with your adapter ID.\n",
  66 |     "You will see a new folder created, named after your adapter ID."
  67 |    ]
  68 |   },
  69 |   {
  70 |    "cell_type": "code",
  71 |    "execution_count": null,
  72 |    "id": "75d97d06-042f-46a3-81ae-2b3357ebea1f",
  73 |    "metadata": {
  74 |     "tags": []
  75 |    },
  76 |    "outputs": [],
  77 |    "source": [
  78 |     "import shutil\n",
  79 |     "shutil.unpack_archive(\"./samples/checks-annotations.zip\", extract_dir=adapter_id)\n",
  80 |     "print(f\"Check samples archive extracted successfully to folder {adapter_id}\")\n",
  81 |     "\n",
  82 |     "!sed -i -e \"s/<s3-bucket-name>/$dataset_s3_bucket/g;s/<adapter-id>/$adapter_id/g\" \"./$adapter_id/checks-annotations/manifest.jsonl\"\n",
  83 |     "print(f\"Replaced all instances of the adapter ID with {adapter_id} and S3 BUCKET with {dataset_s3_bucket}\")"
  84 |    ]
  85 |   },
  86 |   {
  87 |    "cell_type": "markdown",
  88 |    "id": "38f46be5-68d1-42d0-9628-913ece3cd521",
  89 |    "metadata": {
  90 |     "tags": []
  91 |    },
  92 |    "source": [
  93 |     "### Step 1.4: Copy the pre-annotations to the data set location"
  94 |    ]
  95 |   },
  96 |   {
  97 |    "cell_type": "code",
  98 |    "execution_count": null,
  99 |    "id": "0a8c337c-32d1-4b41-8397-de541e5131c9",
 100 |    "metadata": {
 101 |     "tags": []
 102 |    },
 103 |    "outputs": [],
 104 |    "source": [
 105 |     "!aws s3 cp \"./$adapter_id/checks-annotations\" \"s3://$dataset_s3_bucket/adapters/$adapter_id\" --recursive\n",
 106 |     "print(\"\\nSuccessfully copied all files\")"
 107 |    ]
 108 |   },
 109 |   {
 110 |    "cell_type": "markdown",
 111 |    "id": "33991daf-71de-4cf2-9ea4-9bd23bf5aa92",
 112 |    "metadata": {
 113 |     "tags": []
 114 |    },
 115 |    "source": [
 116 |     "### Step 1.5: Refresh the adapter details page\n",
 117 |     "Return to the Textract console and refresh the adapter details page. You should see the following:\n",
 118 |     "1. The dataset is created successfully\n",
 119 |     "2. Queries have been created\n",
 120 |     "3. Documents have been verified\n",
 121 |     "\n",
 122 |     "Note: if you cannot see your adapter updated like the screenshot below, please check that the adapter ID and S3 bucket you entered in Step 1.2 are correct.\n",
 123 |     "\n",
 124 |     "<img src=\"./screenshots/checks-notebook-step5_1.png\"/>"
 125 |    ]
 126 |   },
 127 |   {
 128 |    "cell_type": "markdown",
 129 |    "id": "77694d2a-88b1-432c-a3c8-e2f2fcfabc56",
 130 |    "metadata": {},
 131 |    "source": [
 132 |     "### Step 1.6: View the pre-annotated samples \n",
 133 |     "Click on the Verify Documents button to open the dataset page. Once open, select the files and click review annotations.\n",
 134 |     "<img src=\"./screenshots/checks-notebook-step6.png\"/>"
 135 |    ]
 136 |   },
 137 |   {
 138 |    "cell_type": "markdown",
 139 |    "id": "2edde5a3-6254-475f-8354-c6055f2adccd",
 140 |    "metadata": {},
 141 |    "source": [
 142 |     "### Step 1.7: Train the Adapter\n",
 143 |     "Click on the Train Adapter button to initiate training. Training can take 1 to 30 hours to complete; however, because our dataset is small, it should complete in an hour or so.\n",
 144 |     "<img src=\"./screenshots/checks-notebook-step7.png\"/>"
 145 |    ]
 146 |   },
 147 |   {
 148 |    "cell_type": "markdown",
 149 |    "id": "34faecaf-0568-428d-b18f-fa0ab51f3298",
 150 |    "metadata": {},
 151 |    "source": [
 152 |     "### Step 1.8: Evaluate the adapter (console)\n",
 153 |     "Once the training completes, click the Evaluate Adapter button on the Adapter Details page to review the adapter performance metrics.  \n",
 154 |     "You can also test samples in the console by clicking on the Try Adapter button and uploading a sample document.\n",
 155 |     "<img src=\"./screenshots/checks-notebook-step8.png\"/>"
 156 |    ]
 157 |   },
 158 |   {
 159 |    "cell_type": "markdown",
 160 |    "id": "a7eed7da-6134-47f8-a0e7-ec83c3573e39",
 161 |    "metadata": {},
 162 |    "source": [
 163 |     "### Step 1.9: Test the adapter programmatically (API)"
 164 |    ]
 165 |   },
 166 |   {
 167 |    "cell_type": "code",
 168 |    "execution_count": null,
 169 |    "id": "bd77ef99-512e-4d0f-9a9a-5790f1b41ec7",
 170 |    "metadata": {
 171 |     "tags": []
 172 |    },
 173 |    "outputs": [],
 174 |    "source": [
 175 |     "from IPython.display import Image\n",
 176 |     "\n",
 177 |     "document_name = f\"{adapter_id}/checks-annotations/original_assets/31eb3f65-babd-4410-b9ea-596c7b35989d.jpg\"\n",
 178 |     "Image(filename=document_name) "
 179 |    ]
 180 |   },
 181 |   {
 182 |    "cell_type": "code",
 183 |    "execution_count": null,
 184 |    "id": "a215913b-8647-47a9-bd29-c756c372f2b3",
 185 |    "metadata": {
 186 |     "tags": []
 187 |    },
 188 |    "outputs": [],
 189 |    "source": [
 190 |     "!python -m pip install amazon-textract-caller --upgrade\n",
 191 |     "!python -m pip install amazon-textract-response-parser --upgrade"
 192 |    ]
 193 |   },
 194 |   {
 195 |    "cell_type": "code",
 196 |    "execution_count": null,
 197 |    "id": "00c7d85d-11fb-4178-9114-319a922f2f1d",
 198 |    "metadata": {
 199 |     "tags": []
 200 |    },
 201 |    "outputs": [],
 202 |    "source": [
 203 |     "import boto3\n",
 204 |     "from textractcaller.t_call import call_textract, Textract_Features, Query, QueriesConfig, Adapter, AdaptersConfig\n",
 205 |     "import trp.trp2 as t2\n",
 206 |     "import pandas as pd\n",
 207 |     "\n",
 208 |     "textract_client = boto3.client('textract')\n",
 209 |     "\n",
 210 |     "def tabulate_query_answers(textract_json):\n",
 211 |     "    d = t2.TDocumentSchema().load(textract_json)\n",
 212 |     "    for page in d.pages:\n",
 213 |     "        query_answers = d.get_query_answers(page=page)\n",
 214 |     "        display(pd.DataFrame(query_answers))\n",
 215 |     "\n",
 216 |     "queries = []\n",
 217 |     "queries.append(Query(text=\"What is the check#?\", alias=\"CHECK_NUMBER\", pages=[\"*\"]))\n",
 218 |     "queries.append(Query(text=\"What is the date?\", alias=\"DATE\", pages=[\"*\"]))\n",
 219 |     "queries.append(Query(text=\"What is the check amount in words?\", alias=\"CHECK_AMOUNT_WORDS\", pages=[\"*\"]))\n",
 220 |     "queries.append(Query(text=\"What is the dollar amount?\", alias=\"DOLLAR_AMOUNT\", pages=[\"*\"]))\n",
 221 |     "queries.append(Query(text=\"Who is the payee?\", alias=\"PAYEE_NAME\", pages=[\"*\"]))\n",
 222 |     "queries.append(Query(text=\"What is the customer account#\", alias=\"ACCOUNT_NUMBER\", pages=[\"*\"]))\n",
 223 |     "queries.append(Query(text=\"what is the payee address?\", alias=\"PAYEE_ADDRESS\", pages=[\"*\"]))\n",
 224 |     "queries.append(Query(text=\"What is the bank routing number?\", alias=\"BANK_ROUTING_NUMBER\", pages=[\"*\"]))\n",
 225 |     "queries.append(Query(text=\"What is the memo\", alias=\"MEMO\", pages=[\"*\"]))\n",
 226 |     "queries.append(Query(text=\"What is the account name/payer/drawer name?\", alias=\"ACCOUNT_NAME\", pages=[\"*\"]))\n",
 227 |     "queries.append(Query(text=\"What is the bank name/drawee name?\", alias=\"BANK_NAME\", pages=[\"*\"]))\n",
 228 |     "queries.append(Query(text=\"What is the MICR line?\", alias=\"MICR_LINE\", pages=[\"*\"]))\n",
 229 |     "\n",
 230 |     "\n",
 231 |     "queries_config = QueriesConfig(queries=queries)"
 232 |    ]
 233 |   },
 234 |   {
 235 |    "cell_type": "code",
 236 |    "execution_count": null,
 237 |    "id": "6055c0fa-66ad-4923-9f51-1cedf53df6bf",
 238 |    "metadata": {
 239 |     "tags": []
 240 |    },
 241 |    "outputs": [],
 242 |    "source": [
 243 |     "print(\"Calling Pre-built Textract Queries\")\n",
 244 |     "\n",
 245 |     "textract_json_prebuilt = call_textract(input_document=document_name,\n",
 246 |     "                  boto3_textract_client=textract_client,\n",
 247 |     "                  features=[Textract_Features.QUERIES],\n",
 248 |     "                  queries_config=queries_config)\n",
 249 |     "\n",
 250 |     "tabulate_query_answers(textract_json_prebuilt)"
 251 |    ]
 252 |   },
 253 |   {
 254 |    "cell_type": "code",
 255 |    "execution_count": null,
 256 |    "id": "4bb64fd4-35f5-43a5-bba6-c98912de6d90",
 257 |    "metadata": {
 258 |     "tags": []
 259 |    },
 260 |    "outputs": [],
 261 |    "source": [
 262 |     "adapter1 = Adapter(adapter_id=adapter_id, version=\"1\", pages=[\"*\"])\n",
 263 |     "adapters_config = AdaptersConfig(adapters=[adapter1])\n",
 264 |     "print(f\"Calling Custom Queries with Adapter:{adapter_id}\")\n",
 265 |     "\n",
 266 |     "textract_json_with_adapter = call_textract(input_document=document_name,\n",
 267 |     "                  boto3_textract_client=textract_client,\n",
 268 |     "                  features=[Textract_Features.QUERIES],\n",
 269 |     "                  queries_config=queries_config,\n",
 270 |     "                  adapters_config=adapters_config)\n",
 271 |     "\n",
 272 |     "tabulate_query_answers(textract_json_with_adapter)"
 273 |    ]
 274 |   },
 275 |   {
 276 |    "cell_type": "markdown",
 277 |    "id": "11215483-730e-4b87-b926-9ff7422a72b8",
 278 |    "metadata": {},
 279 |    "source": [
 280 |     "## Option 2. Create an adapter programmatically via the API\n",
 281 |     "We use the Textract Boto3 client to create an adapter. See [Textract boto3 documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/textract.html) for details.  \n",
 282 |     "Alternatively, you can use the CLI or a language of your choice. See \n",
 283 |     "[CLI Documentation](https://docs.aws.amazon.com/textract/latest/dg/textract-create-adapter.html) for details."
 284 |    ]
 285 |   },
 286 |   {
 287 |    "cell_type": "markdown",
 288 |    "id": "0b78fde3-c325-4c07-a41c-c409f47b91aa",
 289 |    "metadata": {},
 290 |    "source": [
 291 |     "### Step 2.1: Create an adapter using the CreateAdapter API\n",
 292 |     "1. On calling the CreateAdapter API, the API returns the created AdapterId. We will use this ID in subsequent steps.  \n",
 293 |     "2. We will also use the ListAdapters API to view all the adapters in the AWS account"
 294 |    ]
 295 |   },
 296 |   {
 297 |    "cell_type": "code",
 298 |    "execution_count": null,
 299 |    "id": "5fe0aac8-6f4b-41d8-aba5-4ae67d8ad9ca",
 300 |    "metadata": {},
 301 |    "outputs": [],
 302 |    "source": [
 303 |     "import boto3\n",
 304 |     "import pandas as pd\n",
 305 |     "from IPython.display import display, HTML \n",
 306 |     "\n",
 307 |     "textract_client = boto3.client('textract')\n",
 308 |     "\n",
 309 |     "response = textract_client.create_adapter(\n",
 310 |     "    AdapterName='checks-adapter-api',\n",
 311 |     "    Description='Adapter for checks processing created via the API',\n",
 312 |     "    FeatureTypes=['QUERIES'],\n",
 313 |     "    AutoUpdate='ENABLED',\n",
 314 |     "    Tags={\n",
 315 |     "        'project': 'checks-automation'\n",
 316 |     "    }\n",
 317 |     ")\n",
 318 |     "\n",
 319 |     "adapter_id = response[\"AdapterId\"]\n",
 320 |     "print(f\"Adapter created with adapter id: {adapter_id}\")"
 321 |    ]
 322 |   },
 323 |   {
 324 |    "cell_type": "code",
 325 |    "execution_count": null,
 326 |    "id": "16dd16ab-9f5f-46e1-aeee-5c2fc80b824f",
 327 |    "metadata": {
 328 |     "tags": []
 329 |    },
 330 |    "outputs": [],
 331 |    "source": [
 332 |     "response = textract_client.list_adapters()\n",
 333 |     "display(pd.DataFrame(response[\"Adapters\"]))"
 334 |    ]
 335 |   },
 336 |   {
 337 |    "cell_type": "markdown",
 338 |    "id": "9189f533-0fa3-4517-a733-48b8757589d9",
 339 |    "metadata": {},
 340 |    "source": [
 341 |     "### Step 2.2: Update and copy the document samples, manifest file and annotations to S3\n",
 342 |     "1. Provide the S3 bucket where you would like to store the test and train datasets  \n",
 343 |     "2. We copy the manifest file, annotations and samples to the bucket under the `adapters/` prefix, in a folder named after the newly created adapter ID."
 344 |    ]
 345 |   },
 346 |   {
 347 |    "cell_type": "code",
 348 |    "execution_count": null,
 349 |    "id": "105e8ee3-1587-4a79-ae46-2042a8d54ad4",
 350 |    "metadata": {
 351 |     "tags": []
 352 |    },
 353 |    "outputs": [],
 354 |    "source": [
 355 |     "dataset_s3_bucket = \"enter-s3-bucket\"\n",
 356 |     "\n",
 357 |     "# We use the same bucket for the output as the dataset bucket, with a different prefix. You can change this as required\n",
 358 |     "output_s3_bucket = dataset_s3_bucket"
 359 |    ]
 360 |   },
 361 |   {
 362 |    "cell_type": "code",
 363 |    "execution_count": null,
 364 |    "id": "9417c052-d188-4e16-8250-6dc2069407a4",
 365 |    "metadata": {
 366 |     "tags": []
 367 |    },
 368 |    "outputs": [],
 369 |    "source": [
 370 |     "import shutil\n",
 371 |     "shutil.unpack_archive(\"./samples/checks-annotations.zip\", extract_dir=adapter_id)\n",
 372 |     "print(f\"Check samples archive extracted successfully to folder {adapter_id}\")\n",
 373 |     "\n",
 374 |     "!sed -i -e \"s/<s3-bucket-name>/$dataset_s3_bucket/g;s/<adapter-id>/$adapter_id/g\" \"./$adapter_id/checks-annotations/manifest.jsonl\"\n",
 375 |     "print(f\"Replaced all instances of the adapter ID with {adapter_id} and S3 BUCKET with {dataset_s3_bucket}\")"
 376 |    ]
 377 |   },
 378 |   {
 379 |    "cell_type": "code",
 380 |    "execution_count": null,
 381 |    "id": "87575166-5842-4d34-8141-cba15d0f81be",
 382 |    "metadata": {
 383 |     "tags": []
 384 |    },
 385 |    "outputs": [],
 386 |    "source": [
 387 |     "!aws s3 cp \"./$adapter_id/checks-annotations\" \"s3://$dataset_s3_bucket/adapters/$adapter_id\" --recursive\n",
 388 |     "print(\"\\nSuccessfully copied all files\")"
 389 |    ]
 390 |   },
 391 |   {
 392 |    "cell_type": "markdown",
 393 |    "id": "ab09eaef-d7ca-44ad-9813-ac6f77179cd4",
 394 |    "metadata": {},
 395 |    "source": [
 396 |     "### Step 2.3: Begin training the Adapter by calling CreateAdapterVersion\n",
 397 |     "To begin training, we call the CreateAdapterVersion API"
 398 |    ]
 399 |   },
 400 |   {
 401 |    "cell_type": "code",
 402 |    "execution_count": null,
 403 |    "id": "daab8a79-2d4b-4686-8b7d-d5dac64637c0",
 404 |    "metadata": {},
 405 |    "outputs": [],
 406 |    "source": [
 407 |     "manifest_file_name=f\"adapters/{adapter_id}/manifest.jsonl\"\n",
 408 |     "output_config_prefix=f\"adapters-output/{adapter_id}/\"\n",
 409 |     "\n",
 410 |     "response = textract_client.create_adapter_version(\n",
 411 |     "    AdapterId=adapter_id,\n",
 412 |     "    DatasetConfig={\n",
 413 |     "        'ManifestS3Object': {\n",
 414 |     "            'Bucket': dataset_s3_bucket,\n",
 415 |     "            'Name': manifest_file_name\n",
 416 |     "        }\n",
 417 |     "    },\n",
 418 |     "    OutputConfig={\n",
 419 |     "        'S3Bucket': output_s3_bucket,\n",
 420 |     "        'S3Prefix': output_config_prefix\n",
 421 |     "    },\n",
 422 |     "    Tags={\n",
 423 |     "        'project': 'checks-automation'\n",
 424 |     "    }\n",
 425 |     ")\n",
 426 |     "\n",
 427 |     "adapter_version = response[\"AdapterVersion\"]\n",
 428 |     "print(f\"Started training AdapterVersion: {adapter_version} for AdapterId: {adapter_id}\")"
 429 |    ]
 430 |   },
 431 |   {
 432 |    "cell_type": "markdown",
 433 |    "id": "6eb56c5f-c3ea-4ce7-b920-29d3c49b7e3c",
 434 |    "metadata": {},
 435 |    "source": [
 436 |     "### Step 2.4: List all the adapter versions in your AWS Account\n",
 437 |     "You will see a new Adapter ID and Version in the list with the Status as \"CREATION_IN_PROGRESS\".  \n",
 438 |     "Training can take 1 to 30 hours to complete; however, because our dataset is small, it should complete in an hour or so."
 439 |    ]
 440 |   },
 441 |   {
 442 |    "cell_type": "code",
 443 |    "execution_count": null,
 444 |    "id": "69734f55-7255-42de-bea0-b2736e95f6d3",
 445 |    "metadata": {
 446 |     "tags": []
 447 |    },
 448 |    "outputs": [],
 449 |    "source": [
 450 |     "response = textract_client.list_adapter_versions()\n",
 451 |     "display(pd.DataFrame(response[\"AdapterVersions\"]))"
 452 |    ]
 453 |   },
 454 |   {
 455 |    "cell_type": "markdown",
 456 |    "id": "ad80a635-9e86-420d-b839-8f8f1c398077",
 457 |    "metadata": {
 458 |     "tags": []
 459 |    },
 460 |    "source": [
 461 |     "### Step 2.5: View details of the adapter you just created using GetAdapterVersion\n",
 462 |     "1. This provides you with all the details for the adapter - from the dataset and output config to the evaluation metrics.  \n",
 463 |     "2. As the adapter creation and training are still in progress, you will not see **\"EvaluationMetrics\"** yet. Come back once the training is complete."
 464 |    ]
 465 |   },
 466 |   {
 467 |    "cell_type": "code",
 468 |    "execution_count": null,
 469 |    "id": "d380157e-3f43-4cea-aa6d-7e29883f2d10",
 470 |    "metadata": {
 471 |     "tags": []
 472 |    },
 473 |    "outputs": [],
 474 |    "source": [
 475 |     "import json\n",
 476 |     "response = textract_client.get_adapter_version(\n",
 477 |     "    AdapterId=adapter_id,\n",
 478 |     "    AdapterVersion=adapter_version\n",
 479 |     ")\n",
 480 |     "print(json.dumps(response, indent=4, default=str))"
 481 |    ]
 482 |   },
 483 |   {
 484 |    "cell_type": "markdown",
 485 |    "id": "79b615e1-4264-4c2b-b443-c0bcdc44be70",
 486 |    "metadata": {},
 487 |    "source": [
 488 |     "### Step 2.6: Test the adapter programmatically (API)"
 489 |    ]
 490 |   },
 491 |   {
 492 |    "cell_type": "code",
 493 |    "execution_count": null,
 494 |    "id": "ba97ad85-cd72-4886-aae7-e81ebf3eeb98",
 495 |    "metadata": {},
 496 |    "outputs": [],
 497 |    "source": [
 498 |     "from IPython.display import Image\n",
 499 |     "\n",
 500 |     "document_name = f\"{adapter_id}/checks-annotations/original_assets/31eb3f65-babd-4410-b9ea-596c7b35989d.jpg\"\n",
 501 |     "Image(filename=document_name) "
 502 |    ]
 503 |   },
 504 |   {
 505 |    "cell_type": "code",
 506 |    "execution_count": null,
 507 |    "id": "72ce03a5-9612-4138-b917-59406d0c66c0",
 508 |    "metadata": {},
 509 |    "outputs": [],
 510 |    "source": [
 511 |     "import boto3\n",
 512 |     "from textractcaller.t_call import call_textract, Textract_Features, Query, QueriesConfig, Adapter, AdaptersConfig\n",
 513 |     "import trp.trp2 as t2\n",
 514 |     "\n",
 515 |     "textract_client = boto3.client('textract')\n",
 516 |     "\n",
 517 |     "def tabulate_query_answers(textract_json):\n",
 518 |     "    d = t2.TDocumentSchema().load(textract_json)\n",
 519 |     "    for page in d.pages:\n",
 520 |     "        query_answers = d.get_query_answers(page=page)\n",
 521 |     "        display(pd.DataFrame(query_answers))\n",
 522 |     "\n",
 523 |     "queries = []\n",
 524 |     "queries.append(Query(text=\"What is the check#?\", alias=\"CHECK_NUMBER\", pages=[\"*\"]))\n",
 525 |     "queries.append(Query(text=\"What is the date?\", alias=\"DATE\", pages=[\"*\"]))\n",
 526 |     "queries.append(Query(text=\"What is the check amount in words?\", alias=\"CHECK_AMOUNT_WORDS\", pages=[\"*\"]))\n",
 527 |     "queries.append(Query(text=\"What is the dollar amount?\", alias=\"DOLLAR_AMOUNT\", pages=[\"*\"]))\n",
 528 |     "queries.append(Query(text=\"Who is the payee?\", alias=\"PAYEE_NAME\", pages=[\"*\"]))\n",
 529 |     "queries.append(Query(text=\"What is the customer account#\", alias=\"ACCOUNT_NUMBER\", pages=[\"*\"]))\n",
 530 |     "queries.append(Query(text=\"what is the payee address?\", alias=\"PAYEE_ADDRESS\", pages=[\"*\"]))\n",
 531 |     "queries.append(Query(text=\"What is the bank routing number?\", alias=\"BANK_ROUTING_NUMBER\", pages=[\"*\"]))\n",
 532 |     "queries.append(Query(text=\"What is the memo\", alias=\"MEMO\", pages=[\"*\"]))\n",
 533 |     "queries.append(Query(text=\"What is the account name/payer/drawer name?\", alias=\"ACCOUNT_NAME\", pages=[\"*\"]))\n",
 534 |     "queries.append(Query(text=\"What is the bank name/drawee name?\", alias=\"BANK_NAME\", pages=[\"*\"]))\n",
 535 |     "queries.append(Query(text=\"What is the MICR line?\", alias=\"MICR_LINE\", pages=[\"*\"]))\n",
 536 |     "\n",
 537 |     "\n",
 538 |     "queries_config = QueriesConfig(queries=queries)"
 539 |    ]
 540 |   },
 541 |   {
 542 |    "cell_type": "code",
 543 |    "execution_count": null,
 544 |    "id": "44d2af5b-57af-4f82-af0b-9efcb4a05767",
 545 |    "metadata": {},
 546 |    "outputs": [],
 547 |    "source": [
 548 |     "print(\"Calling Pre-built Textract Queries\")\n",
 549 |     "\n",
 550 |     "textract_json_prebuilt = call_textract(input_document=document_name,\n",
 551 |     "                  boto3_textract_client=textract_client,\n",
 552 |     "                  features=[Textract_Features.QUERIES],\n",
 553 |     "                  queries_config=queries_config)\n",
 554 |     "\n",
 555 |     "tabulate_query_answers(textract_json_prebuilt)"
 556 |    ]
 557 |   },
 558 |   {
 559 |    "cell_type": "code",
 560 |    "execution_count": null,
 561 |    "id": "217106a3-15e8-4913-a8a9-ddc7307290c7",
 562 |    "metadata": {
 563 |     "tags": []
 564 |    },
 565 |    "outputs": [],
 566 |    "source": [
 567 |     "adapter1 = Adapter(adapter_id=adapter_id, version=adapter_version, pages=[\"*\"])\n",
 568 |     "adapters_config = AdaptersConfig(adapters=[adapter1])\n",
 569 |     "print(f\"Calling Custom Queries with Adapter: {adapter_id} and AdapterVersion: {adapter_version}\")\n",
 570 |     "\n",
 571 |     "textract_json_with_adapter = call_textract(input_document=document_name,\n",
 572 |     "                  boto3_textract_client=textract_client,\n",
 573 |     "                  features=[Textract_Features.QUERIES],\n",
 574 |     "                  queries_config=queries_config,\n",
 575 |     "                  adapters_config=adapters_config)\n",
 576 |     "\n",
 577 |     "tabulate_query_answers(textract_json_with_adapter)"
 578 |    ]
 579 |   },
 580 |   {
 581 |    "cell_type": "markdown",
 582 |    "id": "4e253825-5e92-4ee9-bda3-4cd005247851",
 583 |    "metadata": {},
 584 |    "source": [
 585 |     "### Step 2.7: Clean up resources\n",
 586 |     "You can choose to delete the adapter version or the adapter.  \n",
 587 |     "When deleting the entire adapter, you must delete all adapter versions first and then proceed to delete the adapter.  "
 588 |    ]
 589 |   },
 590 |   {
 591 |    "cell_type": "code",
 592 |    "execution_count": null,
 593 |    "id": "08ae063e-b992-4816-b475-d39a8595dc65",
 594 |    "metadata": {
 595 |     "tags": []
 596 |    },
 597 |    "outputs": [],
 598 |    "source": [
 599 |     "response = textract_client.delete_adapter_version(\n",
 600 |     "    AdapterId=adapter_id,\n",
 601 |     "    AdapterVersion=adapter_version\n",
 602 |     ")\n",
 603 |     "if response[\"ResponseMetadata\"][\"HTTPStatusCode\"] == 200:\n",
 604 |     "    print(f\"Adapter Version: {adapter_version} successfully deleted\")\n",
 605 |     "\n",
 606 |     "\n",
 607 |     "response = textract_client.delete_adapter(\n",
 608 |     "    AdapterId=adapter_id\n",
 609 |     ")\n",
 610 |     "if response[\"ResponseMetadata\"][\"HTTPStatusCode\"] == 200:\n",
 611 |     "    print(f\"Adapter ID: {adapter_id} successfully deleted\")\n"
 612 |    ]
 613 |   }
 614 |  ],
 615 |  "metadata": {
 616 |   "availableInstances": [
 617 |    {
 618 |     "_defaultOrder": 0,
 619 |     "_isFastLaunch": true,
 620 |     "category": "General purpose",
 621 |     "gpuNum": 0,
 622 |     "hideHardwareSpecs": false,
 623 |     "memoryGiB": 4,
 624 |     "name": "ml.t3.medium",
 625 |     "vcpuNum": 2
 626 |    },
 627 |    {
 628 |     "_defaultOrder": 1,
 629 |     "_isFastLaunch": false,
 630 |     "category": "General purpose",
 631 |     "gpuNum": 0,
 632 |     "hideHardwareSpecs": false,
 633 |     "memoryGiB": 8,
 634 |     "name": "ml.t3.large",
 635 |     "vcpuNum": 2
 636 |    },
 637 |    {
 638 |     "_defaultOrder": 2,
 639 |     "_isFastLaunch": false,
 640 |     "category": "General purpose",
 641 |     "gpuNum": 0,
 642 |     "hideHardwareSpecs": false,
 643 |     "memoryGiB": 16,
 644 |     "name": "ml.t3.xlarge",
 645 |     "vcpuNum": 4
 646 |    },
 647 |    {
 648 |     "_defaultOrder": 3,
 649 |     "_isFastLaunch": false,
 650 |     "category": "General purpose",
 651 |     "gpuNum": 0,
 652 |     "hideHardwareSpecs": false,
 653 |     "memoryGiB": 32,
 654 |     "name": "ml.t3.2xlarge",
 655 |     "vcpuNum": 8
 656 |    },
 657 |    {
 658 |     "_defaultOrder": 4,
 659 |     "_isFastLaunch": true,
 660 |     "category": "General purpose",
 661 |     "gpuNum": 0,
 662 |     "hideHardwareSpecs": false,
 663 |     "memoryGiB": 8,
 664 |     "name": "ml.m5.large",
 665 |     "vcpuNum": 2
 666 |    },
 667 |    {
 668 |     "_defaultOrder": 5,
 669 |     "_isFastLaunch": false,
 670 |     "category": "General purpose",
 671 |     "gpuNum": 0,
 672 |     "hideHardwareSpecs": false,
 673 |     "memoryGiB": 16,
 674 |     "name": "ml.m5.xlarge",
 675 |     "vcpuNum": 4
 676 |    },
 677 |    {
 678 |     "_defaultOrder": 6,
 679 |     "_isFastLaunch": false,
 680 |     "category": "General purpose",
 681 |     "gpuNum": 0,
 682 |     "hideHardwareSpecs": false,
 683 |     "memoryGiB": 32,
 684 |     "name": "ml.m5.2xlarge",
 685 |     "vcpuNum": 8
 686 |    },
 687 |    {
 688 |     "_defaultOrder": 7,
 689 |     "_isFastLaunch": false,
 690 |     "category": "General purpose",
 691 |     "gpuNum": 0,
 692 |     "hideHardwareSpecs": false,
 693 |     "memoryGiB": 64,
 694 |     "name": "ml.m5.4xlarge",
 695 |     "vcpuNum": 16
 696 |    },
 697 |    {
 698 |     "_defaultOrder": 8,
 699 |     "_isFastLaunch": false,
 700 |     "category": "General purpose",
 701 |     "gpuNum": 0,
 702 |     "hideHardwareSpecs": false,
 703 |     "memoryGiB": 128,
 704 |     "name": "ml.m5.8xlarge",
 705 |     "vcpuNum": 32
 706 |    },
 707 |    {
 708 |     "_defaultOrder": 9,
 709 |     "_isFastLaunch": false,
 710 |     "category": "General purpose",
 711 |     "gpuNum": 0,
 712 |     "hideHardwareSpecs": false,
 713 |     "memoryGiB": 192,
 714 |     "name": "ml.m5.12xlarge",
 715 |     "vcpuNum": 48
 716 |    },
 717 |    {
 718 |     "_defaultOrder": 10,
 719 |     "_isFastLaunch": false,
 720 |     "category": "General purpose",
 721 |     "gpuNum": 0,
 722 |     "hideHardwareSpecs": false,
 723 |     "memoryGiB": 256,
 724 |     "name": "ml.m5.16xlarge",
 725 |     "vcpuNum": 64
 726 |    },
 727 |    {
 728 |     "_defaultOrder": 11,
 729 |     "_isFastLaunch": false,
 730 |     "category": "General purpose",
 731 |     "gpuNum": 0,
 732 |     "hideHardwareSpecs": false,
 733 |     "memoryGiB": 384,
 734 |     "name": "ml.m5.24xlarge",
 735 |     "vcpuNum": 96
 736 |    },
 737 |    {
 738 |     "_defaultOrder": 12,
 739 |     "_isFastLaunch": false,
 740 |     "category": "General purpose",
 741 |     "gpuNum": 0,
 742 |     "hideHardwareSpecs": false,
 743 |     "memoryGiB": 8,
 744 |     "name": "ml.m5d.large",
 745 |     "vcpuNum": 2
 746 |    },
 747 |    {
 748 |     "_defaultOrder": 13,
 749 |     "_isFastLaunch": false,
 750 |     "category": "General purpose",
 751 |     "gpuNum": 0,
 752 |     "hideHardwareSpecs": false,
 753 |     "memoryGiB": 16,
 754 |     "name": "ml.m5d.xlarge",
 755 |     "vcpuNum": 4
 756 |    },
 757 |    {
 758 |     "_defaultOrder": 14,
 759 |     "_isFastLaunch": false,
 760 |     "category": "General purpose",
 761 |     "gpuNum": 0,
 762 |     "hideHardwareSpecs": false,
 763 |     "memoryGiB": 32,
 764 |     "name": "ml.m5d.2xlarge",
 765 |     "vcpuNum": 8
 766 |    },
 767 |    {
 768 |     "_defaultOrder": 15,
 769 |     "_isFastLaunch": false,
 770 |     "category": "General purpose",
 771 |     "gpuNum": 0,
 772 |     "hideHardwareSpecs": false,
 773 |     "memoryGiB": 64,
 774 |     "name": "ml.m5d.4xlarge",
 775 |     "vcpuNum": 16
 776 |    },
 777 |    {
 778 |     "_defaultOrder": 16,
 779 |     "_isFastLaunch": false,
 780 |     "category": "General purpose",
 781 |     "gpuNum": 0,
 782 |     "hideHardwareSpecs": false,
 783 |     "memoryGiB": 128,
 784 |     "name": "ml.m5d.8xlarge",
 785 |     "vcpuNum": 32
 786 |    },
 787 |    {
 788 |     "_defaultOrder": 17,
 789 |     "_isFastLaunch": false,
 790 |     "category": "General purpose",
 791 |     "gpuNum": 0,
 792 |     "hideHardwareSpecs": false,
 793 |     "memoryGiB": 192,
 794 |     "name": "ml.m5d.12xlarge",
 795 |     "vcpuNum": 48
 796 |    },
 797 |    {
 798 |     "_defaultOrder": 18,
 799 |     "_isFastLaunch": false,
 800 |     "category": "General purpose",
 801 |     "gpuNum": 0,
 802 |     "hideHardwareSpecs": false,
 803 |     "memoryGiB": 256,
 804 |     "name": "ml.m5d.16xlarge",
 805 |     "vcpuNum": 64
 806 |    },
 807 |    {
 808 |     "_defaultOrder": 19,
 809 |     "_isFastLaunch": false,
 810 |     "category": "General purpose",
 811 |     "gpuNum": 0,
 812 |     "hideHardwareSpecs": false,
 813 |     "memoryGiB": 384,
 814 |     "name": "ml.m5d.24xlarge",
 815 |     "vcpuNum": 96
 816 |    },
 817 |    {
 818 |     "_defaultOrder": 20,
 819 |     "_isFastLaunch": false,
 820 |     "category": "General purpose",
 821 |     "gpuNum": 0,
 822 |     "hideHardwareSpecs": true,
 823 |     "memoryGiB": 0,
 824 |     "name": "ml.geospatial.interactive",
 825 |     "supportedImageNames": [
 826 |      "sagemaker-geospatial-v1-0"
 827 |     ],
 828 |     "vcpuNum": 0
 829 |    },
 830 |    {
 831 |     "_defaultOrder": 21,
 832 |     "_isFastLaunch": true,
 833 |     "category": "Compute optimized",
 834 |     "gpuNum": 0,
 835 |     "hideHardwareSpecs": false,
 836 |     "memoryGiB": 4,
 837 |     "name": "ml.c5.large",
 838 |     "vcpuNum": 2
 839 |    },
 840 |    {
 841 |     "_defaultOrder": 22,
 842 |     "_isFastLaunch": false,
 843 |     "category": "Compute optimized",
 844 |     "gpuNum": 0,
 845 |     "hideHardwareSpecs": false,
 846 |     "memoryGiB": 8,
 847 |     "name": "ml.c5.xlarge",
 848 |     "vcpuNum": 4
 849 |    },
 850 |    {
 851 |     "_defaultOrder": 23,
 852 |     "_isFastLaunch": false,
 853 |     "category": "Compute optimized",
 854 |     "gpuNum": 0,
 855 |     "hideHardwareSpecs": false,
 856 |     "memoryGiB": 16,
 857 |     "name": "ml.c5.2xlarge",
 858 |     "vcpuNum": 8
 859 |    },
 860 |    {
 861 |     "_defaultOrder": 24,
 862 |     "_isFastLaunch": false,
 863 |     "category": "Compute optimized",
 864 |     "gpuNum": 0,
 865 |     "hideHardwareSpecs": false,
 866 |     "memoryGiB": 32,
 867 |     "name": "ml.c5.4xlarge",
 868 |     "vcpuNum": 16
 869 |    },
 870 |    {
 871 |     "_defaultOrder": 25,
 872 |     "_isFastLaunch": false,
 873 |     "category": "Compute optimized",
 874 |     "gpuNum": 0,
 875 |     "hideHardwareSpecs": false,
 876 |     "memoryGiB": 72,
 877 |     "name": "ml.c5.9xlarge",
 878 |     "vcpuNum": 36
 879 |    },
 880 |    {
 881 |     "_defaultOrder": 26,
 882 |     "_isFastLaunch": false,
 883 |     "category": "Compute optimized",
 884 |     "gpuNum": 0,
 885 |     "hideHardwareSpecs": false,
 886 |     "memoryGiB": 96,
 887 |     "name": "ml.c5.12xlarge",
 888 |     "vcpuNum": 48
 889 |    },
 890 |    {
 891 |     "_defaultOrder": 27,
 892 |     "_isFastLaunch": false,
 893 |     "category": "Compute optimized",
 894 |     "gpuNum": 0,
 895 |     "hideHardwareSpecs": false,
 896 |     "memoryGiB": 144,
 897 |     "name": "ml.c5.18xlarge",
 898 |     "vcpuNum": 72
 899 |    },
 900 |    {
 901 |     "_defaultOrder": 28,
 902 |     "_isFastLaunch": false,
 903 |     "category": "Compute optimized",
 904 |     "gpuNum": 0,
 905 |     "hideHardwareSpecs": false,
 906 |     "memoryGiB": 192,
 907 |     "name": "ml.c5.24xlarge",
 908 |     "vcpuNum": 96
 909 |    },
 910 |    {
 911 |     "_defaultOrder": 29,
 912 |     "_isFastLaunch": true,
 913 |     "category": "Accelerated computing",
 914 |     "gpuNum": 1,
 915 |     "hideHardwareSpecs": false,
 916 |     "memoryGiB": 16,
 917 |     "name": "ml.g4dn.xlarge",
 918 |     "vcpuNum": 4
 919 |    },
 920 |    {
 921 |     "_defaultOrder": 30,
 922 |     "_isFastLaunch": false,
 923 |     "category": "Accelerated computing",
 924 |     "gpuNum": 1,
 925 |     "hideHardwareSpecs": false,
 926 |     "memoryGiB": 32,
 927 |     "name": "ml.g4dn.2xlarge",
 928 |     "vcpuNum": 8
 929 |    },
 930 |    {
 931 |     "_defaultOrder": 31,
 932 |     "_isFastLaunch": false,
 933 |     "category": "Accelerated computing",
 934 |     "gpuNum": 1,
 935 |     "hideHardwareSpecs": false,
 936 |     "memoryGiB": 64,
 937 |     "name": "ml.g4dn.4xlarge",
 938 |     "vcpuNum": 16
 939 |    },
 940 |    {
 941 |     "_defaultOrder": 32,
 942 |     "_isFastLaunch": false,
 943 |     "category": "Accelerated computing",
 944 |     "gpuNum": 1,
 945 |     "hideHardwareSpecs": false,
 946 |     "memoryGiB": 128,
 947 |     "name": "ml.g4dn.8xlarge",
 948 |     "vcpuNum": 32
 949 |    },
 950 |    {
 951 |     "_defaultOrder": 33,
 952 |     "_isFastLaunch": false,
 953 |     "category": "Accelerated computing",
 954 |     "gpuNum": 4,
 955 |     "hideHardwareSpecs": false,
 956 |     "memoryGiB": 192,
 957 |     "name": "ml.g4dn.12xlarge",
 958 |     "vcpuNum": 48
 959 |    },
 960 |    {
 961 |     "_defaultOrder": 34,
 962 |     "_isFastLaunch": false,
 963 |     "category": "Accelerated computing",
 964 |     "gpuNum": 1,
 965 |     "hideHardwareSpecs": false,
 966 |     "memoryGiB": 256,
 967 |     "name": "ml.g4dn.16xlarge",
 968 |     "vcpuNum": 64
 969 |    },
 970 |    {
 971 |     "_defaultOrder": 35,
 972 |     "_isFastLaunch": false,
 973 |     "category": "Accelerated computing",
 974 |     "gpuNum": 1,
 975 |     "hideHardwareSpecs": false,
 976 |     "memoryGiB": 61,
 977 |     "name": "ml.p3.2xlarge",
 978 |     "vcpuNum": 8
 979 |    },
 980 |    {
 981 |     "_defaultOrder": 36,
 982 |     "_isFastLaunch": false,
 983 |     "category": "Accelerated computing",
 984 |     "gpuNum": 4,
 985 |     "hideHardwareSpecs": false,
 986 |     "memoryGiB": 244,
 987 |     "name": "ml.p3.8xlarge",
 988 |     "vcpuNum": 32
 989 |    },
 990 |    {
 991 |     "_defaultOrder": 37,
 992 |     "_isFastLaunch": false,
 993 |     "category": "Accelerated computing",
 994 |     "gpuNum": 8,
 995 |     "hideHardwareSpecs": false,
 996 |     "memoryGiB": 488,
 997 |     "name": "ml.p3.16xlarge",
 998 |     "vcpuNum": 64
 999 |    },
1000 |    {
1001 |     "_defaultOrder": 38,
1002 |     "_isFastLaunch": false,
1003 |     "category": "Accelerated computing",
1004 |     "gpuNum": 8,
1005 |     "hideHardwareSpecs": false,
1006 |     "memoryGiB": 768,
1007 |     "name": "ml.p3dn.24xlarge",
1008 |     "vcpuNum": 96
1009 |    },
1010 |    {
1011 |     "_defaultOrder": 39,
1012 |     "_isFastLaunch": false,
1013 |     "category": "Memory Optimized",
1014 |     "gpuNum": 0,
1015 |     "hideHardwareSpecs": false,
1016 |     "memoryGiB": 16,
1017 |     "name": "ml.r5.large",
1018 |     "vcpuNum": 2
1019 |    },
1020 |    {
1021 |     "_defaultOrder": 40,
1022 |     "_isFastLaunch": false,
1023 |     "category": "Memory Optimized",
1024 |     "gpuNum": 0,
1025 |     "hideHardwareSpecs": false,
1026 |     "memoryGiB": 32,
1027 |     "name": "ml.r5.xlarge",
1028 |     "vcpuNum": 4
1029 |    },
1030 |    {
1031 |     "_defaultOrder": 41,
1032 |     "_isFastLaunch": false,
1033 |     "category": "Memory Optimized",
1034 |     "gpuNum": 0,
1035 |     "hideHardwareSpecs": false,
1036 |     "memoryGiB": 64,
1037 |     "name": "ml.r5.2xlarge",
1038 |     "vcpuNum": 8
1039 |    },
1040 |    {
1041 |     "_defaultOrder": 42,
1042 |     "_isFastLaunch": false,
1043 |     "category": "Memory Optimized",
1044 |     "gpuNum": 0,
1045 |     "hideHardwareSpecs": false,
1046 |     "memoryGiB": 128,
1047 |     "name": "ml.r5.4xlarge",
1048 |     "vcpuNum": 16
1049 |    },
1050 |    {
1051 |     "_defaultOrder": 43,
1052 |     "_isFastLaunch": false,
1053 |     "category": "Memory Optimized",
1054 |     "gpuNum": 0,
1055 |     "hideHardwareSpecs": false,
1056 |     "memoryGiB": 256,
1057 |     "name": "ml.r5.8xlarge",
1058 |     "vcpuNum": 32
1059 |    },
1060 |    {
1061 |     "_defaultOrder": 44,
1062 |     "_isFastLaunch": false,
1063 |     "category": "Memory Optimized",
1064 |     "gpuNum": 0,
1065 |     "hideHardwareSpecs": false,
1066 |     "memoryGiB": 384,
1067 |     "name": "ml.r5.12xlarge",
1068 |     "vcpuNum": 48
1069 |    },
1070 |    {
1071 |     "_defaultOrder": 45,
1072 |     "_isFastLaunch": false,
1073 |     "category": "Memory Optimized",
1074 |     "gpuNum": 0,
1075 |     "hideHardwareSpecs": false,
1076 |     "memoryGiB": 512,
1077 |     "name": "ml.r5.16xlarge",
1078 |     "vcpuNum": 64
1079 |    },
1080 |    {
1081 |     "_defaultOrder": 46,
1082 |     "_isFastLaunch": false,
1083 |     "category": "Memory Optimized",
1084 |     "gpuNum": 0,
1085 |     "hideHardwareSpecs": false,
1086 |     "memoryGiB": 768,
1087 |     "name": "ml.r5.24xlarge",
1088 |     "vcpuNum": 96
1089 |    },
1090 |    {
1091 |     "_defaultOrder": 47,
1092 |     "_isFastLaunch": false,
1093 |     "category": "Accelerated computing",
1094 |     "gpuNum": 1,
1095 |     "hideHardwareSpecs": false,
1096 |     "memoryGiB": 16,
1097 |     "name": "ml.g5.xlarge",
1098 |     "vcpuNum": 4
1099 |    },
1100 |    {
1101 |     "_defaultOrder": 48,
1102 |     "_isFastLaunch": false,
1103 |     "category": "Accelerated computing",
1104 |     "gpuNum": 1,
1105 |     "hideHardwareSpecs": false,
1106 |     "memoryGiB": 32,
1107 |     "name": "ml.g5.2xlarge",
1108 |     "vcpuNum": 8
1109 |    },
1110 |    {
1111 |     "_defaultOrder": 49,
1112 |     "_isFastLaunch": false,
1113 |     "category": "Accelerated computing",
1114 |     "gpuNum": 1,
1115 |     "hideHardwareSpecs": false,
1116 |     "memoryGiB": 64,
1117 |     "name": "ml.g5.4xlarge",
1118 |     "vcpuNum": 16
1119 |    },
1120 |    {
1121 |     "_defaultOrder": 50,
1122 |     "_isFastLaunch": false,
1123 |     "category": "Accelerated computing",
1124 |     "gpuNum": 1,
1125 |     "hideHardwareSpecs": false,
1126 |     "memoryGiB": 128,
1127 |     "name": "ml.g5.8xlarge",
1128 |     "vcpuNum": 32
1129 |    },
1130 |    {
1131 |     "_defaultOrder": 51,
1132 |     "_isFastLaunch": false,
1133 |     "category": "Accelerated computing",
1134 |     "gpuNum": 1,
1135 |     "hideHardwareSpecs": false,
1136 |     "memoryGiB": 256,
1137 |     "name": "ml.g5.16xlarge",
1138 |     "vcpuNum": 64
1139 |    },
1140 |    {
1141 |     "_defaultOrder": 52,
1142 |     "_isFastLaunch": false,
1143 |     "category": "Accelerated computing",
1144 |     "gpuNum": 4,
1145 |     "hideHardwareSpecs": false,
1146 |     "memoryGiB": 192,
1147 |     "name": "ml.g5.12xlarge",
1148 |     "vcpuNum": 48
1149 |    },
1150 |    {
1151 |     "_defaultOrder": 53,
1152 |     "_isFastLaunch": false,
1153 |     "category": "Accelerated computing",
1154 |     "gpuNum": 4,
1155 |     "hideHardwareSpecs": false,
1156 |     "memoryGiB": 384,
1157 |     "name": "ml.g5.24xlarge",
1158 |     "vcpuNum": 96
1159 |    },
1160 |    {
1161 |     "_defaultOrder": 54,
1162 |     "_isFastLaunch": false,
1163 |     "category": "Accelerated computing",
1164 |     "gpuNum": 8,
1165 |     "hideHardwareSpecs": false,
1166 |     "memoryGiB": 768,
1167 |     "name": "ml.g5.48xlarge",
1168 |     "vcpuNum": 192
1169 |    },
1170 |    {
1171 |     "_defaultOrder": 55,
1172 |     "_isFastLaunch": false,
1173 |     "category": "Accelerated computing",
1174 |     "gpuNum": 8,
1175 |     "hideHardwareSpecs": false,
1176 |     "memoryGiB": 1152,
1177 |     "name": "ml.p4d.24xlarge",
1178 |     "vcpuNum": 96
1179 |    },
1180 |    {
1181 |     "_defaultOrder": 56,
1182 |     "_isFastLaunch": false,
1183 |     "category": "Accelerated computing",
1184 |     "gpuNum": 8,
1185 |     "hideHardwareSpecs": false,
1186 |     "memoryGiB": 1152,
1187 |     "name": "ml.p4de.24xlarge",
1188 |     "vcpuNum": 96
1189 |    }
1190 |   ],
1191 |   "instance_type": "ml.t3.medium",
1192 |   "kernelspec": {
1193 |    "display_name": "Python 3 (Data Science 3.0)",
1194 |    "language": "python",
1195 |    "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/sagemaker-data-science-310-v1"
1196 |   },
1197 |   "language_info": {
1198 |    "codemirror_mode": {
1199 |     "name": "ipython",
1200 |     "version": 3
1201 |    },
1202 |    "file_extension": ".py",
1203 |    "mimetype": "text/x-python",
1204 |    "name": "python",
1205 |    "nbconvert_exporter": "python",
1206 |    "pygments_lexer": "ipython3",
1207 |    "version": "3.10.6"
1208 |   }
1209 |  },
1210 |  "nbformat": 4,
1211 |  "nbformat_minor": 5
1212 | }
1213 | 


--------------------------------------------------------------------------------
/python/custom-queries/samples/checks-annotations.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/samples/checks-annotations.zip


--------------------------------------------------------------------------------
/python/custom-queries/samples/checks-samples.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/samples/checks-samples.zip


--------------------------------------------------------------------------------
/python/custom-queries/screenshots/checks-notebook-step1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/screenshots/checks-notebook-step1.png


--------------------------------------------------------------------------------
/python/custom-queries/screenshots/checks-notebook-step2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/screenshots/checks-notebook-step2.png


--------------------------------------------------------------------------------
/python/custom-queries/screenshots/checks-notebook-step5_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/screenshots/checks-notebook-step5_1.png


--------------------------------------------------------------------------------
/python/custom-queries/screenshots/checks-notebook-step6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/screenshots/checks-notebook-step6.png


--------------------------------------------------------------------------------
/python/custom-queries/screenshots/checks-notebook-step7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/screenshots/checks-notebook-step7.png


--------------------------------------------------------------------------------
/python/custom-queries/screenshots/checks-notebook-step8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/screenshots/checks-notebook-step8.png


--------------------------------------------------------------------------------
/python/employmentapp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/employmentapp.png


--------------------------------------------------------------------------------
/python/expense.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/expense.png


--------------------------------------------------------------------------------
/python/extraction-parsers/samples/CMS1500-sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/extraction-parsers/samples/CMS1500-sample.png


--------------------------------------------------------------------------------
/python/extraction-parsers/samples/ub-04-Form-sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/extraction-parsers/samples/ub-04-Form-sample.png


--------------------------------------------------------------------------------
/python/extraction-parsers/ub04-parser.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Parser for UB04 or CMS-1450 Form\n",
  8 |     "This notebook will walk you through sample code to parse the UB04 or CMS-1450 Form.  \n",
  9 |     "The CMS-1450 form (aka UB-04) is used by institutional providers to bill a Medicare fiscal intermediary when a provider qualifies for a waiver from the Administrative Simplification Compliance Act requirement for electronic submission of claims.\n"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 43,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "!python -m pip install amazon-textract-caller --upgrade\n",
 19 |     "!python -m pip install amazon-textract-response-parser --upgrade"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": null,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "import boto3, json\n",
 29 |     "from textractcaller.t_call import call_textract, Textract_Features, Query, QueriesConfig, Adapter, AdaptersConfig\n",
 30 |     "\n",
 31 |     "import pandas as pd\n",
 32 |     "import trp\n",
 33 |     "from trp import Document\n",
 34 |     "import trp.trp2 as t2\n",
 35 |     "from trp.trp2 import TDocument, TDocumentSchema, TBlock, TGeometry, TBoundingBox, TPoint\n",
 36 |     "from trp.t_pipeline import order_blocks_by_geo_x_y\n",
 37 |     "from textractprettyprinter.t_pretty_print import Pretty_Print_Table_Format, Textract_Pretty_Print, get_forms_string, convert_table_to_kv_dict, convert_table_to_list\n",
 38 |     "\n",
 39 |     "\n",
 40 |     "\n",
 41 |     "session = boto3.Session(profile_name='kmascar+training-Admin')\n",
 42 |     "textract = boto3.client('textract')\n",
 43 |     "\n",
 44 |     "textract_json = call_textract(input_document=\"samples/ub-04-Form-sample.png\", features = [Textract_Features.FORMS, Textract_Features.TABLES], boto3_textract_client=textract)\n",
 45 |     "print(json.dumps(textract_json, indent=2))\n"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "markdown",
 50 |    "metadata": {},
 51 |    "source": [
 52 |     "### Analyzing the UB04 Textract JSON Response: Order of elements\n",
 53 |     "On analyzing the structured JSON output, you will notice that the blocks in the response are not in reading order. To order them correctly, we will use the `order_blocks_by_geo_x_y` function."
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 56,
 59 |    "metadata": {},
 60 |    "outputs": [
 61 |     {
 62 |      "name": "stdout",
 63 |      "output_type": "stream",
 64 |      "text": [
 65 |       "|----------------------------------|---------------|\n",
 66 |       "| Key                              | Value         |\n",
 67 |       "| 1                                |               |\n",
 68 |       "| 2                                |               |\n",
 69 |       "| 3a PAT. CNTL #                   |               |\n",
 70 |       "| 4 TYPE OF BILL                   |               |\n",
 71 |       "| b. MED. REC. #                   |               |\n",
 72 |       "| 7                                |               |\n",
 73 |       "| 5 FED. TAX NO.                   |               |\n",
 74 |       "| FROM                             |               |\n",
 75 |       "| THROUGH                          |               |\n",
 76 |       "| a                                |               |\n",
 77 |       "| a                                |               |\n",
 78 |       "| b                                |               |\n",
 79 |       "| b                                |               |\n",
 80 |       "| C                                |               |\n",
 81 |       "| d                                |               |\n",
 82 |       "| e                                |               |\n",
 83 |       "| 29 ACDT STATE                    |               |\n",
 84 |       "| 30                               |               |\n",
 85 |       "| 10 BIRTHDATE                     |               |\n",
 86 |       "| 11 SEX                           |               |\n",
 87 |       "| 16 DHR                           |               |\n",
 88 |       "| 17 STAT                          |               |\n",
 89 |       "| 12 DATE                          |               |\n",
 90 |       "| 13 HR                            |               |\n",
 91 |       "| 14 TYPE                          |               |\n",
 92 |       "| 15 SRC                           |               |\n",
 93 |       "| 18                               | NOT_SELECTED  |\n",
 94 |       "| 19                               |               |\n",
 95 |       "| 20                               |               |\n",
 96 |       "| 21                               |               |\n",
 97 |       "| 22                               |               |\n",
 98 |       "| 23                               |               |\n",
 99 |       "| 24                               |               |\n",
100 |       "| 25                               |               |\n",
101 |       "| 26                               |               |\n",
102 |       "| 27                               |               |\n",
103 |       "| 28                               |               |\n",
104 |       "| 31 CODE                          | 04 05         |\n",
105 |       "| OCCURRENCE DATE                  | 111111 222222 |\n",
106 |       "| 32 CODE                          | 06 07         |\n",
107 |       "| OCCURRENCE DATE                  | 333333 444444 |\n",
108 |       "| 33 CODE                          |               |\n",
109 |       "| OCCURRENCE DATE                  |               |\n",
110 |       "| CODE                             |               |\n",
111 |       "| OCCURRENCE DATE                  |               |\n",
112 |       "| 35 CODE                          |               |\n",
113 |       "| OCCURRENCE FROM                  |               |\n",
114 |       "| SPAN THROUGH                     |               |\n",
115 |       "| 36 CODE                          |               |\n",
116 |       "| OCCURRENCE FROM                  |               |\n",
117 |       "| SPAN THROUGH                     |               |\n",
118 |       "| 37                               |               |\n",
119 |       "| 38                               |               |\n",
120 |       "| 39 CODE                          |               |\n",
121 |       "| VALUE CODES AMOUNT               |               |\n",
122 |       "| CODE                             |               |\n",
123 |       "| VALUE CODES AMOUNT               |               |\n",
124 |       "| 41 CODE                          |               |\n",
125 |       "| VALUE CODES AMOUNT               |               |\n",
126 |       "| 42 REV. CD.                      |               |\n",
127 |       "| 43 DESCRIPTION                   |               |\n",
128 |       "| 44 HCPCS RATE HIPPS CODE         |               |\n",
129 |       "| 45 SERV. DATE                    |               |\n",
130 |       "| 46 SERV. UNITS                   |               |\n",
131 |       "| 47 TOTAL CHARGES                 |               |\n",
132 |       "| 48 NON-COVERED CHARGES           |               |\n",
133 |       "| 49                               |               |\n",
134 |       "| PAGE                             |               |\n",
135 |       "| OF                               |               |\n",
136 |       "| CREATION DATE                    |               |\n",
137 |       "| TOTALS                           |               |\n",
138 |       "| 50 PAYER NAME                    |               |\n",
139 |       "| 51 HEALTH PLAN ID                |               |\n",
140 |       "| 52 REL INFO                      |               |\n",
141 |       "| 54 PRIOR PAYMENTS                |               |\n",
142 |       "| 55 EST. AMOUNT DUE               |               |\n",
143 |       "| 56 NPI                           |               |\n",
144 |       "| 57                               |               |\n",
145 |       "| OTHER                            |               |\n",
146 |       "| PRV ID                           |               |\n",
147 |       "| 58 INSURED'S NAME                |               |\n",
148 |       "| 59 P.REL                         |               |\n",
149 |       "| 60 INSURED'S UNIQUE ID           |               |\n",
150 |       "| 61 GROUP NAME                    |               |\n",
151 |       "| 62 INSURANCE GROUP NO.           |               |\n",
152 |       "| 63 TREATMENT AUTHORIZATION CODES |               |\n",
153 |       "| 64 DOCUMENT CONTROL NUMBER       |               |\n",
154 |       "| 65 EMPLOYER NAME                 |               |\n",
155 |       "| 66 DX                            | 67 A          |\n",
156 |       "| 68                               |               |\n",
157 |       "| 69 ADMIT DX                      |               |\n",
158 |       "| 70 PATIENT REASON DX             | NOT_SELECTED  |\n",
159 |       "| 71 PPS CODE                      |               |\n",
160 |       "| 72 ECI                           | NOT_SELECTED  |\n",
161 |       "| 73                               |               |\n",
162 |       "| 75                               |               |\n",
163 |       "| 76 ATTENDING NPI                 |               |\n",
164 |       "| QUAL                             |               |\n",
165 |       "| CODE                             |               |\n",
166 |       "| DATE                             |               |\n",
167 |       "| CODE                             |               |\n",
168 |       "| DATE                             |               |\n",
169 |       "| CODE                             |               |\n",
170 |       "| DATE                             |               |\n",
171 |       "| LAST                             |               |\n",
172 |       "| FIRST                            |               |\n",
173 |       "| 77 OPERATING NPI                 |               |\n",
174 |       "| QUAL                             |               |\n",
175 |       "| CODE                             |               |\n",
176 |       "| DATE                             |               |\n",
177 |       "| CODE                             |               |\n",
178 |       "| DATE                             |               |\n",
179 |       "| CODE                             |               |\n",
180 |       "| DATE                             |               |\n",
181 |       "| LAST                             |               |\n",
182 |       "| FIRST                            |               |\n",
183 |       "| 80 REMARKS                       |               |\n",
184 |       "| 81CC a                           |               |\n",
185 |       "| 78 OTHER                         |               |\n",
186 |       "| NPI                              |               |\n",
187 |       "| QUAL                             |               |\n",
188 |       "| b                                |               |\n",
189 |       "| LAST                             |               |\n",
190 |       "| FIRST                            |               |\n",
191 |       "| C                                |               |\n",
192 |       "| 79 OTHER                         |               |\n",
193 |       "| NPI                              |               |\n",
194 |       "| QUAL                             |               |\n",
195 |       "| d                                |               |\n",
196 |       "| LAST                             |               |\n",
197 |       "| FIRST                            |               |\n",
198 |       "\n",
199 |       "\n"
200 |      ]
201 |     }
202 |    ],
203 |    "source": [
204 |     "t_doc = TDocumentSchema().load(textract_json)\n",
205 |     "ordered_doc = order_blocks_by_geo_x_y(t_doc)\n",
206 |     "print(get_forms_string(TDocumentSchema().dump(ordered_doc)))\n",
207 |     "\n",
208 |     "trp_doc = trp.Document(TDocumentSchema().dump(ordered_doc))\n",
209 |     "\n"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "markdown",
214 |    "metadata": {},
215 |    "source": [
216 |     "### Analyzing the UB04 Textract JSON Response: Complexity\n",
217 |     "The UB04 form is complex, with many identical keys, which makes it difficult to differentiate between them. Additionally, `8 PATIENT NAME` and `9 PATIENT ADDRESS` both contain identically named sub-fields (such as `a` and `b`) that we would like to map back to their respective sections.\n",
218 |     "\n",
219 |     "**Utility Functions:**  \n",
220 |     "We will now walk through utility functions that use our code repositories for Textract GeoFinder, Textract Pretty Printer and Textract Response Parser to parse hierarchical key values that are adjacent to each other or located within a given area."
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": null,
226 |    "metadata": {},
227 |    "outputs": [],
228 |    "source": [
229 |     "!python -m pip install amazon-textract-geofinder --upgrade\n",
230 |     "!python -m pip install amazon-textract-prettyprinter --upgrade"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": 57,
236 |    "metadata": {},
237 |    "outputs": [],
238 |    "source": [
239 |     "from textractgeofinder.tgeofinder import KeyValue, TGeoFinder, AreaSelection, SelectionElement\n",
240 |     "from enum import Enum, auto\n",
241 |     "from typing import List\n",
242 |     "\n",
243 |     "\n",
244 |     "def set_hierarchy_kv(list_kv: list[KeyValue], t_document: TDocument, page_block: TBlock, prefix: str = \"DEFAULT\"):\n",
245 |     "    for x in list_kv:\n",
246 |     "        # print(f\"{x.key.original_text} updated to {prefix}_{x.key.original_text}\")\n",
247 |     "        t_document.add_virtual_key_for_existing_key(key_name=f\"{prefix}_{x.key.original_text}\",\n",
248 |     "                                                    existing_key=t_document.get_block_by_id(x.key.id),\n",
249 |     "                                                    page_block=page_block)\n",
250 |     "\n",
251 |     "def set_adjacent_hkv(geofinder_doc: TGeoFinder, t_document: TDocument, phrase: str, number_of_keys:int=1, direction: str = 'RIGHT', prefix = None):\n",
252 |     "    list_phrase_tword = geofinder_doc.find_phrase_on_page(phrase)\n",
253 |     "    for phrase_tword in list_phrase_tword:\n",
254 |     "        # print(phrase_tword)\n",
255 |     "        if direction == 'RIGHT':\n",
256 |     "            form_fields = geofinder_doc.get_form_fields_to_the_right(word = phrase_tword, xmax = 1000, number_of_keys = number_of_keys)\n",
257 |     "        elif direction == 'BELOW':\n",
258 |     "            form_fields = geofinder_doc.get_form_fields_below(word = phrase_tword, ymax = 1000, number_of_keys = number_of_keys)\n",
259 |     "        prefix = phrase if prefix is None else prefix\n",
260 |     "        # print(f\"set_adjacent_hkv, phrasess: {phrase_tword}, form_fields:{form_fields}\")\n",
261 |     "        set_hierarchy_kv(list_kv=form_fields, t_document=t_document, prefix=prefix, page_block=t_document.pages[0])\n",
262 |     "\n",
263 |     "class Area_Constraint(Enum):\n",
264 |     "    WIDTH_PAGE_MIN = auto()\n",
265 |     "    WIDTH_PAGE_MAX = auto()\n",
266 |     "    HEIGHT_PAGE_MIN = auto()\n",
267 |     "    HEIGHT_PAGE_MAX = auto()\n",
268 |     "    INCLUDE_TOP_LEFT_PHRASE = auto()\n",
269 |     "    INCLUDE_LOWER_RIGHT_PHRASE = auto()\n",
270 |     "        \n",
271 |     "def set_area_hkv(geofinder_doc: TGeoFinder, t_document: TDocument, top_left_phrase: str, lower_right_phrase: str, area_constraint: List[Area_Constraint]=list(), prefix: str=None):\n",
272 |     "    top_left_phrase_tword = geofinder_doc.find_phrase_on_page(top_left_phrase)[0]\n",
273 |     "    lower_right_phrase_tword = geofinder_doc.find_phrase_on_page(lower_right_phrase)[0]\n",
274 |     "\n",
275 |     "    top_left_coord = dict()\n",
276 |     "    lower_right_coord = dict()\n",
277 |     "    if area_constraint:\n",
278 |     "        if Area_Constraint.WIDTH_PAGE_MIN in area_constraint:\n",
279 |     "            top_left_coord[\"x\"] = 0\n",
280 |     "        if Area_Constraint.HEIGHT_PAGE_MIN in area_constraint:\n",
281 |     "            top_left_coord[\"y\"] = 0\n",
282 |     "        if Area_Constraint.WIDTH_PAGE_MAX in area_constraint:\n",
283 |     "            lower_right_coord[\"x\"] = geofinder_doc.doc_width\n",
284 |     "        if Area_Constraint.HEIGHT_PAGE_MAX in area_constraint:\n",
285 |     "            lower_right_coord[\"y\"] = geofinder_doc.doc_height\n",
286 |     "        if Area_Constraint.INCLUDE_TOP_LEFT_PHRASE in area_constraint:\n",
287 |     "            if \"x\" not in top_left_coord:\n",
288 |     "                top_left_coord[\"x\"] = top_left_phrase_tword.xmin\n",
289 |     "            if \"y\" not in top_left_coord:\n",
290 |     "                top_left_coord[\"y\"] = top_left_phrase_tword.ymin\n",
291 |     "        if Area_Constraint.INCLUDE_LOWER_RIGHT_PHRASE in area_constraint:\n",
292 |     "            if \"x\" not in lower_right_coord:\n",
293 |     "                lower_right_coord[\"x\"] = lower_right_phrase_tword.xmax\n",
294 |     "            if \"y\" not in lower_right_coord:\n",
295 |     "                lower_right_coord[\"y\"] = lower_right_phrase_tword.ymax\n",
296 |     "\n",
297 |     "    top_left_coord.setdefault(\"x\", top_left_phrase_tword.xmax)\n",
298 |     "    top_left_coord.setdefault(\"y\", top_left_phrase_tword.ymax)\n",
299 |     "    lower_right_coord.setdefault(\"x\", lower_right_phrase_tword.xmin)\n",
300 |     "    lower_right_coord.setdefault(\"y\", lower_right_phrase_tword.ymin)\n",
301 |     "\n",
302 |     "    top_left = TPoint(y=top_left_coord[\"y\"], x=top_left_coord[\"x\"])\n",
303 |     "    lower_right = TPoint(y=lower_right_coord[\"y\"], x=lower_right_coord[\"x\"])\n",
304 |     "\n",
305 |     "    form_fields = geofinder_doc.get_form_fields_in_area(\n",
306 |     "                    area_selection=AreaSelection(top_left=top_left, lower_right=lower_right, page_number = 1))\n",
307 |     "    prefix = top_left_phrase if prefix is None else prefix\n",
308 |     "    # print(f\"set_area_hkv, phrases: {top_left_phrase_tword}, {lower_right_phrase_tword}, form_fields:{form_fields}\")\n",
309 |     "    set_hierarchy_kv(list_kv=form_fields, t_document=t_document, prefix=prefix, page_block=t_document.pages[0])\n",
310 |     "\n",
311 |     "def get_cell_with_text(geofinder_doc: TGeoFinder, t_document: TDocument, phrase: str):\n",
312 |     "    list_phrase_tword = geofinder_doc.find_phrase_on_page(phrase)\n",
313 |     "    # print(list_phrase_tword)\n",
314 |     "    for phrase_tword in list_phrase_tword:\n",
315 |     "        # print(\"calling table cells\")\n",
316 |     "        table_cells = geofinder_doc.get_cells_with_text(word = phrase_tword, number_of_cells = 1)\n",
317 |     "    # print(\"column_index:\",t_document.get_block_by_id(table_cells[0].id).column_index)\n",
318 |     "    \n",
319 |     "    # geofinder_doc.get_exact_table(id = \"588328c2-0ed5-44d0-b35d-849b90dfb226\")\n",
320 |     "    return table_cells[0].id\n",
321 |     "\n",
322 |     "def convert_table_to_key_value(geofinder_doc: TGeoFinder, t_document: TDocument, phrase: str):\n",
323 |     "    cell_ids = get_cell_with_text(geofinder_doc=geofinder_doc, t_document=t_document, phrase=phrase)\n",
324 |     "    table_kv_dict = dict()\n",
325 |     "    trp_doc = trp.Document(TDocumentSchema().dump(t_doc))\n",
326 |     "    for page in trp_doc.pages:\n",
327 |     "        for table in page.tables:\n",
328 |     "            for r, row in enumerate(table.rows):\n",
329 |     "                for c, cell in enumerate(row.cells):\n",
330 |     "                    if cell.id in cell_ids:\n",
331 |     "                        table_kv_dict = convert_table_to_kv_dict(table, ignore_table_summary=True)\n",
332 |     "                        print(json.dumps(table_kv_dict, indent=2))\n",
333 |     "    return table_kv_dict\n"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "markdown",
338 |    "metadata": {},
339 |    "source": [
340 |     "### Writing an opinionated parser for UB04\n",
341 |     "We will now write an opinionated function `parse_ub04` that uses the appropriate utility function defined above for each field and extracts the output."
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": 59,
347 |    "metadata": {},
348 |    "outputs": [],
349 |    "source": [
350 |     "def parse_ub04(textract_json):\n",
351 |     "    t_document = TDocumentSchema().load(textract_json)\n",
352 |     "    doc_height = 1000\n",
353 |     "    doc_width = 1000\n",
354 |     "    geofinder_doc = TGeoFinder(textract_json, doc_height=doc_height, doc_width=doc_width)\n",
355 |     "\n",
356 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"3a PAT CNTL\", number_of_keys=1, direction=\"BELOW\", prefix=\"3\")\n",
357 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"6 STATEMENT COVERS PERIOD\", number_of_keys=2, direction=\"BELOW\")\n",
358 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"8 PATIENT NAME\")\n",
359 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"8 PATIENT NAME\", direction=\"BELOW\")\n",
360 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"9 PATIENT ADDRESS\")\n",
361 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"9 PATIENT ADDRESS\", direction=\"BELOW\")\n",
362 |     "\n",
363 |     "    area_constraint = [Area_Constraint.INCLUDE_TOP_LEFT_PHRASE, Area_Constraint.WIDTH_PAGE_MAX]\n",
364 |     "    set_area_hkv(geofinder_doc=geofinder_doc, t_document=t_document, top_left_phrase=\"9 PATIENT ADDRESS\", lower_right_phrase=\"29 ACDT\", area_constraint=area_constraint)\n",
365 |     "\n",
366 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"31 CODE\", prefix=\"31\")\n",
367 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"32 CODE\", prefix=\"32\")\n",
368 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"33 CODE\", prefix=\"33\")\n",
369 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"34 CODE\", prefix=\"34\")\n",
370 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"35 CODE\", number_of_keys=2, prefix=\"35\")\n",
371 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"36 CODE\", number_of_keys=2, prefix=\"36\")\n",
372 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"39 CODE\", prefix=\"39\")\n",
373 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"40 CODE\", prefix=\"40\")\n",
374 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"41 CODE\", prefix=\"41\")\n",
375 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"56 NPI 57\", number_of_keys=2, direction=\"BELOW\", prefix=57)\n",
376 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"74 PRINCIPAL PROCEDURE\", number_of_keys=2, direction=\"BELOW\", prefix=\"74 PRINCIPAL\")\n",
377 |     "\n",
378 |     "    set_area_hkv(geofinder_doc=geofinder_doc, t_document=t_document, top_left_phrase=\"74 PRINCIPAL PROCEDURE\", lower_right_phrase=\"77 OPERATING\", area_constraint=None, prefix=\"74ab\")\n",
379 |     "\n",
380 |     "    area_constraint = [Area_Constraint.WIDTH_PAGE_MIN]\n",
381 |     "    set_area_hkv(geofinder_doc=geofinder_doc, t_document=t_document, top_left_phrase=\"76 ATTENDING\", lower_right_phrase=\"78 OTHER\", area_constraint=area_constraint, prefix=\"74cde\")\n",
382 |     "\n",
383 |     "    area_constraint = [Area_Constraint.INCLUDE_TOP_LEFT_PHRASE, Area_Constraint.WIDTH_PAGE_MAX]\n",
384 |     "    set_area_hkv(geofinder_doc=geofinder_doc, t_document=t_document, top_left_phrase=\"76 ATTENDING\", lower_right_phrase=\"77 OPERATING\", area_constraint=area_constraint, prefix=\"76\")\n",
385 |     "    set_area_hkv(geofinder_doc=geofinder_doc, t_document=t_document, top_left_phrase=\"77 OPERATING\", lower_right_phrase=\"78 OTHER\", area_constraint=area_constraint, prefix=\"77\")\n",
386 |     "    set_area_hkv(geofinder_doc=geofinder_doc, t_document=t_document, top_left_phrase=\"78 OTHER\", lower_right_phrase=\"79 OTHER\", area_constraint=area_constraint, prefix=\"78\")\n",
387 |     "\n",
388 |     "    area_constraint = [Area_Constraint.INCLUDE_TOP_LEFT_PHRASE, Area_Constraint.WIDTH_PAGE_MAX, Area_Constraint.HEIGHT_PAGE_MAX]\n",
389 |     "    set_area_hkv(geofinder_doc=geofinder_doc, t_document=t_document, top_left_phrase=\"79 OTHER\", lower_right_phrase=\"LAST\", area_constraint=area_constraint, prefix=\"79\")\n",
390 |     "\n",
391 |     "    set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"81 CC\", number_of_keys=3, direction=\"BELOW\")\n",
392 |     "\n",
393 |     "\n",
394 |     "    convert_table_to_key_value(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"REV CD\")\n",
395 |     "    convert_table_to_key_value(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"66 DX\")\n",
396 |     "\n",
397 |     "    return order_blocks_by_geo_x_y(t_document)\n"
398 |    ]
399 |   },
400 |   {
401 |    "cell_type": "markdown",
402 |    "metadata": {},
403 |    "source": [
404 |     "### Calling the post-processing parser\n",
405 |     "Let's call the UB04 parser function and analyze the response."
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "code",
410 |    "execution_count": 60,
411 |    "metadata": {},
412 |    "outputs": [
413 |     {
414 |      "name": "stdout",
415 |      "output_type": "stream",
416 |      "text": [
417 |       "get_cells_with_text: found keys: [TWord(text='42 rev. cd. ', original_text='42 REV. CD. ', text_type='cell', confidence=92.431640625, id='98468515-9121-4f8f-9e28-eacc209de55f', xmin=17, ymin=259, xmax=72, ymax=274, page_number=1, doc_width=1000, doc_height=1000, child_relationships='', reference=None, resolver=None)]\n",
418 |       "[\n",
419 |       "  {\n",
420 |       "    \"0\": \"\",\n",
421 |       "    \"42 REV. CD. \": \"\",\n",
422 |       "    \"43 DESCRIPTION \": \"\",\n",
423 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
424 |       "    \"45 SERV. DATE \": \"\",\n",
425 |       "    \"46 SERV. UNITS \": \"\",\n",
426 |       "    \"47 TOTAL CHARGES \": \"\",\n",
427 |       "    \"\": \"\",\n",
428 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
429 |       "    \"49 \": \"\"\n",
430 |       "  },\n",
431 |       "  {\n",
432 |       "    \"0\": \"\",\n",
433 |       "    \"42 REV. CD. \": \"\",\n",
434 |       "    \"43 DESCRIPTION \": \"\",\n",
435 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
436 |       "    \"45 SERV. DATE \": \"\",\n",
437 |       "    \"46 SERV. UNITS \": \"\",\n",
438 |       "    \"47 TOTAL CHARGES \": \"\",\n",
439 |       "    \"\": \"\",\n",
440 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
441 |       "    \"49 \": \"\"\n",
442 |       "  },\n",
443 |       "  {\n",
444 |       "    \"0\": \"\",\n",
445 |       "    \"42 REV. CD. \": \"\",\n",
446 |       "    \"43 DESCRIPTION \": \"\",\n",
447 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
448 |       "    \"45 SERV. DATE \": \"\",\n",
449 |       "    \"46 SERV. UNITS \": \"\",\n",
450 |       "    \"47 TOTAL CHARGES \": \"\",\n",
451 |       "    \"\": \"\",\n",
452 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
453 |       "    \"49 \": \"\"\n",
454 |       "  },\n",
455 |       "  {\n",
456 |       "    \"0\": \"\",\n",
457 |       "    \"42 REV. CD. \": \"\",\n",
458 |       "    \"43 DESCRIPTION \": \"\",\n",
459 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
460 |       "    \"45 SERV. DATE \": \"\",\n",
461 |       "    \"46 SERV. UNITS \": \"\",\n",
462 |       "    \"47 TOTAL CHARGES \": \"\",\n",
463 |       "    \"\": \"\",\n",
464 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
465 |       "    \"49 \": \"\"\n",
466 |       "  },\n",
467 |       "  {\n",
468 |       "    \"0\": \"\",\n",
469 |       "    \"42 REV. CD. \": \"\",\n",
470 |       "    \"43 DESCRIPTION \": \"\",\n",
471 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
472 |       "    \"45 SERV. DATE \": \"\",\n",
473 |       "    \"46 SERV. UNITS \": \"\",\n",
474 |       "    \"47 TOTAL CHARGES \": \"\",\n",
475 |       "    \"\": \"\",\n",
476 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
477 |       "    \"49 \": \"\"\n",
478 |       "  },\n",
479 |       "  {\n",
480 |       "    \"0\": \"\",\n",
481 |       "    \"42 REV. CD. \": \"\",\n",
482 |       "    \"43 DESCRIPTION \": \"\",\n",
483 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
484 |       "    \"45 SERV. DATE \": \"\",\n",
485 |       "    \"46 SERV. UNITS \": \"\",\n",
486 |       "    \"47 TOTAL CHARGES \": \"\",\n",
487 |       "    \"\": \"\",\n",
488 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
489 |       "    \"49 \": \"\"\n",
490 |       "  },\n",
491 |       "  {\n",
492 |       "    \"0\": \"\",\n",
493 |       "    \"42 REV. CD. \": \"\",\n",
494 |       "    \"43 DESCRIPTION \": \"\",\n",
495 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
496 |       "    \"45 SERV. DATE \": \"\",\n",
497 |       "    \"46 SERV. UNITS \": \"\",\n",
498 |       "    \"47 TOTAL CHARGES \": \"\",\n",
499 |       "    \"\": \"\",\n",
500 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
501 |       "    \"49 \": \"\"\n",
502 |       "  },\n",
503 |       "  {\n",
504 |       "    \"0\": \"\",\n",
505 |       "    \"42 REV. CD. \": \"\",\n",
506 |       "    \"43 DESCRIPTION \": \"\",\n",
507 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
508 |       "    \"45 SERV. DATE \": \"\",\n",
509 |       "    \"46 SERV. UNITS \": \"\",\n",
510 |       "    \"47 TOTAL CHARGES \": \"\",\n",
511 |       "    \"\": \"\",\n",
512 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
513 |       "    \"49 \": \"\"\n",
514 |       "  },\n",
515 |       "  {\n",
516 |       "    \"0\": \"\",\n",
517 |       "    \"42 REV. CD. \": \"\",\n",
518 |       "    \"43 DESCRIPTION \": \"\",\n",
519 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
520 |       "    \"45 SERV. DATE \": \"\",\n",
521 |       "    \"46 SERV. UNITS \": \"\",\n",
522 |       "    \"47 TOTAL CHARGES \": \"\",\n",
523 |       "    \"\": \"\",\n",
524 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
525 |       "    \"49 \": \"\"\n",
526 |       "  },\n",
527 |       "  {\n",
528 |       "    \"0\": \"\",\n",
529 |       "    \"42 REV. CD. \": \"\",\n",
530 |       "    \"43 DESCRIPTION \": \"\",\n",
531 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
532 |       "    \"45 SERV. DATE \": \"\",\n",
533 |       "    \"46 SERV. UNITS \": \"\",\n",
534 |       "    \"47 TOTAL CHARGES \": \"\",\n",
535 |       "    \"\": \"\",\n",
536 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
537 |       "    \"49 \": \"\"\n",
538 |       "  },\n",
539 |       "  {\n",
540 |       "    \"0\": \"\",\n",
541 |       "    \"42 REV. CD. \": \"\",\n",
542 |       "    \"43 DESCRIPTION \": \"\",\n",
543 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
544 |       "    \"45 SERV. DATE \": \"\",\n",
545 |       "    \"46 SERV. UNITS \": \"\",\n",
546 |       "    \"47 TOTAL CHARGES \": \"\",\n",
547 |       "    \"\": \"\",\n",
548 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
549 |       "    \"49 \": \"\"\n",
550 |       "  },\n",
551 |       "  {\n",
552 |       "    \"0\": \"\",\n",
553 |       "    \"42 REV. CD. \": \"\",\n",
554 |       "    \"43 DESCRIPTION \": \"\",\n",
555 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
556 |       "    \"45 SERV. DATE \": \"\",\n",
557 |       "    \"46 SERV. UNITS \": \"\",\n",
558 |       "    \"47 TOTAL CHARGES \": \"\",\n",
559 |       "    \"\": \"\",\n",
560 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
561 |       "    \"49 \": \"\"\n",
562 |       "  },\n",
563 |       "  {\n",
564 |       "    \"0\": \"\",\n",
565 |       "    \"42 REV. CD. \": \"\",\n",
566 |       "    \"43 DESCRIPTION \": \"\",\n",
567 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
568 |       "    \"45 SERV. DATE \": \"\",\n",
569 |       "    \"46 SERV. UNITS \": \"\",\n",
570 |       "    \"47 TOTAL CHARGES \": \"\",\n",
571 |       "    \"\": \"\",\n",
572 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
573 |       "    \"49 \": \"\"\n",
574 |       "  },\n",
575 |       "  {\n",
576 |       "    \"0\": \"\",\n",
577 |       "    \"42 REV. CD. \": \"\",\n",
578 |       "    \"43 DESCRIPTION \": \"\",\n",
579 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
580 |       "    \"45 SERV. DATE \": \"\",\n",
581 |       "    \"46 SERV. UNITS \": \"\",\n",
582 |       "    \"47 TOTAL CHARGES \": \"\",\n",
583 |       "    \"\": \"\",\n",
584 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
585 |       "    \"49 \": \"\"\n",
586 |       "  },\n",
587 |       "  {\n",
588 |       "    \"0\": \"\",\n",
589 |       "    \"42 REV. CD. \": \"\",\n",
590 |       "    \"43 DESCRIPTION \": \"\",\n",
591 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
592 |       "    \"45 SERV. DATE \": \"\",\n",
593 |       "    \"46 SERV. UNITS \": \"\",\n",
594 |       "    \"47 TOTAL CHARGES \": \"\",\n",
595 |       "    \"\": \"\",\n",
596 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
597 |       "    \"49 \": \"\"\n",
598 |       "  },\n",
599 |       "  {\n",
600 |       "    \"0\": \"\",\n",
601 |       "    \"42 REV. CD. \": \"\",\n",
602 |       "    \"43 DESCRIPTION \": \"\",\n",
603 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
604 |       "    \"45 SERV. DATE \": \"\",\n",
605 |       "    \"46 SERV. UNITS \": \"\",\n",
606 |       "    \"47 TOTAL CHARGES \": \"\",\n",
607 |       "    \"\": \"\",\n",
608 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
609 |       "    \"49 \": \"\"\n",
610 |       "  },\n",
611 |       "  {\n",
612 |       "    \"0\": \"\",\n",
613 |       "    \"42 REV. CD. \": \"\",\n",
614 |       "    \"43 DESCRIPTION \": \"\",\n",
615 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
616 |       "    \"45 SERV. DATE \": \"\",\n",
617 |       "    \"46 SERV. UNITS \": \"\",\n",
618 |       "    \"47 TOTAL CHARGES \": \"\",\n",
619 |       "    \"\": \"\",\n",
620 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
621 |       "    \"49 \": \"\"\n",
622 |       "  },\n",
623 |       "  {\n",
624 |       "    \"0\": \"\",\n",
625 |       "    \"42 REV. CD. \": \"\",\n",
626 |       "    \"43 DESCRIPTION \": \"\",\n",
627 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
628 |       "    \"45 SERV. DATE \": \"\",\n",
629 |       "    \"46 SERV. UNITS \": \"\",\n",
630 |       "    \"47 TOTAL CHARGES \": \"\",\n",
631 |       "    \"\": \"\",\n",
632 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
633 |       "    \"49 \": \"\"\n",
634 |       "  },\n",
635 |       "  {\n",
636 |       "    \"0\": \"\",\n",
637 |       "    \"42 REV. CD. \": \"\",\n",
638 |       "    \"43 DESCRIPTION \": \"\",\n",
639 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
640 |       "    \"45 SERV. DATE \": \"\",\n",
641 |       "    \"46 SERV. UNITS \": \"\",\n",
642 |       "    \"47 TOTAL CHARGES \": \"\",\n",
643 |       "    \"\": \"\",\n",
644 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
645 |       "    \"49 \": \"\"\n",
646 |       "  },\n",
647 |       "  {\n",
648 |       "    \"0\": \"\",\n",
649 |       "    \"42 REV. CD. \": \"\",\n",
650 |       "    \"43 DESCRIPTION \": \"\",\n",
651 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
652 |       "    \"45 SERV. DATE \": \"\",\n",
653 |       "    \"46 SERV. UNITS \": \"\",\n",
654 |       "    \"47 TOTAL CHARGES \": \"\",\n",
655 |       "    \"\": \"\",\n",
656 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
657 |       "    \"49 \": \"\"\n",
658 |       "  },\n",
659 |       "  {\n",
660 |       "    \"0\": \"\",\n",
661 |       "    \"42 REV. CD. \": \"\",\n",
662 |       "    \"43 DESCRIPTION \": \"\",\n",
663 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
664 |       "    \"45 SERV. DATE \": \"\",\n",
665 |       "    \"46 SERV. UNITS \": \"\",\n",
666 |       "    \"47 TOTAL CHARGES \": \"\",\n",
667 |       "    \"\": \"\",\n",
668 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
669 |       "    \"49 \": \"\"\n",
670 |       "  },\n",
671 |       "  {\n",
672 |       "    \"0\": \"\",\n",
673 |       "    \"42 REV. CD. \": \"\",\n",
674 |       "    \"43 DESCRIPTION \": \"\",\n",
675 |       "    \"44 HCPCS RATE HIPPS CODE \": \"\",\n",
676 |       "    \"45 SERV. DATE \": \"\",\n",
677 |       "    \"46 SERV. UNITS \": \"\",\n",
678 |       "    \"47 TOTAL CHARGES \": \"\",\n",
679 |       "    \"\": \"\",\n",
680 |       "    \"48 NON-COVERED CHARGES \": \"\",\n",
681 |       "    \"49 \": \"\"\n",
682 |       "  }\n",
683 |       "]\n",
684 |       "get_cells_with_text: found keys: [TWord(text='66 dx ', original_text='66 DX ', text_type='cell', confidence=77.44140625, id='79ca5a64-1891-4a04-80f4-4ba1f48cd1e2', xmin=17, ymin=804, xmax=30, ymax=819, page_number=1, doc_width=1000, doc_height=1000, child_relationships='', reference=None, resolver=None)]\n",
685 |       "[\n",
686 |       "  {\n",
687 |       "    \"0\": \"66 DX \",\n",
688 |       "    \"1\": \"67 \",\n",
689 |       "    \"2\": \"\",\n",
690 |       "    \"3\": \"A \",\n",
691 |       "    \"4\": \"\",\n",
692 |       "    \"5\": \"\",\n",
693 |       "    \"6\": \"\",\n",
694 |       "    \"7\": \"NOT_SELECTED, \",\n",
695 |       "    \"8\": \"\",\n",
696 |       "    \"9\": \"\",\n",
697 |       "    \"10\": \"\",\n",
698 |       "    \"11\": \"\",\n",
699 |       "    \"12\": \"\",\n",
700 |       "    \"13\": \"\",\n",
701 |       "    \"14\": \"\",\n",
702 |       "    \"15\": \"NOT_SELECTED, \",\n",
703 |       "    \"16\": \"\",\n",
704 |       "    \"17\": \"NOT_SELECTED, \",\n",
705 |       "    \"18\": \"\",\n",
706 |       "    \"19\": \"68 \"\n",
707 |       "  },\n",
708 |       "  {\n",
709 |       "    \"0\": \"\",\n",
710 |       "    \"1\": \"\",\n",
711 |       "    \"2\": \"\",\n",
712 |       "    \"3\": \"\",\n",
713 |       "    \"4\": \"\",\n",
714 |       "    \"5\": \"\",\n",
715 |       "    \"6\": \"\",\n",
716 |       "    \"7\": \"\",\n",
717 |       "    \"8\": \"\",\n",
718 |       "    \"9\": \"\",\n",
719 |       "    \"10\": \"\",\n",
720 |       "    \"11\": \"NOT_SELECTED, \",\n",
721 |       "    \"12\": \"\",\n",
722 |       "    \"13\": \"NOT_SELECTED, \",\n",
723 |       "    \"14\": \"\",\n",
724 |       "    \"15\": \"\",\n",
725 |       "    \"16\": \"\",\n",
726 |       "    \"17\": \"NOT_SELECTED, \",\n",
727 |       "    \"18\": \"\",\n",
728 |       "    \"19\": \"\"\n",
729 |       "  }\n",
730 |       "]\n",
731 |       "|-----------------------------------|---------------|\n",
732 |       "| Key                               | Value         |\n",
733 |       "| 9 PATIENT ADDRESS_a               |               |\n",
734 |       "| 76_FIRST                          |               |\n",
735 |       "| 81 CC_C                           |               |\n",
736 |       "| 77_FIRST                          |               |\n",
737 |       "| 36_OCCURRENCE FROM                |               |\n",
738 |       "| 78_78 OTHER                       |               |\n",
739 |       "| 74cde_DATE                        |               |\n",
740 |       "| 74 PRINCIPAL_DATE                 |               |\n",
741 |       "| 76_LAST                           |               |\n",
742 |       "| 9 PATIENT ADDRESS_a               |               |\n",
743 |       "| 78_QUAL                           |               |\n",
744 |       "| 57_PRV ID                         |               |\n",
745 |       "| 35_SPAN THROUGH                   |               |\n",
746 |       "| 74ab_DATE                         |               |\n",
747 |       "| 9 PATIENT ADDRESS_b               |               |\n",
748 |       "| 31_32 CODE                        | 06 07         |\n",
749 |       "| 79_FIRST                          |               |\n",
750 |       "| 77_77 OPERATING NPI               |               |\n",
751 |       "| 77_QUAL                           |               |\n",
752 |       "| 78_FIRST                          |               |\n",
753 |       "| 57_OTHER                          |               |\n",
754 |       "| 74 PRINCIPAL_CODE                 |               |\n",
755 |       "| 8 PATIENT NAME_b                  |               |\n",
756 |       "| 39_VALUE CODES AMOUNT             |               |\n",
757 |       "| 81 CC_b                           |               |\n",
758 |       "| 74cde_DATE                        |               |\n",
759 |       "| 9 PATIENT ADDRESS_b               |               |\n",
760 |       "| 41_VALUE CODES AMOUNT             |               |\n",
761 |       "| 79_NPI                            |               |\n",
762 |       "| 79_LAST                           |               |\n",
763 |       "| 77_LAST                           |               |\n",
764 |       "| 78_LAST                           |               |\n",
765 |       "| 9 PATIENT ADDRESS_e               |               |\n",
766 |       "| 74ab_CODE                         |               |\n",
767 |       "| 74cde_CODE                        |               |\n",
768 |       "| 36_SPAN THROUGH                   |               |\n",
769 |       "| 33_OCCURRENCE DATE                |               |\n",
770 |       "| 34_OCCURRENCE DATE                |               |\n",
771 |       "| 3_b. MED. REC. #                  |               |\n",
772 |       "| 76_QUAL                           |               |\n",
773 |       "| 76_76 ATTENDING NPI               |               |\n",
774 |       "| 79_QUAL                           |               |\n",
775 |       "| 8 PATIENT NAME_a                  |               |\n",
776 |       "| 74ab_CODE                         |               |\n",
777 |       "| 40_VALUE CODES AMOUNT             |               |\n",
778 |       "| 79_79 OTHER                       |               |\n",
779 |       "| 74cde_DATE                        |               |\n",
780 |       "| 35_OCCURRENCE FROM                |               |\n",
781 |       "| 74cde_CODE                        |               |\n",
782 |       "| 6 STATEMENT COVERS PERIOD_FROM    |               |\n",
783 |       "| 9 PATIENT ADDRESS_d               |               |\n",
784 |       "| 81 CC_d                           |               |\n",
785 |       "| 6 STATEMENT COVERS PERIOD_THROUGH |               |\n",
786 |       "| 74cde_CODE                        |               |\n",
787 |       "| 78_NPI                            |               |\n",
788 |       "| 9 PATIENT ADDRESS_C               |               |\n",
789 |       "| 74ab_DATE                         |               |\n",
790 |       "| 32_OCCURRENCE DATE                | 333333 444444 |\n",
791 |       "| 1                                 |               |\n",
792 |       "| 2                                 |               |\n",
793 |       "| 3a PAT. CNTL #                    |               |\n",
794 |       "| 4 TYPE OF BILL                    |               |\n",
795 |       "| b. MED. REC. #                    |               |\n",
796 |       "| 7                                 |               |\n",
797 |       "| 5 FED. TAX NO.                    |               |\n",
798 |       "| FROM                              |               |\n",
799 |       "| THROUGH                           |               |\n",
800 |       "| a                                 |               |\n",
801 |       "| a                                 |               |\n",
802 |       "| b                                 |               |\n",
803 |       "| b                                 |               |\n",
804 |       "| C                                 |               |\n",
805 |       "| d                                 |               |\n",
806 |       "| e                                 |               |\n",
807 |       "| 29 ACDT STATE                     |               |\n",
808 |       "| 30                                |               |\n",
809 |       "| 10 BIRTHDATE                      |               |\n",
810 |       "| 11 SEX                            |               |\n",
811 |       "| 16 DHR                            |               |\n",
812 |       "| 17 STAT                           |               |\n",
813 |       "| 12 DATE                           |               |\n",
814 |       "| 13 HR                             |               |\n",
815 |       "| 14 TYPE                           |               |\n",
816 |       "| 15 SRC                            |               |\n",
817 |       "| 18                                | NOT_SELECTED  |\n",
818 |       "| 19                                |               |\n",
819 |       "| 20                                |               |\n",
820 |       "| 21                                |               |\n",
821 |       "| 22                                |               |\n",
822 |       "| 23                                |               |\n",
823 |       "| 24                                |               |\n",
824 |       "| 25                                |               |\n",
825 |       "| 26                                |               |\n",
826 |       "| 27                                |               |\n",
827 |       "| 28                                |               |\n",
828 |       "| 31 CODE                           | 04 05         |\n",
829 |       "| OCCURRENCE DATE                   | 111111 222222 |\n",
830 |       "| 32 CODE                           | 06 07         |\n",
831 |       "| OCCURRENCE DATE                   | 333333 444444 |\n",
832 |       "| 33 CODE                           |               |\n",
833 |       "| OCCURRENCE DATE                   |               |\n",
834 |       "| CODE                              |               |\n",
835 |       "| OCCURRENCE DATE                   |               |\n",
836 |       "| 35 CODE                           |               |\n",
837 |       "| OCCURRENCE FROM                   |               |\n",
838 |       "| SPAN THROUGH                      |               |\n",
839 |       "| 36 CODE                           |               |\n",
840 |       "| OCCURRENCE FROM                   |               |\n",
841 |       "| SPAN THROUGH                      |               |\n",
842 |       "| 37                                |               |\n",
843 |       "| 38                                |               |\n",
844 |       "| 39 CODE                           |               |\n",
845 |       "| VALUE CODES AMOUNT                |               |\n",
846 |       "| CODE                              |               |\n",
847 |       "| VALUE CODES AMOUNT                |               |\n",
848 |       "| 41 CODE                           |               |\n",
849 |       "| VALUE CODES AMOUNT                |               |\n",
850 |       "| 42 REV. CD.                       |               |\n",
851 |       "| 43 DESCRIPTION                    |               |\n",
852 |       "| 44 HCPCS RATE HIPPS CODE          |               |\n",
853 |       "| 45 SERV. DATE                     |               |\n",
854 |       "| 46 SERV. UNITS                    |               |\n",
855 |       "| 47 TOTAL CHARGES                  |               |\n",
856 |       "| 48 NON-COVERED CHARGES            |               |\n",
857 |       "| 49                                |               |\n",
858 |       "| PAGE                              |               |\n",
859 |       "| OF                                |               |\n",
860 |       "| CREATION DATE                     |               |\n",
861 |       "| TOTALS                            |               |\n",
862 |       "| 50 PAYER NAME                     |               |\n",
863 |       "| 51 HEALTH PLAN ID                 |               |\n",
864 |       "| 52 REL INFO                       |               |\n",
865 |       "| 54 PRIOR PAYMENTS                 |               |\n",
866 |       "| 55 EST. AMOUNT DUE                |               |\n",
867 |       "| 56 NPI                            |               |\n",
868 |       "| 57                                |               |\n",
869 |       "| OTHER                             |               |\n",
870 |       "| PRV ID                            |               |\n",
871 |       "| 58 INSURED'S NAME                 |               |\n",
872 |       "| 59 P.REL                          |               |\n",
873 |       "| 60 INSURED'S UNIQUE ID            |               |\n",
874 |       "| 61 GROUP NAME                     |               |\n",
875 |       "| 62 INSURANCE GROUP NO.            |               |\n",
876 |       "| 63 TREATMENT AUTHORIZATION CODES  |               |\n",
877 |       "| 64 DOCUMENT CONTROL NUMBER        |               |\n",
878 |       "| 65 EMPLOYER NAME                  |               |\n",
879 |       "| 66 DX                             | 67 A          |\n",
880 |       "| 68                                |               |\n",
881 |       "| 69 ADMIT DX                       |               |\n",
882 |       "| 70 PATIENT REASON DX              | NOT_SELECTED  |\n",
883 |       "| 71 PPS CODE                       |               |\n",
884 |       "| 72 ECI                            | NOT_SELECTED  |\n",
885 |       "| 73                                |               |\n",
886 |       "| 75                                |               |\n",
887 |       "| 76 ATTENDING NPI                  |               |\n",
888 |       "| QUAL                              |               |\n",
889 |       "| CODE                              |               |\n",
890 |       "| DATE                              |               |\n",
891 |       "| CODE                              |               |\n",
892 |       "| DATE                              |               |\n",
893 |       "| CODE                              |               |\n",
894 |       "| DATE                              |               |\n",
895 |       "| LAST                              |               |\n",
896 |       "| FIRST                             |               |\n",
897 |       "| 77 OPERATING NPI                  |               |\n",
898 |       "| QUAL                              |               |\n",
899 |       "| CODE                              |               |\n",
900 |       "| DATE                              |               |\n",
901 |       "| CODE                              |               |\n",
902 |       "| DATE                              |               |\n",
903 |       "| CODE                              |               |\n",
904 |       "| DATE                              |               |\n",
905 |       "| LAST                              |               |\n",
906 |       "| FIRST                             |               |\n",
907 |       "| 80 REMARKS                        |               |\n",
908 |       "| 81CC a                            |               |\n",
909 |       "| 78 OTHER                          |               |\n",
910 |       "| NPI                               |               |\n",
911 |       "| QUAL                              |               |\n",
912 |       "| b                                 |               |\n",
913 |       "| LAST                              |               |\n",
914 |       "| FIRST                             |               |\n",
915 |       "| C                                 |               |\n",
916 |       "| 79 OTHER                          |               |\n",
917 |       "| NPI                               |               |\n",
918 |       "| QUAL                              |               |\n",
919 |       "| d                                 |               |\n",
920 |       "| LAST                              |               |\n",
921 |       "| FIRST                             |               |\n",
922 |       "\n",
923 |       "\n"
924 |      ]
925 |     }
926 |    ],
927 |    "source": [
928 |     "t_doc = TDocumentSchema().load(textract_json)\n",
929 |     "ordered_doc = order_blocks_by_geo_x_y(t_doc)\n",
930 |     "trp_doc = trp.Document(TDocumentSchema().dump(ordered_doc))\n",
931 |     "\n",
932 |     "final_t_document = parse_ub04(textract_json)\n",
933 |     "\n",
934 |     "print(get_forms_string(TDocumentSchema().dump(final_t_document)))\n"
935 |    ]
936 |   }
937 |  ],
938 |  "metadata": {
939 |   "kernelspec": {
940 |    "display_name": "trp-test",
941 |    "language": "python",
942 |    "name": "python3"
943 |   },
944 |   "language_info": {
945 |    "codemirror_mode": {
946 |     "name": "ipython",
947 |     "version": 3
948 |    },
949 |    "file_extension": ".py",
950 |    "mimetype": "text/x-python",
951 |    "name": "python",
952 |    "nbconvert_exporter": "python",
953 |    "pygments_lexer": "ipython3",
954 |    "version": "3.9.6"
955 |   }
956 |  },
957 |  "nbformat": 4,
958 |  "nbformat_minor": 4
959 | }
960 | 


--------------------------------------------------------------------------------
/python/medical-notes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/medical-notes.png


--------------------------------------------------------------------------------
/python/patient_intake_form_sample.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/patient_intake_form_sample.jpg


--------------------------------------------------------------------------------
/python/queries/insurance-card.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/queries/insurance-card.png


--------------------------------------------------------------------------------
/python/queries/mortgage-note.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/queries/mortgage-note.jpg


--------------------------------------------------------------------------------
/python/queries/paystub-questions_full.csv:
--------------------------------------------------------------------------------
 1 | What is the Pay Period Start Date?,PAYSTUB_START_DATE
 2 | What is the Pay Period End Date?,PAYSTUB_END_DATE
 3 | What is the Pay Date?,PAYSTUB_PAY_DATE
 4 | What is the Employee Name?,PAYSTUB_EMPLOYEE_NAME
 5 | What is the Employee Address?,PAYSTUB_EMPLOYEE_ADDRESS
 6 | What is the Company Name?,PAYSTUB_EMPLOYER_NAME
 7 | What is the Company Address?,PAYSTUB_EMPLOYER_ADDRESS
 8 | What is the Federal Filing Status?,PAYSTUB_FEDERAL_FILING
 9 | What is the State Filing Status?,PAYSTUB_STATE_FILING
10 | What is the Current Gross Pay?,PAYSTUB_CURRENT_GROSS
11 | What is the YTD Gross Pay?,PAYSTUB_YTD_GROSS
12 | What is the Current Net Pay?,PAYSTUB_CURRENT_NET
13 | What is the YTD Net Pay?,PAYSTUB_YTD_NET
14 | What are the warnings?,PAYSTUB_WARNINGS
15 | What are the Messages?,PAYSTUB_MESSAGES
16 | What are the Notes?,PAYSTUB_NOTES
17 | What are the Contact?,PAYSTUB_CONTACT
18 | what is the regular hourly rate?,PAYSTUB_REGULAR_HOURS_RATE
19 | what is the holiday hourly rate?,PAYSTUB_HOLIDAY_HOURS_RATE
20 | What is the YTD - Child Support?,PAYSTUB_YTD_CHILD_SUPPORT
21 | What is the YTD - Garnishments?,PAYSTUB_YTD_GARNISHMENT
22 | What is the current - Child Support?,PAYSTUB_CURRENT_CHILD_SUPPORT
23 | What is the current - Garnishments?,PAYSTUB_CURRENT_GARNISHMENT
24 | What is the current regular pay?,PAYSTUB_REGULAR_PAY
25 | What is the YTD regular pay?,PAYSTUB_YTD_PAY
26 | 


--------------------------------------------------------------------------------
/python/queries/paystub-questions_subset.csv:
--------------------------------------------------------------------------------
 1 | What is the Pay Period Start Date?,PAYSTUB_START_DATE
 2 | What is the Pay Period End Date?,PAYSTUB_END_DATE
 3 | What is the Pay Date?,PAYSTUB_PAY_DATE
 4 | What is the Employee Name?,PAYSTUB_EMPLOYEE_NAME
 5 | What is the Employee Address?,PAYSTUB_EMPLOYEE_ADDRESS
 6 | What is the Company Name?,PAYSTUB_EMPLOYER_NAME
 7 | What is the Company Address?,PAYSTUB_EMPLOYER_ADDRESS
 8 | What is the Federal Filing Status?,PAYSTUB_FEDERAL_FILING
 9 | What is the State Filing Status?,PAYSTUB_STATE_FILING
10 | What is the Current Gross Pay?,PAYSTUB_CURRENT_GROSS
11 | What is the YTD Gross Pay?,PAYSTUB_YTD_GROSS
12 | What is the Current Net Pay?,PAYSTUB_CURRENT_NET
13 | What is the YTD Net Pay?,PAYSTUB_YTD_NET
14 | What are the warnings?,PAYSTUB_WARNINGS
15 | What are the Messages?,PAYSTUB_MESSAGES
16 | 


--------------------------------------------------------------------------------
/python/queries/paystub.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/queries/paystub.jpg


--------------------------------------------------------------------------------
/python/queries/vaccination.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/queries/vaccination.png


--------------------------------------------------------------------------------
/python/simple-document-image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/simple-document-image.jpg


--------------------------------------------------------------------------------
/python/two-column-image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/two-column-image.jpg


--------------------------------------------------------------------------------
/python/verification-of-employment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/verification-of-employment.png


--------------------------------------------------------------------------------
/src-csharp/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | bin
3 | obj
4 | 
5 | 


--------------------------------------------------------------------------------
/src-csharp/ArgHandlers/DetectTextHandler.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using Dotnet_Core.Services;
 3 | 
 4 | namespace Dotnet_Core.ArgHandlers {
 5 | 	/// <summary>
 6 | 	/// Handler that runs text detection on a file stored on the local disk.
 7 | 	/// </summary>
 8 | 	internal class DetectTextHandler {
 9 | 		private readonly TextractTextDetectionService textractTextService;
10 | 
11 | 		public DetectTextHandler(TextractTextDetectionService textractTextService) {
12 | 			this.textractTextService = textractTextService;
13 | 		}
14 | 
15 | 		/// <summary>
16 | 		/// Detects the text in <paramref name="localFile"/> and hands the response to the service's Print method.
17 | 		/// </summary>
18 | 		internal void Handle(string localFile) {
19 | 			// Reading Result blocks until the task has completed, exactly like Wait() followed by Result.
20 | 			var detection = textractTextService.DetectTextLocal(localFile).Result;
21 | 			textractTextService.Print(detection);
22 | 		}
23 | 	}
24 | }


--------------------------------------------------------------------------------
/src-csharp/ArgHandlers/DetectTextS3Handler.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using Dotnet_Core.Services;
 3 | 
 4 | namespace Dotnet_Core.ArgHandlers {
 5 | 	/// <summary>
 6 | 	/// Handler that runs text detection on an object stored in an S3 bucket.
 7 | 	/// </summary>
 8 | 	internal class DetectTextS3Handler {
 9 | 		private readonly TextractTextDetectionService textractTextService;
10 | 
11 | 		public DetectTextS3Handler(TextractTextDetectionService textractTextService) {
12 | 			this.textractTextService = textractTextService;
13 | 		}
14 | 
15 | 		/// <summary>
16 | 		/// Detects the text of <paramref name="s3File"/> in <paramref name="bucketName"/> and hands the
17 | 		/// response to the service's Print method.
18 | 		/// </summary>
19 | 		internal void Handle(string bucketName, string s3File) {
20 | 			// Reading Result blocks until the task has completed, exactly like Wait() followed by Result.
21 | 			var detection = textractTextService.DetectTextS3(bucketName, s3File).Result;
22 | 			textractTextService.Print(detection);
23 | 		}
24 | 	}
25 | }


--------------------------------------------------------------------------------
/src-csharp/ArgHandlers/FormsHandler.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using Amazon.Textract.Model;
 3 | using Dotnet_Core.Services;
 4 | 
 5 | namespace Dotnet_Core.ArgHandlers {
 6 | 	/// <summary>
 7 | 	/// Handler that starts a "FORMS" document analysis for an S3 object and prints the form fields of every page.
 8 | 	/// </summary>
 9 | 	internal class FormsHandler {
10 | 		private readonly TextractTextAnalysisService textractAnalysisService;
11 | 		public FormsHandler(TextractTextAnalysisService textractAnalysisService) {
12 | 			this.textractAnalysisService = textractAnalysisService;
13 | 		}
14 | 
15 | 		/// <summary>
16 | 		/// Analyses <paramref name="formFile"/> in <paramref name="bucketName"/>, waits for the job to finish,
17 | 		/// prints every field of every page and then looks up the field keyed "Phone Number:" on each page.
18 | 		/// </summary>
19 | 		internal void Handle(string bucketName, string formFile) {
20 | 			var task = textractAnalysisService.StartDocumentAnalysis(bucketName, formFile, "FORMS");
21 | 			var jobId = task.Result;
22 | 			textractAnalysisService.WaitForJobCompletion(jobId);
23 | 			var results = textractAnalysisService.GetJobResults(jobId);
24 | 			var document = new TextractDocument(results);
25 | 			document.Pages.ForEach(page => {
26 | 				page.Form.Fields.ForEach(f => {
27 | 					// Same "Key: ..., Value: ..." layout as the lookup printout below (the colon was missing here).
28 | 					Console.WriteLine("Key: {0}, Value: {1}", f.Key, f.Value);
29 | 				});
30 | 				Console.WriteLine("Get Field by Key:");
31 | 				var key = "Phone Number:";
32 | 				var field = page.Form.GetFieldByKey(key);
33 | 				// Pages without that key print nothing for the lookup.
34 | 				if(field != null) {
35 | 					Console.WriteLine("Key: {0}, Value: {1}", field.Key, field.Value);
36 | 				}
37 | 			});
38 | 		}
39 | 	}
40 | }


--------------------------------------------------------------------------------
/src-csharp/ArgHandlers/FormsRedactionHandler.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using System.Drawing;
 3 | using System.IO;
 4 | using Amazon.Textract.Model;
 5 | using Dotnet_Core.Services;
 6 | 
 7 | namespace Dotnet_Core.ArgHandlers {
 8 | 	/// <summary>
 9 | 	/// Handler that blacks out the values of form fields whose key contains "address" on a local copy of the image.
10 | 	/// </summary>
11 | 	internal class FormsRedactionHandler {
12 | 		private readonly TextractTextAnalysisService textractAnalysisService;
13 | 
14 | 		public FormsRedactionHandler(TextractTextAnalysisService textractAnalysisService) {
15 | 			this.textractAnalysisService = textractAnalysisService;
16 | 		}
17 | 
18 | 		/// <summary>
19 | 		/// Runs a "FORMS" analysis of <paramref name="formFile"/> in <paramref name="bucketName"/>, then paints a
20 | 		/// black rectangle over every matching field value and saves the picture as "redacted-" + formFile inside
21 | 		/// <paramref name="localFolder"/>. <paramref name="localFile"/> is the local image that is copied and drawn on.
22 | 		/// </summary>
23 | 		internal void Handle(string bucketName, string formFile, string localFolder, string localFile) {
24 | 			var task = textractAnalysisService.StartDocumentAnalysis(bucketName, formFile, "FORMS");
25 | 			var jobId = task.Result;
26 | 			textractAnalysisService.WaitForJobCompletion(jobId);
27 | 			var results = textractAnalysisService.GetJobResults(jobId);
28 | 
29 | 			var redactableImage = Path.Join(localFolder, "redacted-" + formFile);
30 | 			if(File.Exists(redactableImage))
31 | 				File.Delete(redactableImage);
32 | 			// The copy guarantees an output file exists even when no field is redacted.
33 | 			File.Copy(localFile, redactableImage);
34 | 
35 | 			// Image.FromFile keeps the opened file locked until the image is disposed, so the pixels are loaded
36 | 			// from the source file and only ever written to the redacted copy.
37 | 			using(var image = Image.FromFile(localFile))
38 | 			using(var graphics = Graphics.FromImage(image))
39 | 			using(var brush = new SolidBrush(Color.Black)) {
40 | 				var height = image.Height;
41 | 				var width = image.Width;
42 | 				Console.WriteLine("image dimensions: {0}x{1}", width, height);
43 | 
44 | 				var document = new TextractDocument(results);
45 | 				document.Pages.ForEach(page => {
46 | 					page.Form.Fields.ForEach(field => {
47 | 						// Defensive: skip a field that lacks a key or a value instead of dereferencing null.
48 | 						if(field.Key == null || field.Value == null) {
49 | 							return;
50 | 						}
51 | 						if(field.Key.Text.ToLower().Contains("address")) {
52 | 							Console.WriteLine("Redacting Key: {0}, Value: {1}", field.Key.Text, field.Value.Text);
53 | 							var bb = field.Value.Geometry.BoundingBox;
54 | 							Console.WriteLine(bb);
55 | 							// Bounding-box numbers are scaled by the pixel size; x2/y2 are passed to FillRectangle
56 | 							// as the rectangle's width and height, padded by 2 pixels.
57 | 							var x1 = bb.Left * width;
58 | 							var y1 = bb.Top * height - 2;
59 | 							var x2 = bb.Width * width + 2;
60 | 							var y2 = bb.Height * height + 2;
61 | 
62 | 							Console.WriteLine("x1: {0}, x2: {1}, y1: {2}, y2: {3}", x1, x2, y1, y2);
63 | 							graphics.FillRectangle(brush, x1, y1, x2, y2);
64 | 							// Graphics.Save() only snapshots drawing state; Flush() is what completes pending drawing.
65 | 							graphics.Flush();
66 | 							image.Save(redactableImage);
67 | 							Console.WriteLine("redacted image saved at: {0}", redactableImage);
68 | 						}
69 | 					});
70 | 				});
71 | 			}
72 | 		}
73 | 	}
74 | }


--------------------------------------------------------------------------------
/src-csharp/ArgHandlers/NlpComprehendHandler.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using Dotnet_Core.Services;
 3 | 
 4 | namespace Dotnet_Core.ArgHandlers {
 5 | 	/// <summary>
 6 | 	/// Handler that detects the text of a local file and prints the sentiment and entities reported for it.
 7 | 	/// </summary>
 8 | 	internal class NlpComprehendHandler {
 9 | 		private readonly TextractTextDetectionService textractTextService;
10 | 		private readonly ComprehendService comprehendService;
11 | 
12 | 		public NlpComprehendHandler(TextractTextDetectionService textractTextService, ComprehendService comprehendService) {
13 | 			this.textractTextService = textractTextService;
14 | 			this.comprehendService = comprehendService;
15 | 		}
16 | 
17 | 		/// <summary>
18 | 		/// Extracts the lines of <paramref name="localFile"/> and runs sentiment and entity detection on the
19 | 		/// combined text, passing "en" as the language code.
20 | 		/// </summary>
21 | 		internal void Handle(string localFile) {
22 | 			var localTask = textractTextService.DetectTextLocal(localFile);
23 | 			localTask.Wait();
24 | 			var result = localTask.Result;
25 | 			var lineItems = textractTextService.GetLines(result);
26 | 			// Separate the lines with a space: joining on "" fused the last word of one line with the first
27 | 			// word of the next. The text is built once and shared by both calls.
28 | 			var text = string.Join(" ", lineItems);
29 | 			var detectSentimentTask = comprehendService.DetectSentiment("en", text);
30 | 			detectSentimentTask.Wait();
31 | 			Console.WriteLine(detectSentimentTask.Result);
32 | 			var detectEntitiesTask = comprehendService.DetectEntities("en", text);
33 | 			detectEntitiesTask.Wait();
34 | 			detectEntitiesTask.Result.ForEach(entity => {
35 | 				Console.WriteLine("{0}:{1}:{2}", entity.Text, entity.Score, entity.Type);
36 | 			});
37 | 		}
38 | 	}
39 | }


--------------------------------------------------------------------------------
/src-csharp/ArgHandlers/NlpComprehendMedicalHandler.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using Dotnet_Core.Services;
 3 | 
 4 | namespace Dotnet_Core.ArgHandlers {
 5 | 	/// <summary>
 6 | 	/// Handler that detects the text of a local file and prints the entities and traits reported for it.
 7 | 	/// </summary>
 8 | 	internal class NlpComprehendMedicalHandler {
 9 | 		private readonly TextractTextDetectionService textractTextService;
10 | 		private readonly ComprehendService comprehendMedicalService;
11 | 
12 | 		public NlpComprehendMedicalHandler(TextractTextDetectionService textractTextService, ComprehendService comprehendMedicalService) {
13 | 			this.textractTextService = textractTextService;
14 | 			this.comprehendMedicalService = comprehendMedicalService;
15 | 		}
16 | 
17 | 		/// <summary>
18 | 		/// Extracts the lines of <paramref name="medicalFile"/>, runs entity detection on the combined text and
19 | 		/// prints each entity followed by its traits.
20 | 		/// </summary>
21 | 		internal void Handle(string medicalFile) {
22 | 			var localTask = textractTextService.DetectTextLocal(medicalFile);
23 | 			localTask.Wait();
24 | 			var result = localTask.Result;
25 | 			var lineItems = textractTextService.GetLines(result);
26 | 			// Separate the lines with a space: joining on "" fused the last word of one line with the first
27 | 			// word of the next.
28 | 			var text = string.Join(" ", lineItems);
29 | 			var medicalTask = comprehendMedicalService.DetectEntities(text);
30 | 			medicalTask.Wait();
31 | 			medicalTask.Result.ForEach(entity => {
32 | 				Console.WriteLine("Text: [{0}], Type: [{1}], Category: [{2}]", entity.Text, entity.Type, entity.Category);
33 | 				entity.Traits.ForEach(trait => {
34 | 					Console.WriteLine(" Trait: [{0}], Score: [{1}]", trait.Name, trait.Score);
35 | 				});
36 | 			});
37 | 		}
38 | 	}
39 | }


--------------------------------------------------------------------------------
/src-csharp/ArgHandlers/PdfTextHandler.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using Dotnet_Core.Services;
 3 | 
 4 | namespace Dotnet_Core.ArgHandlers {
 5 | 	/// <summary>
 6 | 	/// Handler that runs an asynchronous text-detection job for a PDF stored in S3.
 7 | 	/// </summary>
 8 | 	internal class PdfTextHandler {
 9 | 		private readonly TextractTextDetectionService textractTextService;
10 | 
11 | 		public PdfTextHandler(TextractTextDetectionService textractTextService) {
12 | 			this.textractTextService = textractTextService;
13 | 		}
14 | 
15 | 		/// <summary>
16 | 		/// Starts text detection for <paramref name="pdfFile"/> in <paramref name="bucketName"/>, waits for the
17 | 		/// job to complete and hands its results to the service's Print method.
18 | 		/// </summary>
19 | 		internal void Handle(string bucketName, string pdfFile) {
20 | 			// Result blocks until the start request has returned the job id.
21 | 			var jobId = textractTextService.StartDocumentTextDetection(bucketName, pdfFile).Result;
22 | 			textractTextService.WaitForJobCompletion(jobId);
23 | 			var jobResults = textractTextService.GetJobResults(jobId);
24 | 			textractTextService.Print(jobResults);
25 | 		}
26 | 	}
27 | }


--------------------------------------------------------------------------------
/src-csharp/ArgHandlers/ReadingOrderHandler.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using System.Collections.Generic;
 3 | using Dotnet_Core.Services;
 4 | 
 5 | namespace Dotnet_Core.ArgHandlers {
 6 | 	/// <summary>
 7 | 	/// Handler that groups the detected LINE blocks of a multi-column image into columns and prints them
 8 | 	/// column by column.
 9 | 	/// </summary>
10 | 	internal class ReadingOrderHandler {
11 | 		private readonly TextractTextDetectionService textractTextService;
12 | 
13 | 		public ReadingOrderHandler(TextractTextDetectionService textractTextService) {
14 | 			this.textractTextService = textractTextService;
15 | 		}
16 | 
17 | 		/// <summary>
18 | 		/// Runs text detection for <paramref name="twoColumnImage"/> in <paramref name="bucketName"/>, assigns
19 | 		/// every LINE block to a column based on horizontal overlap and prints the lines of each column in the
20 | 		/// order the columns were discovered.
21 | 		/// </summary>
22 | 		internal void Handle(string bucketName, string twoColumnImage) {
23 | 			var task = textractTextService.StartDocumentTextDetection(bucketName, twoColumnImage);
24 | 			var jobId = task.Result;
25 | 			textractTextService.WaitForJobCompletion(jobId);
26 | 			var jobResults = textractTextService.GetJobResults(jobId);
27 | 			var lines = new List<IndexedText>();
28 | 			var columns = new List<Column>();
29 | 			jobResults.ForEach(job => {
30 | 				job.Blocks.ForEach(block => {
31 | 					if(block.BlockType == "LINE") {
32 | 						// The block's own geometry does not change while scanning the known columns.
33 | 						var bb = block.Geometry.BoundingBox;
34 | 						var bbLeft = bb.Left;
35 | 						var bbRight = bb.Left + bb.Width;
36 | 						var bbCentre = bb.Left + (bb.Width / 2);
37 | 						var assignedColumn = -1;
38 | 						for(var index = 0; index < columns.Count; index++) {
39 | 							var column = columns[index];
40 | 							// Column.Right is stored as an absolute coordinate (Left + Width), so the centre is
41 | 							// the average of both edges, not Left + Right / 2.
42 | 							var columnCentre = (column.Left + column.Right) / 2;
43 | 
44 | 							if((bbCentre > column.Left && bbCentre < column.Right) || (columnCentre > bbLeft && columnCentre < bbRight)) {
45 | 								assignedColumn = index;
46 | 								break;
47 | 							}
48 | 						}
49 | 						if(assignedColumn < 0) {
50 | 							// No overlapping column yet: this line opens a new one.
51 | 							columns.Add(new Column { Left = bb.Left, Right = bb.Left + bb.Width });
52 | 							assignedColumn = columns.Count - 1;
53 | 						}
54 | 						lines.Add(new IndexedText { ColumnIndex = assignedColumn, Text = block.Text });
55 | 					}
56 | 				});
57 | 			});
58 | 			// Print once, after every result page has been processed, and cover every column rather than
59 | 			// only column 0. FindAll keeps the lines of a column in their original order.
60 | 			for(var col = 0; col < columns.Count; col++) {
61 | 				lines.FindAll(line => line.ColumnIndex == col).ForEach(line => Console.WriteLine(line));
62 | 			}
63 | 		}
64 | 	}
65 | }


--------------------------------------------------------------------------------
/src-csharp/ArgHandlers/SearchHandler.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using Amazon.Textract.Model;
 3 | using Dotnet_Core.Services;
 4 | 
 5 | namespace Dotnet_Core.ArgHandlers {
 6 | 	/// <summary>
 7 | 	/// Handler that detects the text of an S3 object and sends the response to the search index.
 8 | 	/// </summary>
 9 | 	internal class SearchHandler {
10 | 		private readonly TextractTextDetectionService textractTextService;
11 | 		private readonly ElasticSearchService elasticSearchService;
12 | 
13 | 		public SearchHandler(TextractTextDetectionService textractTextService, ElasticSearchService elasticSearchService) {
14 | 			this.textractTextService = textractTextService;
15 | 			this.elasticSearchService = elasticSearchService;
16 | 		}
17 | 
18 | 		/// <summary>
19 | 		/// Detects the text of <paramref name="s3File"/> in <paramref name="bucketName"/>, prints it and indexes
20 | 		/// the response under "sample-index".
21 | 		/// </summary>
22 | 		internal void Handle(string bucketName, string s3File) {
23 | 			// Reading Result blocks until the task has completed, exactly like Wait() followed by Result.
24 | 			var detection = textractTextService.DetectTextS3(bucketName, s3File).Result;
25 | 			textractTextService.Print(detection);
26 | 			elasticSearchService.Index<DetectDocumentTextResponse>(detection, "sample-index");
27 | 			Console.WriteLine("Index complete");
28 | 		}
29 | 	}
30 | }


--------------------------------------------------------------------------------
/src-csharp/ArgHandlers/TablesExpenseHandler.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using System.Collections.Generic;
 3 | using System.Text;
 4 | using Amazon.Textract.Model;
 5 | using Dotnet_Core.Services;
 6 | 
 7 | namespace Dotnet_Core.ArgHandlers {
 8 | 	/// <summary>
 9 | 	/// Handler that runs a "TABLES" analysis of an expense document and reports rows whose fifth cell is
10 | 	/// a number greater than 100.
11 | 	/// </summary>
12 | 	internal class TablesExpenseHandler {
13 | 		private readonly TextractTextAnalysisService textractAnalysisService;
14 | 
15 | 		public TablesExpenseHandler(TextractTextAnalysisService textractAnalysisService) {
16 | 			this.textractAnalysisService = textractAnalysisService;
17 | 		}
18 | 
19 | 		/// <summary>
20 | 		/// Analyses <paramref name="expenseFile"/> in <paramref name="bucketName"/>, prints every table cell and
21 | 		/// finishes with a warnings section naming each row (by its first cell) whose fifth cell exceeds 100.
22 | 		/// </summary>
23 | 		internal void Handle(string bucketName, string expenseFile) {
24 | 			var task = textractAnalysisService.StartDocumentAnalysis(bucketName, expenseFile, "TABLES");
25 | 			var jobId = task.Result;
26 | 			textractAnalysisService.WaitForJobCompletion(jobId);
27 | 			var results = textractAnalysisService.GetJobResults(jobId);
28 | 			var warnings = new StringBuilder();
29 | 			var document = new TextractDocument(results);
30 | 			document.Pages.ForEach(page => {
31 | 				page.Tables.ForEach(table => {
32 | 					var r = 0;
33 | 					table.Rows.ForEach(row => {
34 | 						r++;
35 | 						var itemName = string.Empty;
36 | 						var c = 0;
37 | 						row.Cells.ForEach(cell => {
38 | 							// r and c are 1-based row/column counters used in the printout.
39 | 							c++;
40 | 							Console.WriteLine("Table [{0}][{1}] = {2}", r, c, cell.Text);
41 | 							if(c == 1) {
42 | 								itemName = cell.Text;
43 | 							} else if(c == 5) {
44 | 								// Parse with the invariant culture so "123.45" means the same amount regardless
45 | 								// of the machine's regional settings; the styles match float.TryParse's default.
46 | 								float expense;
47 | 								var parsed = float.TryParse(
48 | 									cell.Text,
49 | 									System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
50 | 									System.Globalization.CultureInfo.InvariantCulture,
51 | 									out expense);
52 | 								if(parsed && expense > 100) {
53 | 									warnings.AppendFormat("{0} is greater than $100{1}", itemName, Environment.NewLine);
54 | 								}
55 | 							}
56 | 						});
57 | 					});
58 | 				});
59 | 			});
60 | 			Console.WriteLine("{0}===Warnings==={0}{1}===", Environment.NewLine, warnings);
61 | 		}
62 | 	}
63 | }


--------------------------------------------------------------------------------
/src-csharp/ArgHandlers/TablesHandler.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using Amazon.Textract.Model;
 3 | using Dotnet_Core.Services;
 4 | 
 5 | namespace Dotnet_Core.ArgHandlers {
 6 | 	/// <summary>
 7 | 	/// Handler that runs a "TABLES" analysis for an S3 object and prints every table cell.
 8 | 	/// </summary>
 9 | 	internal class TablesHandler {
10 | 		private readonly TextractTextAnalysisService textractAnalysisService;
11 | 
12 | 		public TablesHandler(TextractTextAnalysisService textractAnalysisService) {
13 | 			this.textractAnalysisService = textractAnalysisService;
14 | 		}
15 | 
16 | 		/// <summary>
17 | 		/// Analyses <paramref name="formFile"/> in <paramref name="bucketName"/> and writes one line per cell,
18 | 		/// labelled with its 1-based row and column position inside its table.
19 | 		/// </summary>
20 | 		internal void Handle(string bucketName, string formFile) {
21 | 			var jobId = textractAnalysisService.StartDocumentAnalysis(bucketName, formFile, "TABLES").Result;
22 | 			textractAnalysisService.WaitForJobCompletion(jobId);
23 | 			var document = new TextractDocument(textractAnalysisService.GetJobResults(jobId));
24 | 			foreach(var page in document.Pages) {
25 | 				foreach(var table in page.Tables) {
26 | 					// Row numbering restarts for every table.
27 | 					var rowNumber = 0;
28 | 					foreach(var row in table.Rows) {
29 | 						rowNumber++;
30 | 						// Column numbering restarts for every row.
31 | 						var columnNumber = 0;
32 | 						foreach(var cell in row.Cells) {
33 | 							columnNumber++;
34 | 							Console.WriteLine("Table [{0}][{1}] = {2}", rowNumber, columnNumber, cell.Text);
35 | 						}
36 | 					}
37 | 				}
38 | 			}
39 | 		}
40 | 	}
41 | }


--------------------------------------------------------------------------------
/src-csharp/ArgHandlers/TranslateHandler.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using System.Text;
 3 | using Dotnet_Core.Services;
 4 | 
 5 | namespace Dotnet_Core.ArgHandlers {
 6 | 	/// <summary>
 7 | 	/// Handler that detects the text of an S3 object and prints it together with its translation.
 8 | 	/// </summary>
 9 | 	internal class TranslateHandler {
10 | 		private readonly TextractTextDetectionService textractTextService;
11 | 		private readonly TranslateService translateService;
12 | 
13 | 		public TranslateHandler(TextractTextDetectionService textractTextService, TranslateService translateService) {
14 | 			this.textractTextService = textractTextService;
15 | 			this.translateService = translateService;
16 | 		}
17 | 
18 | 		/// <summary>
19 | 		/// Collects the LINE blocks detected in <paramref name="s3File"/> (one per output line), prints that
20 | 		/// text, then requests a translation with the codes "en" and "de" and prints the translated text.
21 | 		/// </summary>
22 | 		internal void Handle(string bucketName, string s3File) {
23 | 			var detection = textractTextService.DetectTextS3(bucketName, s3File).Result;
24 | 			var collected = new StringBuilder();
25 | 			foreach(var block in detection.Blocks) {
26 | 				// Only whole lines are kept; other block types are skipped.
27 | 				if(block.BlockType == "LINE") {
28 | 					collected.AppendLine(block.Text);
29 | 				}
30 | 			}
31 | 			var sourceText = collected.ToString();
32 | 			Console.WriteLine(sourceText);
33 | 			var translation = translateService.TranslateText(sourceText, "en", "de").Result;
34 | 			Console.WriteLine(translation.TranslatedText);
35 | 		}
36 | 	}
37 | }


--------------------------------------------------------------------------------
/src-csharp/Program.cs:
--------------------------------------------------------------------------------
  1 | using System;
  2 | using Amazon.Textract;
  3 | using Amazon.Translate;
  4 | using Microsoft.Extensions.Configuration;
  5 | using Amazon.Comprehend;
  6 | using Amazon.ComprehendMedical;
  7 | using Dotnet_Core.Services;
  8 | using Dotnet_Core.ArgHandlers;
  9 | 
 10 | namespace Dotnet_Core {
 11 | 	/// <summary>
 12 | 	/// Entry point of the sample console app; maps a single command-line switch to its handler.
 13 | 	/// </summary>
 14 | 	partial class Program {
 15 | 
 16 | 		const string BucketName = "textract-console-us-west-2-d92b0df4-a50a-4203-b070-044c3ee7fe83";
 17 | 		const string LocalEmploymentFile = "test-files/employmentapp.png";
 18 | 		const string LocalSimpleFile = "test-files/simple-document-image.jpg";
 19 | 		const string LocalMedicalFile = "test-files/medical-notes.png";
 20 | 		const string LocalFolder = "test-files";
 21 | 		const string S3File = "simple-document-image.jpg";
 22 | 		const string TwoColumnImage = "two-column-image.jpg";
 23 | 		const string PdfFile = "Amazon-Textract-Pdf.pdf";
 24 | 		const string FormFile = "employmentapp.png";
 25 | 		const string ExpenseFile = "expense.png";
 26 | 		const string ElasticSearchEndpoint = "https://search-textract-sample-hvthzep6bedgfdj6oxeng5jtmi.us-west-2.es.amazonaws.com";
 27 | 		const string ElasticSearchDomainName = "textract-sample";
 28 | 
 29 | 		/// <summary>
 30 | 		/// With no arguments, prints the help text and returns. Otherwise loads appsettings.json plus environment
 31 | 		/// variables, creates the service wrappers and dispatches on the first argument; an unrecognised switch
 32 | 		/// prints the help text.
 33 | 		/// </summary>
 34 | 		/// <param name="args">Command-line arguments; only the first one is inspected.</param>
 35 | 		static void Main(string[] args) {
 36 | 			if(args.Length == 0) {
 37 | 				Console.WriteLine(HelpText);
 38 | 				return;
 39 | 			}
 40 | 
 41 | 			var firstArg = args[0];
 42 | 
 43 | 			var builder = new ConfigurationBuilder()
 44 | 				.SetBasePath(Environment.CurrentDirectory)
 45 | 				.AddJsonFile("appsettings.json", optional: false, reloadOnChange: true)
 46 | 				.AddEnvironmentVariables()
 47 | 				.Build();
 48 | 			var awsOptions = builder.GetAWSOptions();
 49 | 			// Region is null when the configuration does not name one; the null-conditional keeps this
 50 | 			// diagnostic line from throwing before any handler runs.
 51 | 			Console.WriteLine(awsOptions.Profile + ":" + awsOptions.ProfilesLocation + ": " + awsOptions.Region?.DisplayName);
 52 | 			var textractTextService = new TextractTextDetectionService(awsOptions.CreateServiceClient<IAmazonTextract>());
 53 | 			var textractAnalysisService = new TextractTextAnalysisService(awsOptions.CreateServiceClient<IAmazonTextract>());
 54 | 			var translateService = new TranslateService(awsOptions.CreateServiceClient<IAmazonTranslate>());
 55 | 			var comprehendService = new ComprehendService(awsOptions.CreateServiceClient<IAmazonComprehend>());
 56 | 			// NOTE(review): ComprehendService is also the wrapper used for the Comprehend Medical client; confirm it has a matching constructor.
 57 | 			var comprehendMedicalService = new ComprehendService(awsOptions.CreateServiceClient<IAmazonComprehendMedical>());
 58 | 			var elasticSearchService = new ElasticSearchService(ElasticSearchEndpoint, ElasticSearchDomainName);
 59 | 
 60 | 			switch(firstArg) {
 61 | 				case "--detect-text-local":
 62 | 					new DetectTextHandler(textractTextService).Handle(LocalEmploymentFile);
 63 | 					break;
 64 | 				case "--detect-text-s3":
 65 | 					new DetectTextS3Handler(textractTextService).Handle(BucketName, S3File);
 66 | 					break;
 67 | 				case "--pdf-text":
 68 | 					new PdfTextHandler(textractTextService).Handle(BucketName, PdfFile);
 69 | 					break;
 70 | 				case "--reading-order":
 71 | 					new ReadingOrderHandler(textractTextService).Handle(BucketName, TwoColumnImage);
 72 | 					break;
 73 | 				case "--translate":
 74 | 					new TranslateHandler(textractTextService, translateService).Handle(BucketName, S3File);
 75 | 					break;
 76 | 				case "--search":
 77 | 					new SearchHandler(textractTextService, elasticSearchService).Handle(BucketName, S3File);
 78 | 					break;
 79 | 				case "--forms":
 80 | 					new FormsHandler(textractAnalysisService).Handle(BucketName, FormFile);
 81 | 					break;
 82 | 				case "--forms-redaction":
 83 | 					new FormsRedactionHandler(textractAnalysisService).Handle(BucketName, FormFile, LocalFolder, LocalEmploymentFile);
 84 | 					break;
 85 | 				case "--tables":
 86 | 					new TablesHandler(textractAnalysisService).Handle(BucketName, FormFile);
 87 | 					break;
 88 | 				case "--tables-expense":
 89 | 					new TablesExpenseHandler(textractAnalysisService).Handle(BucketName, ExpenseFile);
 90 | 					break;
 91 | 				case "--nlp-comprehend":
 92 | 					new NlpComprehendHandler(textractTextService, comprehendService).Handle(LocalSimpleFile);
 93 | 					break;
 94 | 				case "--nlp-medical":
 95 | 					new NlpComprehendMedicalHandler(textractTextService, comprehendMedicalService).Handle(LocalMedicalFile);
 96 | 					break;
 97 | 				default:
 98 | 					Console.WriteLine(HelpText);
 99 | 					break;
100 | 			}
101 | 		}
102 | 
103 | 		const string HelpText = @"
104 |             Usage: dotnet run [--switch]
105 |             To run this console app, use the following valid switches one at a time:
106 | 				--detect-text-local
107 | 				--detect-text-s3
108 | 				--pdf-text
109 | 				--forms
110 | 				--forms-redaction
111 | 				--tables
112 | 				--tables-expense
113 | 				--reading-order
114 | 				--nlp-comprehend
115 | 				--nlp-medical
116 | 				--translate
117 | 				--search
118 |                 e.g. dotnet run --detect-text-s3
119 |             ";
120 | 	}
121 | }


--------------------------------------------------------------------------------
/src-csharp/Readme.md:
--------------------------------------------------------------------------------
  1 | # C# .NET Core implementation
  2 | 
  3 | Amazon Textract samples for .NET Core with C#
  4 | 
  5 | ## Prerequisites
  6 | 
  7 | - [.NET Core 2.2](https://dotnet.microsoft.com/download/dotnet-core/2.2)
  8 | - [AWS CLI](https://docs.aws.amazon.com/polly/latest/dg/setup-aws-cli.html) for
  9 |   running AWS CLI commands after configuring a
 10 |   [default or named profile](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
 11 | - Upload files from `test-files` to a target Amazon S3 bucket in your account
 12 | 
 13 | ## Usage
 14 | 
 15 | ```
 16 | Usage: dotnet run [--switch]
 17 | To run this console app, use the following valid switches one at a time:
 18 |                      --detect-text-local
 19 |                      --detect-text-s3
 20 |                      --pdf-text
 21 |                      --forms
 22 |                      --forms-redaction
 23 |                      --tables
 24 |                      --tables-expense
 25 |                      --reading-order
 26 |                      --nlp-comprehend
 27 |                      --nlp-medical
 28 |                      --translate
 29 |                      --search
 30 |       e.g. dotnet run --detect-text-s3
 31 | ```
 32 | 
 33 | ## Samples
 34 | 
 35 | | Argument            | Description                                                |
 36 | | ------------------- | ---------------------------------------------------------- |
 37 | | --detect-text-local | Example showing processing a document on local machine.    |
 38 | | --detect-text-s3    | Example showing processing a document in Amazon S3 bucket. |
 39 | | --pdf-text          | Example showing PDF document processing.                   |
 40 | | --forms             | Example showing form (key/value) processing.               |
 41 | | --forms-redaction   | Example showing redacting information in document.         |
 42 | | --tables            | Example showing table processing.                          |
 43 | | --tables-expense    | Example showing validation of table data.                  |
 44 | | --reading-order     | Example showing printing document in reading order.        |
 45 | | --nlp-comprehend    | Example showing detecting entities and sentiment.          |
 46 | | --nlp-medical       | Example showing detecting medical entities.                |
 47 | | --translate         | Example showing translation of documents.                  |
 48 | | --search            | Example showing document indexing in Elasticsearch.        |
 49 | 
 50 | Example usage and result
 51 | 
 52 | ```
 53 | dotnet-core sanjeet$ dotnet run --forms
 54 | default:: US West (Oregon)
 55 | ........Key: Phone Number:, Value 555-0100
 56 | Key: Full Name:, Value Jane Doe
 57 | Key: Home Address:, Value 123 Any Street. Any Town, USA
 58 | Key: Mailing Address:, Value same as home address
 59 | Get Field by Key:
 60 | Key: Phone Number:, Value: 555-0100
 61 | ```
 62 | 
 63 | The following source document was used by the example above to analyze Form
 64 | data. This document has a Form and a Table on it:
 65 | 
 66 | ![source document](test-files/employmentapp.png)
 67 | 
 68 | The following AWS services are used:
 69 | 
 70 | - Amazon Textract (for text extraction and analysis)
 71 | - Amazon Comprehend (for natural language processing)
 72 | - Amazon Comprehend Medical (for natural language processing of medical
 73 |   prescriptions/documents)
 74 | - Amazon Elasticsearch (for full text indexing and search)
 75 | - Amazon S3 (for storing scanned documents/images used by Amazon Textract)
 76 | - Amazon Translate (for translating text from English to other supported
 77 |   languages)
 78 | 
 79 | ## Dependencies
 80 | 
 81 | The appsettings.json file uses your default AWS profile so that you don't have
 82 | to set AWS credentials in clear text.
 83 | 
 84 | ```
 85 | {
 86 |   "AWS": {
 87 |     "Profile": "default",
 88 |     "Region": "us-west-2"
 89 |   }
 90 | }
 91 | ```
 92 | 
 93 | ### A quick walkthrough of the .csproj file
 94 | 
 95 | dotnet-core.csproj file: required .NET libraries - these libraries will be
 96 | auto-installed as part of the build process
 97 | 
 98 | ```
 99 | <ItemGroup>
100 |    <PackageReference Include="AWSSDK.Comprehend" Version="3.3.104.13" />
101 |    <PackageReference Include="AWSSDK.ComprehendMedical" Version="3.3.100.31" />
102 |    <PackageReference Include="AWSSDK.Extensions.NETCore.Setup" Version="3.3.100.1" />
103 |    <PackageReference Include="AWSSDK.S3" Version="3.3.102.12" />
104 |    <PackageReference Include="AWSSDK.Textract" Version="3.3.101.23" />
105 |    <PackageReference Include="AWSSDK.Translate" Version="3.3.100.28" />
106 |    <PackageReference Include="Microsoft.Extensions.Configuration" Version="2.2.0" />
107 |    <PackageReference Include="Microsoft.Extensions.Configuration.EnvironmentVariables" Version="2.2.4" />
108 |    <PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="2.2.0" />
109 |    <PackageReference Include="NEST" Version="6.8.0" />
110 |    <PackageReference Include="System.Drawing.Common" Version="4.5.1" />
111 | </ItemGroup>
112 | ```
113 | 
114 | The code sample that performs redaction on a form uses the .NET Core
115 | System.Drawing.Common package. You can add the System.Drawing.Common package to
116 | the project by using the following dotnet CLI command:
117 | 
118 | ```
119 | dotnet add package System.Drawing.Common --version 4.5.1
120 | ```
121 | 
122 | [NEST](https://github.com/elastic/elasticsearch-net) is the official
123 | Elasticsearch client that's used by this sample to send text for indexing in an
124 | Amazon Elasticsearch domain provisioned in AWS. Use the following command to
125 | install NEST
126 | 
127 | ```
128 | dotnet add package NEST --version 6.8.0
129 | ```
130 | 
131 | AWSSDK.\* packages are Nuget client libraries, and can be installed using a
132 | command similar to the following
133 | 
134 | ```
135 | dotnet add package AWSSDK.<package-name>
136 | ```
137 | 
138 | dotnet-core.csproj file: the "test-files" folder has all the required files
139 | (e.g. pdf, jpg, and png) used for testing; they are all copied to the output
140 | directory, as is appsettings.json
141 | 
142 | ```
143 | <ItemGroup>
144 |    <None Update="appsettings.json">
145 |    <CopyToOutputDirectory>Always</CopyToOutputDirectory>
146 |    </None>
147 | </ItemGroup>
148 | 
149 | <ItemGroup>
150 |    <None Update="test-files\*">
151 |    <CopyToOutputDirectory>Always</CopyToOutputDirectory>
152 |    </None>
153 | </ItemGroup>
154 | ```
155 | 
156 | To recursively copy files from local disk to S3 use the following command
157 | 
158 | ```
159 | aws s3 cp dotnet-core/test-files s3://<Your-S3-bucket>/ --include "*" --recursive
160 | ```
161 | 
162 | ## Pro tips
163 | 
164 | If you ever encounter the following error while running this .NET Core
165 | application on macOS
166 | 
167 | ```
168 | Unhandled Exception: System.TypeInitializationException: The type initializer for 'Gdip' threw an exception. ---> System.DllNotFoundException: Unable to load DLL 'libgdiplus': The specified module could not be found.
169 |    at System.Runtime.InteropServices.FunctionWrapper`1.get_Delegate()
170 |    at System.Drawing.SafeNativeMethods.Gdip.GdiplusStartup(IntPtr& token, StartupInput& input, StartupOutput& output)
171 |    at System.Drawing.SafeNativeMethods.Gdip..cctor()
172 |    --- End of inner exception stack trace ---
173 | ```
174 | 
175 | Install this package
176 | 
177 | ```
178 | brew install mono-libgdiplus
179 | ```
180 | 
181 | and if you see the following error
182 | 
183 | ```
184 | Unhandled Exception: System.ArgumentException: Parameter is not valid.
185 |    at System.Drawing.Graphics.DrawRectangle(Pen pen, Int32 x, Int32 y, Int32 width, Int32 height)
186 | ```
187 | 
188 | ensure that the following lines are commented out (the graphics object is disposed
189 | before it gets an opportunity to call DrawRectangle, which is why you get the error)
190 | 
191 | ```
192 | // graphics.Dispose();
193 | // image.Dispose();
194 | ```
195 | 


--------------------------------------------------------------------------------
/src-csharp/Services/Column.cs:
--------------------------------------------------------------------------------
 1 | namespace Dotnet_Core.Services {
 2 | 	/// <summary>
 3 | 	/// Holds a pair of coordinates named Left and Right.
 4 | 	/// </summary>
 5 | 	internal class Column {
 6 | 		/// <summary>Left coordinate.</summary>
 7 | 		public float Left { get; set; }
 8 | 
 9 | 		/// <summary>Right coordinate.</summary>
10 | 		public float Right { get; set; }
11 | 
12 | 		/// <summary>Renders both coordinates as a single diagnostic string.</summary>
13 | 		public override string ToString() => $"Left: {this.Left}, Right :{this.Right}";
14 | 	}
15 | }


--------------------------------------------------------------------------------
/src-csharp/Services/ComprehendService.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using System.Collections.Generic;
 3 | using System.Threading.Tasks;
 4 | using Amazon.Comprehend;
 5 | using Amazon.Comprehend.Model;
 6 | using Amazon.ComprehendMedical;
 7 | using MedicalModel = Amazon.ComprehendMedical.Model;
 8 | 
 9 | namespace Dotnet_Core.Services {
10 | 	/// <summary>
11 | 	/// Wrapper around the Amazon Comprehend and Amazon Comprehend Medical clients.
12 | 	/// Each constructor stores only one of the two clients, so an instance supports
13 | 	/// either the Comprehend methods or the Comprehend Medical method, not both.
14 | 	/// </summary>
15 | 	public class ComprehendService {
16 | 		private IAmazonComprehend comprehend { get; }
17 | 		private IAmazonComprehendMedical comprehendMedical { get; }
18 | 
19 | 		/// <summary>Creates a service backed by an Amazon Comprehend client.</summary>
20 | 		public ComprehendService(IAmazonComprehend comprehend) {
21 | 			this.comprehend = comprehend;
22 | 		}
23 | 
24 | 		/// <summary>Creates a service backed by an Amazon Comprehend Medical client.</summary>
25 | 		public ComprehendService(IAmazonComprehendMedical comprehend) {
26 | 			this.comprehendMedical = comprehend;
27 | 		}
28 | 
29 | 		/// <summary>Detects the sentiment of <paramref name="text"/> with Amazon Comprehend.</summary>
30 | 		/// <returns>The sentiment label and its four scores.</returns>
31 | 		/// <exception cref="InvalidOperationException">No Amazon Comprehend client was supplied.</exception>
32 | 		public async Task<DetectedSentiment> DetectSentiment(string languageCode, string text) {
33 | 			var response = await this.RequireComprehend().DetectSentimentAsync(new DetectSentimentRequest {
34 | 				LanguageCode = languageCode,
35 | 				Text = text
36 | 			});
37 | 			return new DetectedSentiment {
38 | 				Sentiment = response.Sentiment,
39 | 				Mixed = response.SentimentScore.Mixed,
40 | 				Neutral = response.SentimentScore.Neutral,
41 | 				Negative = response.SentimentScore.Negative,
42 | 				Positive = response.SentimentScore.Positive
43 | 			};
44 | 		}
45 | 
46 | 		/// <summary>Detects entities in <paramref name="text"/> with Amazon Comprehend.</summary>
47 | 		/// <exception cref="InvalidOperationException">No Amazon Comprehend client was supplied.</exception>
48 | 		public async Task<List<Entity>> DetectEntities(string languageCode, string text) {
49 | 			var response = await this.RequireComprehend().DetectEntitiesAsync(new DetectEntitiesRequest {
50 | 				LanguageCode = languageCode,
51 | 				Text = text
52 | 			});
53 | 			return response.Entities;
54 | 		}
55 | 
56 | 		/// <summary>Detects entities in <paramref name="text"/> with Amazon Comprehend Medical.</summary>
57 | 		/// <exception cref="InvalidOperationException">No Amazon Comprehend Medical client was supplied.</exception>
58 | 		public async Task<List<MedicalModel.Entity>> DetectEntities(string text) {
59 | 			if(this.comprehendMedical == null) {
60 | 				throw new InvalidOperationException("This ComprehendService was not created with an Amazon Comprehend Medical client.");
61 | 			}
62 | 			var response = await this.comprehendMedical.DetectEntitiesAsync(new MedicalModel.DetectEntitiesRequest {
63 | 				Text = text
64 | 			});
65 | 			return response.Entities;
66 | 		}
67 | 
68 | 		// Returns the Comprehend client, or fails with a clear message when the
69 | 		// instance was built through the Comprehend Medical constructor.
70 | 		private IAmazonComprehend RequireComprehend() {
71 | 			if(this.comprehend == null) {
72 | 				throw new InvalidOperationException("This ComprehendService was not created with an Amazon Comprehend client.");
73 | 			}
74 | 			return this.comprehend;
75 | 		}
76 | 
77 | 		/// <summary>Sentiment label together with the score of each sentiment class.</summary>
78 | 		public class DetectedSentiment {
79 | 			public string Sentiment { get; set; }
80 | 			public float Mixed { get; set; }
81 | 			public float Positive { get; set; }
82 | 			public float Neutral { get; set; }
83 | 			public float Negative { get; set; }
84 | 
85 | 			public override string ToString() {
86 | 				return string.Format("Sentiment: {0}, Score: [Mixed: {1}, Positive: {2}, Negative: {3}, Neutral: {4}]", this.Sentiment, this.Mixed, this.Positive, this.Negative, this.Neutral);
87 | 			}
88 | 
89 | 		}
90 | 	}
91 | }
92 | 


--------------------------------------------------------------------------------
/src-csharp/Services/ElasticSearchService.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using Nest;
 3 | 
 4 | namespace Dotnet_Core.Services {
 5 | 	/// <summary>
 6 | 	/// Small wrapper around a NEST <see cref="ElasticClient"/> used to index documents.
 7 | 	/// </summary>
 8 | 	public class ElasticSearchService {
 9 | 		// private string domainUri, defaultIndex;
10 | 		private ElasticClient elasticClient;
11 | 
12 | 		/// <summary>
13 | 		/// Builds a client for <paramref name="endpoint"/> and registers
14 | 		/// <paramref name="domainName"/> as its default index.
15 | 		/// </summary>
16 | 		public ElasticSearchService(string endpoint, string domainName) {
17 | 			var endpointUri = new Uri(endpoint);
18 | 			var settings = new ConnectionSettings(endpointUri);
19 | 			settings.DefaultIndex(domainName);
20 | 			this.elasticClient = new ElasticClient(settings);
21 | 		}
22 | 
23 | 		/// <summary>Indexes <paramref name="item"/> into the index named <paramref name="indexName"/>.</summary>
24 | 		public void Index<T>(T item, string indexName) where T : class {
25 | 			this.elasticClient.Index<T>(item, descriptor => descriptor.Index(indexName));
26 | 		}
27 | 	}
28 | }


--------------------------------------------------------------------------------
/src-csharp/Services/IndexedText.cs:
--------------------------------------------------------------------------------
 1 | namespace Dotnet_Core.Services {
 2 | 	/// <summary>
 3 | 	/// A piece of text paired with a column index.
 4 | 	/// </summary>
 5 | 	internal class IndexedText {
 6 | 		/// <summary>Index of the column associated with the text.</summary>
 7 | 		public int ColumnIndex { get; set; }
 8 | 
 9 | 		/// <summary>The text itself.</summary>
10 | 		public string Text { get; set; }
11 | 
12 | 		/// <summary>Renders the entry as "[index] text".</summary>
13 | 		public override string ToString() => $"[{this.ColumnIndex}] {this.Text}";
14 | 	}
15 | }


--------------------------------------------------------------------------------
/src-csharp/Services/TextractTextAnalysisService.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using System.Collections.Generic;
 3 | using System.Threading.Tasks;
 4 | using Amazon.Textract;
 5 | using Amazon.Textract.Model;
 6 | 
 7 | namespace Dotnet_Core.Services {
 8 | 	/// <summary>Wrapper for the job-based Amazon Textract document-analysis calls.</summary>
 9 | 	public class TextractTextAnalysisService {
10 | 		private IAmazonTextract textract;
11 | 		public TextractTextAnalysisService(IAmazonTextract textract) {
12 | 			this.textract = textract;
13 | 		}
14 | 
15 | 		/// <summary>
16 | 		/// Returns the analysis result of <paramref name="jobId"/>. NextToken is followed and
17 | 		/// the blocks of all further pages are appended to the first page's Blocks list.
18 | 		/// </summary>
19 | 		public GetDocumentAnalysisResponse GetJobResults(string jobId) {
20 | 			var result = this.FetchPage(jobId, null);
21 | 			var nextToken = result.NextToken;
22 | 			// One call yields one result page only, so keep fetching while a token is returned.
23 | 			while(!string.IsNullOrEmpty(nextToken)) {
24 | 				var page = this.FetchPage(jobId, nextToken);
25 | 				if(page.Blocks != null) {
26 | 					result.Blocks.AddRange(page.Blocks);
27 | 				}
28 | 				nextToken = page.NextToken;
29 | 			}
30 | 			result.NextToken = null; // all pages are merged, nothing is left to fetch
31 | 			return result;
32 | 		}
33 | 
34 | 		/// <summary>True once the job status is anything other than IN_PROGRESS.</summary>
35 | 		public bool IsJobComplete(string jobId) {
36 | 			return !this.FetchPage(jobId, null).JobStatus.Equals("IN_PROGRESS");
37 | 		}
38 | 
39 | 		// Blocking fetch of one result page; a null token asks for the first page.
40 | 		private GetDocumentAnalysisResponse FetchPage(string jobId, string nextToken) {
41 | 			var response = this.textract.GetDocumentAnalysisAsync(new GetDocumentAnalysisRequest {
42 | 				JobId = jobId,
43 | 				NextToken = nextToken
44 | 			});
45 | 			response.Wait();
46 | 			return response.Result;
47 | 		}
48 | 
49 | 		/// <summary>Starts an analysis job for an S3 object and returns its job id.</summary>
50 | 		public async Task<string> StartDocumentAnalysis(string bucketName, string key, string featureType) {
51 | 			var request = new StartDocumentAnalysisRequest();
52 | 			request.DocumentLocation = new DocumentLocation {
53 | 				S3Object = new S3Object { Bucket = bucketName, Name = key }
54 | 			};
55 | 			request.FeatureTypes = new List<string> { featureType };
56 | 			var response = await this.textract.StartDocumentAnalysisAsync(request);
57 | 			return response.JobId;
58 | 		}
59 | 
60 | 		/// <summary>Polls every <paramref name="delay"/> ms until the job leaves IN_PROGRESS.</summary>
61 | 		public void WaitForJobCompletion(string jobId, int delay = 5000) {
62 | 			while(!IsJobComplete(jobId)) {
63 | 				this.Wait(delay);
64 | 			}
65 | 		}
66 | 
67 | 		// Blocks for the given number of milliseconds, then prints a progress dot.
68 | 		private void Wait(int delay = 5000) {
69 | 			Task.Delay(delay).Wait();
70 | 			Console.Write(".");
71 | 		}
72 | 
73 | 		/// <summary>
74 | 		/// Writes every block to the console; KEY_VALUE_SET, TABLE and CELL blocks are
75 | 		/// followed by their entity types and relationships.
76 | 		/// </summary>
77 | 		public void PrintDebug(GetDocumentAnalysisResponse response) {
78 | 			response.Blocks.ForEach(y => {
79 | 				Console.WriteLine("<block>");
80 | 				Console.WriteLine(y.Id + ":" + y.BlockType + ":" + y.Text);
81 | 				if(y.BlockType == "KEY_VALUE_SET") {
82 | 					PrintBlock(y, "KEY_VALUE_SET");
83 | 				} else if(y.BlockType == "TABLE") {
84 | 					PrintBlock(y, "TABLE");
85 | 				} else if(y.BlockType == "CELL") {
86 | 					PrintBlock(y, "CELL");
87 | 				}
88 | 				Console.WriteLine("</block>");
89 | 			});
90 | 		}
91 | 
92 | 		// Prints the entity types and relationships of one block, wrapped in a <tag> element.
93 | 		private void PrintBlock(Block block, string tag) {
94 | 			Console.WriteLine(" <" + tag + ">");
95 | 			Console.WriteLine("  <entity>");
96 | 			block.EntityTypes.ForEach(z => Console.WriteLine("   " + z));
97 | 			Console.WriteLine("  </entity>");
98 | 			block.Relationships.ForEach(z => {
99 | 				Console.WriteLine("  <relation>");
100 | 				Console.WriteLine("   " + z.Type);
101 | 				Console.WriteLine("   <id>");
102 | 				z.Ids.ForEach(a => Console.WriteLine("    " + a));
103 | 				Console.WriteLine("   </id>");
104 | 				Console.WriteLine("  </relation>");
105 | 			});
106 | 			Console.WriteLine(" </" + tag + ">");
107 | 		}
108 | 	}
109 | }


--------------------------------------------------------------------------------
/src-csharp/Services/TextractTextDetectionService.cs:
--------------------------------------------------------------------------------
  1 | using System;
  2 | using System.Collections.Generic;
  3 | using System.IO;
  4 | using System.Text;
  5 | using System.Threading.Tasks;
  6 | using Amazon.Textract;
  7 | using Amazon.Textract.Model;
  8 | 
  9 | namespace Dotnet_Core.Services {
 10 | 	/// <summary>Wrapper for Amazon Textract text detection, both direct calls and job-based calls.</summary>
 11 | 	public class TextractTextDetectionService {
 12 | 		private IAmazonTextract textract;
 13 | 		public TextractTextDetectionService(IAmazonTextract textract) {
 14 | 			this.textract = textract;
 15 | 		}
 16 | 
 17 | 		/// <summary>Starts a text-detection job for an S3 object and returns its job id.</summary>
 18 | 		public async Task<string> StartDocumentTextDetection(string bucketName, string key) {
 19 | 			var request = new StartDocumentTextDetectionRequest();
 20 | 			request.DocumentLocation = new DocumentLocation {
 21 | 				S3Object = new S3Object {
 22 | 					Bucket = bucketName,
 23 | 					Name = key
 24 | 				}
 25 | 			};
 26 | 			var response = await this.textract.StartDocumentTextDetectionAsync(request);
 27 | 			return response.JobId;
 28 | 		}
 29 | 
 30 | 		/// <summary>
 31 | 		/// Sends the content of a local file to DetectDocumentText. When the file does not
 32 | 		/// exist, a message is printed and an empty response object is returned.
 33 | 		/// </summary>
 34 | 		public async Task<DetectDocumentTextResponse> DetectTextLocal(string localPath) {
 35 | 			if(!File.Exists(localPath)) {
 36 | 				Console.WriteLine("File: " + localPath + " doesn't exist");
 37 | 				return new DetectDocumentTextResponse();
 38 | 			}
 39 | 			// The stream is only needed until the request has completed.
 40 | 			using(var content = new MemoryStream(File.ReadAllBytes(localPath))) {
 41 | 				var request = new DetectDocumentTextRequest {
 42 | 					Document = new Document { Bytes = content }
 43 | 				};
 44 | 				return await this.textract.DetectDocumentTextAsync(request);
 45 | 			}
 46 | 		}
 47 | 
 48 | 		/// <summary>Polls every <paramref name="delay"/> ms until the job leaves IN_PROGRESS.</summary>
 49 | 		public void WaitForJobCompletion(string jobId, int delay = 5000) {
 50 | 			while(!IsJobComplete(jobId)) {
 51 | 				this.Wait(delay);
 52 | 			}
 53 | 		}
 54 | 
 55 | 		/// <summary>True once the job status is anything other than IN_PROGRESS.</summary>
 56 | 		public bool IsJobComplete(string jobId) {
 57 | 			return !this.FetchPage(jobId, null).JobStatus.Equals("IN_PROGRESS");
 58 | 		}
 59 | 
 60 | 		/// <summary>
 61 | 		/// Collects every result page of the job, following NextToken and pausing
 62 | 		/// (default delay) before each additional page request.
 63 | 		/// </summary>
 64 | 		public List<GetDocumentTextDetectionResponse> GetJobResults(string jobId) {
 65 | 			var pages = new List<GetDocumentTextDetectionResponse>();
 66 | 			var page = this.FetchPage(jobId, null);
 67 | 			pages.Add(page);
 68 | 			while(page.NextToken != null) {
 69 | 				this.Wait();
 70 | 				page = this.FetchPage(jobId, page.NextToken);
 71 | 				pages.Add(page);
 72 | 			}
 73 | 			return pages;
 74 | 		}
 75 | 
 76 | 		// Blocking fetch of one result page; a null token asks for the first page.
 77 | 		private GetDocumentTextDetectionResponse FetchPage(string jobId, string nextToken) {
 78 | 			var response = this.textract.GetDocumentTextDetectionAsync(new GetDocumentTextDetectionRequest {
 79 | 				JobId = jobId,
 80 | 				NextToken = nextToken
 81 | 			});
 82 | 			response.Wait();
 83 | 			return response.Result;
 84 | 		}
 85 | 
 86 | 		// Blocks for the given number of milliseconds, then prints a progress dot.
 87 | 		private void Wait(int delay = 5000) {
 88 | 			Task.Delay(delay).Wait();
 89 | 			Console.Write(".");
 90 | 		}
 91 | 
 92 | 		/// <summary>Runs DetectDocumentText on an object stored in S3.</summary>
 93 | 		public async Task<DetectDocumentTextResponse> DetectTextS3(string bucketName, string key) {
 94 | 			var request = new DetectDocumentTextRequest {
 95 | 				Document = new Document {
 96 | 					S3Object = new S3Object {
 97 | 						Bucket = bucketName,
 98 | 						Name = key
 99 | 					}
100 | 				}
101 | 			};
102 | 			return await this.textract.DetectDocumentTextAsync(request);
103 | 		}
104 | 
105 | 		// Writes the text of every LINE block to the console.
106 | 		private void Print(List<Block> blocks) {
107 | 			blocks.ForEach(x => {
108 | 				if(x.BlockType.Equals("LINE")) {
109 | 					Console.WriteLine(x.Text);
110 | 				}
111 | 			});
112 | 		}
113 | 
114 | 		/// <summary>Prints the LINE blocks of a response; a null response prints nothing.</summary>
115 | 		public void Print(DetectDocumentTextResponse response) {
116 | 			if(response != null) {
117 | 				this.Print(response.Blocks);
118 | 			}
119 | 		}
120 | 
121 | 		/// <summary>Prints the LINE blocks of every page; a null or empty list prints nothing.</summary>
122 | 		public void Print(List<GetDocumentTextDetectionResponse> response) {
123 | 			if(response != null && response.Count > 0) {
124 | 				response.ForEach(r => this.Print(r.Blocks));
125 | 			}
126 | 		}
127 | 
128 | 		/// <summary>
129 | 		/// Returns the text of every LINE block in order. A null response (or one without
130 | 		/// blocks) yields an empty list, matching the null tolerance of the Print overloads.
131 | 		/// </summary>
132 | 		public List<string> GetLines(DetectDocumentTextResponse result) {
133 | 			var lines = new List<string>();
134 | 			if(result == null || result.Blocks == null) {
135 | 				return lines;
136 | 			}
137 | 			result.Blocks.FindAll(block => block.BlockType == "LINE").ForEach(block => lines.Add(block.Text));
138 | 			return lines;
139 | 		}
140 | 	}
141 | }


--------------------------------------------------------------------------------
/src-csharp/Services/TranslateService.cs:
--------------------------------------------------------------------------------
 1 | using System.Threading.Tasks;
 2 | using Amazon.Translate;
 3 | using Amazon.Translate.Model;
 4 | 
 5 | namespace Dotnet_Core.Services {
 6 | 	/// <summary>Wrapper around the Amazon Translate client.</summary>
 7 | 	public class TranslateService {
 8 | 		private IAmazonTranslate translate;
 9 | 
10 | 		public TranslateService(IAmazonTranslate translate) {
11 | 			this.translate = translate;
12 | 		}
13 | 
14 | 		/// <summary>
15 | 		/// Translates <paramref name="text"/> from <paramref name="sourceLanguage"/> to
16 | 		/// <paramref name="targetLanguage"/> and returns the service response.
17 | 		/// </summary>
18 | 		public async Task<TranslateTextResponse> TranslateText(string text, string sourceLanguage, string targetLanguage) {
19 | 			var response = await this.translate.TranslateTextAsync(new TranslateTextRequest {
20 | 				Text = text,
21 | 				SourceLanguageCode = sourceLanguage,
22 | 				TargetLanguageCode = targetLanguage
23 | 			});
24 | 			return response;
25 | 		}
26 | 	}
27 | }


--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/Cell.cs:
--------------------------------------------------------------------------------
 1 | using System.Collections.Generic;
 2 | using Amazon.Textract.Model;
 3 | 
 4 | namespace Amazon.Textract.Model {
 5 | 	/// <summary>
 6 | 	/// Table cell built from a block: copies the block's row/column position and span,
 7 | 	/// confidence, geometry and id, and gathers its WORD and SELECTION_ELEMENT children.
 8 | 	/// </summary>
 9 | 	public class Cell {
10 | 		/// <param name="block">Block to wrap; when null the cell stays empty.</param>
11 | 		/// <param name="blocks">Map from block id to block, used to resolve CHILD ids.</param>
12 | 		public Cell(Block block, Dictionary<string, Block> blocks) {
13 | 			// Set before the null check so that an empty cell has usable Content and Text.
14 | 			this.Content = new List<dynamic>();
15 | 			this.Text = string.Empty;
16 | 			if(block == null)
17 | 				return;
18 | 			this.Block = block;
19 | 			this.ColumnIndex = block.ColumnIndex;
20 | 			this.ColumnSpan = block.ColumnSpan;
21 | 			this.Confidence = block.Confidence;
22 | 			this.Geometry = block.Geometry;
23 | 			this.Id = block.Id;
24 | 			this.RowIndex = block.RowIndex;
25 | 			this.RowSpan = block.RowSpan;
26 | 
27 | 			if(block.Relationships == null)
28 | 				return;
29 | 			foreach(var relationship in block.Relationships) {
30 | 				if(!(relationship.Type == "CHILD") || relationship.Ids == null)
31 | 					continue;
32 | 				foreach(var id in relationship.Ids) {
33 | 					Block child;
34 | 					// The indexer throws for an id missing from the map; TryGetValue lets such ids be skipped.
35 | 					if(!blocks.TryGetValue(id, out child) || child == null)
36 | 						continue;
37 | 					if(child.BlockType == "WORD") {
38 | 						var word = new Word(child, blocks);
39 | 						this.Content.Add(word);
40 | 						this.Text = this.Text + word.Text + " ";
41 | 					} else if(child.BlockType == "SELECTION_ELEMENT") {
42 | 						var selection = new SelectionElement(child, blocks);
43 | 						this.Content.Add(selection);
44 | 						this.Text = this.Text + selection.SelectionStatus + ", ";
45 | 					}
46 | 				}
47 | 			}
48 | 		}
49 | 		public int RowIndex { get; set; }
50 | 		public int RowSpan { get; set; }
51 | 		public int ColumnIndex { get; set; }
52 | 		public int ColumnSpan { get; set; }
53 | 		public List<dynamic> Content { get; set; }
54 | 		public Block Block { get; set; }
55 | 		public float Confidence { get; set; }
56 | 		public Geometry Geometry { get; set; }
57 | 		public string Id { get; set; }
58 | 		public string Text { get; set; }
59 | 
60 | 		/// <summary>Returns the accumulated cell text.</summary>
61 | 		public override string ToString() {
62 | 			return this.Text;
63 | 		}
64 | 	}
65 | }


--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/Field.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using System.Collections.Generic;
 3 | using Amazon.Textract.Model;
 4 | 
 5 | namespace Amazon.Textract.Model {
 6 | 
 7 | 	/// <summary>
 8 | 	/// Form field built from a block and the id-to-block map: the block's CHILD
 9 | 	/// relationship produces the Key, and its VALUE relationship leads to value blocks
10 | 	/// whose own CHILD relationship produces the Value.
11 | 	/// </summary>
12 | 	public class Field {
13 | 		/// <param name="block">Block to read; when null, Key and Value stay null.</param>
14 | 		/// <param name="blocks">Map from block id to block, used to resolve VALUE ids.</param>
15 | 		public Field(Block block, Dictionary<string, Block> blocks) {
16 | 			if(block == null || block.Relationships == null)
17 | 				return;
18 | 			foreach(var relationship in block.Relationships) {
19 | 				if(relationship.Type == "CHILD") {
20 | 					this.Key = new FieldKey(block, relationship.Ids, blocks);
21 | 				} else if(relationship.Type == "VALUE" && relationship.Ids != null) {
22 | 					foreach(var id in relationship.Ids) {
23 | 						this.ReadValue(id, blocks);
24 | 					}
25 | 				}
26 | 			}
27 | 		}
28 | 
29 | 		// Resolves one VALUE id and, when it names a VALUE entity, builds Value from its CHILD ids.
30 | 		private void ReadValue(string id, Dictionary<string, Block> blocks) {
31 | 			Block valueBlock;
32 | 			// The indexer throws for an id missing from the map; TryGetValue lets such ids be skipped.
33 | 			if(!blocks.TryGetValue(id, out valueBlock) || valueBlock == null)
34 | 				return;
35 | 			if(valueBlock.EntityTypes == null || !valueBlock.EntityTypes.Contains("VALUE"))
36 | 				return;
37 | 			if(valueBlock.Relationships == null)
38 | 				return;
39 | 			foreach(var valueRelationship in valueBlock.Relationships) {
40 | 				if(valueRelationship.Type == "CHILD") {
41 | 					this.Value = new FieldValue(valueBlock, valueRelationship.Ids, blocks);
42 | 				}
43 | 			}
44 | 		}
45 | 
46 | 		public FieldKey Key { get; set; }
47 | 		public FieldValue Value { get; set; }
48 | 
49 | 		/// <summary>Renders key and value; a missing key or value is shown as an empty string.</summary>
50 | 		public override string ToString() {
51 | 			var k = this.Key == null ? string.Empty : this.Key.ToString();
52 | 			var v = this.Value == null ? string.Empty : this.Value.ToString();
53 | 			return string.Format(@"
54 |                 {0}Field{0}===={0}
55 |                 Key: {1}, Value: {2}
56 |             ", Environment.NewLine, k, v);
57 | 		}
58 | 	}
59 | }


--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/FieldKey.cs:
--------------------------------------------------------------------------------
 1 | /*
 2 | 
 3 | class FieldKey:
 4 |     def __init__(self, block, children, blockMap):
 5 |         self._block = block
 6 |         self._confidence = block['Confidence']
 7 |         self._geometry = Geometry(block['Geometry'])
 8 |         self._id = block['Id']
 9 |         self._text = ""
10 |         self._content = []
11 | 
12 |         t = []
13 | 
14 |         for eid in children:
15 |             wb = blockMap[eid]
16 |             if(wb['BlockType'] == "WORD"):
17 |                 w = Word(wb, blockMap)
18 |                 self._content.append(w)
19 |                 t.append(w.text)
20 | 
21 |         if(t):
22 |             self._text = ' '.join(t)
23 | 
24 |     def __str__(self):
25 |         return self._text
26 | 
27 |     @property
28 |     def confidence(self):
29 |         return self._confidence
30 | 
31 |     @property
32 |     def geometry(self):
33 |         return self._geometry
34 | 
35 |     @property
36 |     def id(self):
37 |         return self._id
38 | 
39 |     @property
40 |     def content(self):
41 |         return self._content
42 | 
43 |     @property
44 |     def text(self):
45 |         return self._text
46 | 
47 |     @property
48 |     def block(self):
49 |         return self._block
50 | 
51 |  */
52 | 
53 | using System.Collections.Generic;
54 | using Amazon.Textract.Model;
55 | 
56 | namespace Amazon.Textract.Model {
57 | 	/// <summary>
58 | 	/// Key part of a form field: keeps the source block with its confidence, geometry
59 | 	/// and id, and joins the text of the WORD children into Text.
60 | 	/// </summary>
61 | 	public class FieldKey {
62 | 		/// <param name="block">Block the key is built from.</param>
63 | 		/// <param name="children">Ids of the child blocks; may be null or empty.</param>
64 | 		/// <param name="blocks">Map from block id to block, used to resolve the child ids.</param>
65 | 		public FieldKey(Block block, List<string> children, Dictionary<string, Block> blocks) {
66 | 			this.Block = block;
67 | 			this.Confidence = block.Confidence;
68 | 			this.Geometry = block.Geometry;
69 | 			this.Id = block.Id;
70 | 			this.Content = new List<dynamic>();
71 | 
72 | 			var parts = new List<string>();
73 | 			if(children != null) {
74 | 				foreach(var childId in children) {
75 | 					var child = blocks[childId];
76 | 					if(child.BlockType == "WORD") {
77 | 						var word = new Word(child, blocks);
78 | 						this.Content.Add(word);
79 | 						parts.Add(word.Text);
80 | 					}
81 | 				}
82 | 			}
83 | 
84 | 			// Joining an empty list gives the empty string, so no separate count check is needed.
85 | 			this.Text = string.Join(" ", parts);
86 | 		}
87 | 		public List<dynamic> Content { get; set; }
88 | 		public Block Block { get; set; }
89 | 		public float Confidence { get; set; }
90 | 		public Geometry Geometry { get; set; }
91 | 		public string Id { get; set; }
92 | 		public string Text { get; set; }
93 | 
94 | 		/// <summary>Returns the key text.</summary>
95 | 		public override string ToString() => this.Text;
96 | 	}
97 | }


--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/FieldValue.cs:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | /*
  4 | 
  5 | class FieldValue:
  6 |     def __init__(self, block, children, blockMap):
  7 |         self._block = block
  8 |         self._confidence = block['Confidence']
  9 |         self._geometry = Geometry(block['Geometry'])
 10 |         self._id = block['Id']
 11 |         self._text = ""
 12 |         self._content = []
 13 | 
 14 |         t = []
 15 | 
 16 |         for eid in children:
 17 |             wb = blockMap[eid]
 18 |             if(wb['BlockType'] == "WORD"):
 19 |                 w = Word(wb, blockMap)
 20 |                 self._content.append(w)
 21 |                 t.append(w.text)
 22 |             elif(wb['BlockType'] == "SELECTION_ELEMENT"):
 23 |                 se = SelectionElement(wb, blockMap)
 24 |                 self._content.append(se)
 25 |                 self._text = se.selectionStatus
 26 | 
 27 |         if(t):
 28 |             self._text = ' '.join(t)
 29 | 
 30 |     def __str__(self):
 31 |         return self._text
 32 | 
 33 |     @property
 34 |     def confidence(self):
 35 |         return self._confidence
 36 | 
 37 |     @property
 38 |     def geometry(self):
 39 |         return self._geometry
 40 | 
 41 |     @property
 42 |     def id(self):
 43 |         return self._id
 44 | 
 45 |     @property
 46 |     def content(self):
 47 |         return self._content
 48 | 
 49 |     @property
 50 |     def text(self):
 51 |         return self._text
 52 |     
 53 |     @property
 54 |     def block(self):
 55 |         return self._block
 56 | 
 57 |  */
 58 | 
 59 | using System.Collections.Generic;
 60 | using Amazon.Textract.Model;
 61 | 
 62 | namespace Amazon.Textract.Model {
 63 | 	/// <summary>
 64 | 	/// Value part of a form field: keeps the source block with its confidence, geometry
 65 | 	/// and id, and joins the text of WORD children and the status of SELECTION_ELEMENT
 66 | 	/// children into Text.
 67 | 	/// </summary>
 68 | 	public class FieldValue {
 69 | 		/// <param name="block">Block the value is built from.</param>
 70 | 		/// <param name="children">Ids of the child blocks; may be null or empty.</param>
 71 | 		/// <param name="blocks">Map from block id to block, used to resolve the child ids.</param>
 72 | 		public FieldValue(Block block, List<string> children, Dictionary<string, Block> blocks) {
 73 | 			this.Block = block;
 74 | 			this.Confidence = block.Confidence;
 75 | 			this.Geometry = block.Geometry;
 76 | 			this.Id = block.Id;
 77 | 			this.Content = new List<dynamic>();
 78 | 
 79 | 			var parts = new List<string>();
 80 | 			if(children != null) {
 81 | 				foreach(var childId in children) {
 82 | 					var child = blocks[childId];
 83 | 					if(child.BlockType == "WORD") {
 84 | 						var word = new Word(child, blocks);
 85 | 						this.Content.Add(word);
 86 | 						parts.Add(word.Text);
 87 | 					} else if(child.BlockType == "SELECTION_ELEMENT") {
 88 | 						var selection = new SelectionElement(child, blocks);
 89 | 						this.Content.Add(selection);
 90 | 						parts.Add(selection.SelectionStatus);
 91 | 					}
 92 | 				}
 93 | 			}
 94 | 
 95 | 			// Joining an empty list gives the empty string, so no separate count check is needed.
 96 | 			this.Text = string.Join(" ", parts);
 97 | 		}
 98 | 		public List<dynamic> Content { get; set; }
 99 | 		public Block Block { get; set; }
100 | 		public float Confidence { get; set; }
101 | 		public Geometry Geometry { get; set; }
102 | 		public string Id { get; set; }
103 | 		public string Text { get; set; }
104 | 
105 | 		/// <summary>Returns the value text.</summary>
106 | 		public override string ToString() => this.Text;
107 | 	}
108 | }


--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/Form.cs:
--------------------------------------------------------------------------------
 1 | using System.Collections.Generic;
 2 | 
 3 | namespace Amazon.Textract.Model {
 4 | 
 5 | 	/// <summary>
 6 | 	/// The form fields found on a page, kept both in insertion order and indexed by key text.
 7 | 	/// </summary>
 8 | 	public class Form {
 9 | 		/// <summary>All fields in the order they were added.</summary>
10 | 		public List<Field> Fields { get; set; }
11 | 		// Key text -> field; when several fields share a key text the last one added wins.
12 | 		private Dictionary<string, Field> fieldMap;
13 | 
14 | 		public Form() {
15 | 			this.Fields = new List<Field>();
16 | 			this.fieldMap = new Dictionary<string, Field>();
17 | 		}
18 | 
19 | 		/// <summary>
20 | 		/// Adds a field to the form.
21 | 		/// </summary>
22 | 		/// <param name="field">Field to add; its Key must not be null.</param>
23 | 		public void AddField(Field field) {
24 | 			this.Fields.Add(field);
25 | 			// Use the indexer rather than Add: forms routinely repeat a key text, and
26 | 			// Dictionary.Add would throw ArgumentException on the second occurrence.
27 | 			this.fieldMap[field.Key.ToString()] = field;
28 | 		}
29 | 
30 | 		/// <summary>
31 | 		/// Looks up a field by its exact key text.
32 | 		/// </summary>
33 | 		/// <param name="key">Exact key text to look up.</param>
34 | 		/// <returns>The most recently added field with that key text, or null if there is none.</returns>
35 | 		public Field GetFieldByKey(string key) {
36 | 			if(key == null) {
37 | 				return null;
38 | 			}
39 | 			Field field;
40 | 			return this.fieldMap.TryGetValue(key, out field) ? field : null;
41 | 		}
42 | 
43 | 		/// <summary>
44 | 		/// Finds every field whose key text contains <paramref name="key"/>, ignoring case.
45 | 		/// </summary>
46 | 		/// <param name="key">Substring to search for; null matches nothing.</param>
47 | 		/// <returns>Matching fields in insertion order; empty when nothing matches.</returns>
48 | 		public List<Field> SearchFieldsByKey(string key) {
49 | 			if(key == null) {
50 | 				return new List<Field>();
51 | 			}
52 | 			// Lower-case the search term once instead of once per field.
53 | 			var needle = key.ToLower();
54 | 			return this.Fields.FindAll(f => f.Key.ToString().ToLower().Contains(needle));
55 | 		}
56 | 
57 | 		/// <summary>Returns the string form of each field, one per line.</summary>
58 | 		public override string ToString() {
59 | 			return string.Join("\n", this.Fields);
60 | 		}
61 | 	}
62 | }


--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/Line.cs:
--------------------------------------------------------------------------------
  1 | using System;
  2 | using System.Collections.Generic;
  3 | using Amazon.Textract.Model;
  4 | 
  5 | /*
  6 | class Line:
  7 |     def __init__(self, block, blockMap):
  8 | 
  9 |         self._block = block
 10 |         self._confidence = block['Confidence']
 11 |         self._geometry = Geometry(block['Geometry'])
 12 |         self._id = block['Id']
 13 | 
 14 |         self._text = ""
 15 |         if(block['Text']):
 16 |             self._text = block['Text']
 17 | 
 18 |         self._words = []
 19 |         if('Relationships' in block and block['Relationships']):
 20 |             for rs in block['Relationships']:
 21 |                 if(rs['Type'] == 'CHILD'):
 22 |                     for cid in rs['Ids']:
 23 |                         if(blockMap[cid]["BlockType"] == "WORD"):
 24 |                             self._words.append(Word(blockMap[cid], blockMap))
 25 |     def __str__(self):
 26 |         s = "Line\n==========\n"
 27 |         s = s + self._text + "\n"
 28 |         s = s + "Words\n----------\n"
 29 |         for word in self._words:
 30 |             s = s + "[{}]".format(str(word))
 31 |         return s
 32 | 
 33 |     @property
 34 |     def confidence(self):
 35 |         return self._confidence
 36 | 
 37 |     @property
 38 |     def geometry(self):
 39 |         return self._geometry
 40 | 
 41 |     @property
 42 |     def id(self):
 43 |         return self._id
 44 | 
 45 |     @property
 46 |     def words(self):
 47 |         return self._words
 48 | 
 49 |     @property
 50 |     def text(self):
 51 |         return self._text
 52 | 
 53 |     @property
 54 |     def block(self):
 55 |         return self._block
 56 |  */
 57 | 
 58 | namespace Amazon.Textract.Model {
 59 | 	/// <summary>
 60 | 	/// A LINE block together with the WORD blocks it references as children.
 61 | 	/// </summary>
 62 | 	public class Line {
 63 | 		/// <summary>
 64 | 		/// Builds the line from <paramref name="block"/> and resolves its CHILD relationships
 65 | 		/// through <paramref name="blocks"/>.
 66 | 		/// </summary>
 67 | 		/// <param name="block">Block describing the line; must not be null.</param>
 68 | 		/// <param name="blocks">Block lookup by id; every child id is looked up in it.</param>
 69 | 		public Line(Block block, Dictionary<string, Block> blocks) {
 70 | 			this.Block = block;
 71 | 			this.Confidence = block.Confidence;
 72 | 			this.Geometry = block.Geometry;
 73 | 			this.Id = block.Id;
 74 | 			// block was already dereferenced above, so the value that can be missing is its Text.
 75 | 			this.Text = block.Text ?? string.Empty;
 76 | 			this.Words = new List<Word>();
 77 | 
 78 | 			var relationships = block.Relationships;
 79 | 			if(relationships != null && relationships.Count > 0) {
 80 | 				foreach(var relationship in relationships) {
 81 | 					if(relationship.Type == "CHILD") {
 82 | 						foreach(var childId in relationship.Ids) {
 83 | 							// Named "child" so it does not shadow the "block" parameter (CS0136 before C# 8).
 84 | 							var child = blocks[childId];
 85 | 							// Only WORD children become words; other block types are skipped
 86 | 							// instead of being stored as empty placeholder words.
 87 | 							if(child.BlockType == "WORD") {
 88 | 								this.Words.Add(new Word(child, blocks));
 89 | 							}
 90 | 						}
 91 | 					}
 92 | 				}
 93 | 			}
 94 | 		}
 95 | 
 96 | 		public float Confidence { get; set; }
 97 | 		public Geometry Geometry { get; set; }
 98 | 		public string Id { get; set; }
 99 | 		/// <summary>WORD children of the line, in relationship order.</summary>
100 | 		public List<Word> Words { get; set; }
101 | 		/// <summary>Text of the line block; empty when the block carries no text.</summary>
102 | 		public string Text { get; set; }
103 | 		public Block Block { get; set; }
104 | 
105 | 		/// <summary>Returns the line text followed by its words, comma separated.</summary>
106 | 		public override string ToString() {
107 | 			return string.Format(@"
108 |                 Line{0}===={0}
109 |                 {1} {0}
110 |                 Words{0}----{0}
111 |                 {2}{0}
112 |                 ----
113 |             ", Environment.NewLine, this.Text, string.Join(", ", this.Words));
114 | 		}
115 | 	}
116 | }


--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/NewBoundingBox.cs:
--------------------------------------------------------------------------------
 1 | using Amazon.Textract.Model;
 2 | 
 3 | namespace Amazon.Textract.Model {
 4 | 	/// <summary>
 5 | 	/// BoundingBox that is built from its four values and prints them in ToString.
 6 | 	/// </summary>
 7 | 	public class NewBoundingBox : BoundingBox {
 8 | 		/// <summary>Creates a box with the given width, height, left and top values.</summary>
 9 | 		public NewBoundingBox(float width, float height, float left, float top) {
10 | 			Width = width;
11 | 			Height = height;
12 | 			Left = left;
13 | 			Top = top;
14 | 		}
15 | 
16 | 		/// <summary>Returns the four values as "width: .., height: .., left: .., top: ..".</summary>
17 | 		public override string ToString() {
18 | 			return $"width: {Width}, height: {Height}, left: {Left}, top: {Top}";
19 | 		}
20 | 	}
21 | }


--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/NewGeometry.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System;
 2 | using System.Collections.Generic;
 3 | using Amazon.Textract.Model;
 4 | 
 5 | namespace Amazon.Textract.Model {
 6 | 	/// <summary>
 7 | 	/// Geometry holding its own copies of another geometry's bounding box and polygon points.
 8 | 	/// </summary>
 9 | 	public class NewGeometry : Geometry {
10 | 
11 | 		/// <summary>
12 | 		/// Copies the bounding box (as a NewBoundingBox) and every polygon point of
13 | 		/// <paramref name="geometry"/>.
14 | 		/// </summary>
15 | 		/// <param name="geometry">Geometry to copy; its BoundingBox and Polygon must not be null.</param>
16 | 		public NewGeometry(Geometry geometry) {
17 | 			var sourceBox = geometry.BoundingBox;
18 | 			var sourcePoints = geometry.Polygon;
19 | 
20 | 			var boxCopy = new NewBoundingBox(sourceBox.Width, sourceBox.Height, sourceBox.Left, sourceBox.Top);
21 | 			// New Point instances, so the copy does not share points with the source polygon.
22 | 			var pointCopies = sourcePoints.ConvertAll(p => new Point { X = p.X, Y = p.Y });
23 | 
24 | 			BoundingBox = boxCopy;
25 | 			Polygon = pointCopies;
26 | 		}
27 | 
28 | 		/// <summary>Returns the bounding box text followed by a newline.</summary>
29 | 		public override string ToString() {
30 | 			return $"BoundingBox: {BoundingBox}{Environment.NewLine}";
31 | 		}
32 | 	}
33 | }


--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/Page.cs:
--------------------------------------------------------------------------------
  1 | using System;
  2 | using System.Collections.Generic;
  3 | using Amazon.Textract.Model;
  4 | 
  5 | namespace Amazon.Textract.Model {
  6 | 
  7 | 	/// <summary>
  8 | 	/// One page of a document: the lines, tables and form fields built from the page's blocks.
  9 | 	/// </summary>
 10 | 	public class Page {
 11 | 		/// <summary>
 12 | 		/// Builds the page content from its blocks.
 13 | 		/// </summary>
 14 | 		/// <param name="blocks">Blocks belonging to this page, in document order.</param>
 15 | 		/// <param name="blockMap">Block lookup by id, passed on to lines, tables and fields.</param>
 16 | 		public Page(List<Block> blocks, Dictionary<string, Block> blockMap) {
 17 | 			this.Blocks = blocks;
 18 | 			this.Text = string.Empty;
 19 | 			this.Lines = new List<Line>();
 20 | 			this.Form = new Form();
 21 | 			this.Tables = new List<Table>();
 22 | 			this.Content = new List<dynamic>();
 23 | 
 24 | 			foreach(var b in blocks) {
 25 | 				if(b.BlockType == "PAGE") {
 26 | 					this.Geometry = new NewGeometry(b.Geometry);
 27 | 					this.Id = b.Id;
 28 | 				} else if(b.BlockType == "LINE") {
 29 | 					var l = new Line(b, blockMap);
 30 | 					this.Lines.Add(l);
 31 | 					this.Content.Add(l);
 32 | 					this.Text = this.Text + l.Text + Environment.NewLine;
 33 | 				} else if(b.BlockType == "TABLE") {
 34 | 					var t = new Table(b, blockMap);
 35 | 					this.Tables.Add(t);
 36 | 					this.Content.Add(t);
 37 | 				} else if(b.BlockType == "KEY_VALUE_SET") {
 38 | 					// Only KEY entries start a field; a field without a key is dropped.
 39 | 					if(b.EntityTypes.Contains("KEY")) {
 40 | 						var f = new Field(b, blockMap);
 41 | 						if(f.Key != null) {
 42 | 							this.Form.AddField(f);
 43 | 							this.Content.Add(f);
 44 | 						}
 45 | 					}
 46 | 				}
 47 | 			}
 48 | 		}
 49 | 
 50 | 		/// <summary>
 51 | 		/// Assigns every line to a column by horizontal overlap and returns the lines grouped
 52 | 		/// by column, keeping the original line order inside each column.
 53 | 		/// </summary>
 54 | 		/// <remarks>
 55 | 		/// Columns are numbered in the order they are first encountered. A line joins the first
 56 | 		/// column whose span contains the line's centre, or whose centre lies inside the line;
 57 | 		/// otherwise it starts a new column.
 58 | 		/// </remarks>
 59 | 		/// <returns>Line texts tagged with their column index, ordered by column index.</returns>
 60 | 		public List<IndexedText> GetLinesInReadingOrder() {
 61 | 			var columns = new List<Column>();
 62 | 			var tagged = new List<IndexedText>();
 63 | 
 64 | 			foreach(var line in this.Lines) {
 65 | 				var bb = line.Geometry.BoundingBox;
 66 | 				var bbLeft = bb.Left;
 67 | 				var bbRight = bb.Left + bb.Width;
 68 | 				var bbCentre = bb.Left + (bb.Width / 2);
 69 | 
 70 | 				var columnIndex = -1;
 71 | 				for(var index = 0; index < columns.Count; index++) {
 72 | 					var column = columns[index];
 73 | 					// Left and Right are both absolute coordinates, so the centre is their midpoint
 74 | 					// (not Left + Right / 2, which lands outside the column).
 75 | 					var columnCentre = column.Left + ((column.Right - column.Left) / 2);
 76 | 
 77 | 					if((bbCentre > column.Left && bbCentre < column.Right) || (columnCentre > bbLeft && columnCentre < bbRight)) {
 78 | 						columnIndex = index;
 79 | 						break;
 80 | 					}
 81 | 				}
 82 | 				if(columnIndex < 0) {
 83 | 					columns.Add(new Column { Left = bbLeft, Right = bbRight });
 84 | 					columnIndex = columns.Count - 1;
 85 | 				}
 86 | 				tagged.Add(new IndexedText { ColumnIndex = columnIndex, Text = line.Text });
 87 | 			}
 88 | 
 89 | 			// Emit column by column. FindAll keeps the original order, so this is a stable
 90 | 			// ordering by column index (List.Sort would not preserve line order within a column).
 91 | 			var ordered = new List<IndexedText>(tagged.Count);
 92 | 			for(var index = 0; index < columns.Count; index++) {
 93 | 				var wanted = index;
 94 | 				ordered.AddRange(tagged.FindAll(t => t.ColumnIndex == wanted));
 95 | 			}
 96 | 			return ordered;
 97 | 		}
 98 | 
 99 | 		/// <summary>
100 | 		/// Returns the text of <see cref="GetLinesInReadingOrder"/> with a "\n" after every line.
101 | 		/// </summary>
102 | 		public string GetTextInReadingOrder() {
103 | 			var text = string.Empty;
104 | 			foreach(var line in this.GetLinesInReadingOrder()) {
105 | 				text = text + line.Text + "\n";
106 | 			}
107 | 			return text;
108 | 		}
109 | 
110 | 
111 | 		public List<Block> Blocks { get; set; }
112 | 		/// <summary>Text of all lines in block order, each followed by Environment.NewLine.</summary>
113 | 		public string Text { get; set; }
114 | 		public List<Line> Lines { get; set; }
115 | 		public Form Form { get; set; }
116 | 		public List<Table> Tables { get; set; }
117 | 		/// <summary>Lines, tables and fields in the order their blocks appeared.</summary>
118 | 		public List<dynamic> Content { get; set; }
119 | 		public Geometry Geometry { get; set; }
120 | 		public string Id { get; set; }
121 | 
122 | 		/// <summary>Returns a "Page" header followed by the string form of every content item.</summary>
123 | 		public override string ToString() {
124 | 			var result = new List<string>();
125 | 			result.Add(string.Format("Page{0}===={0}", Environment.NewLine));
126 | 			this.Content.ForEach(c => {
127 | 				result.Add($"{Environment.NewLine}{c}");
128 | 			});
129 | 			return string.Join("", result);
130 | 		}
131 | 
132 | 		/// <summary>Horizontal span of a detected text column; both edges are absolute coordinates.</summary>
133 | 		public class Column {
134 | 			public float Left { get; set; }
135 | 			public float Right { get; set; }
136 | 
137 | 			public override string ToString() {
138 | 				return string.Format("Left: {0}, Right :{1}", this.Left, this.Right);
139 | 			}
140 | 		}
141 | 
142 | 		/// <summary>Line text tagged with the index of the column it was assigned to.</summary>
143 | 		public class IndexedText {
144 | 			public int ColumnIndex { get; set; }
145 | 			public string Text { get; set; }
146 | 
147 | 			public override string ToString() {
148 | 				return string.Format("[{0}] {1}", this.ColumnIndex, this.Text);
149 | 			}
150 | 		}
151 | 	}
152 | }
153 | 


--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/Row.cs:
--------------------------------------------------------------------------------
 1 | using System.Collections.Generic;
 2 | 
 3 | namespace Amazon.Textract.Model {
 4 | 	/// <summary>
 5 | 	/// A list of cells that prints each cell wrapped in square brackets.
 6 | 	/// </summary>
 7 | 	public class Row {
 8 | 		/// <summary>Creates a row with an empty cell list.</summary>
 9 | 		public Row() {
10 | 			Cells = new List<Cell>();
11 | 		}
12 | 		public List<Cell> Cells { get; set; }
13 | 
14 | 		/// <summary>Returns "[cell]" for every cell, concatenated without a separator.</summary>
15 | 		public override string ToString() {
16 | 			var bracketed = Cells.ConvertAll(cell => string.Format("[{0}]", cell));
17 | 			return string.Join("", bracketed);
18 | 		}
19 | 	}
20 | }

--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/SelectionElement.cs:
--------------------------------------------------------------------------------
 1 | /*
 2 | class SelectionElement:
 3 |     def __init__(self, block, blockMap):
 4 |         self._confidence = block['Confidence']
 5 |         self._geometry = Geometry(block['Geometry'])
 6 |         self._id = block['Id']
 7 |         self._selectionStatus = block['SelectionStatus']
 8 | 
 9 |     @property
10 |     def confidence(self):
11 |         return self._confidence
12 | 
13 |     @property
14 |     def geometry(self):
15 |         return self._geometry
16 | 
17 |     @property
18 |     def id(self):
19 |         return self._id
20 | 
21 |     @property
22 |     def selectionStatus(self):
23 |         return self._selectionStatus
24 |  */
25 | 
26 | using System.Collections.Generic;
27 | using Amazon.Textract.Model;
28 | 
29 | namespace Amazon.Textract.Model {
30 | 	/// <summary>
31 | 	/// Confidence, geometry, id and selection status copied from a block.
32 | 	/// </summary>
33 | 	public class SelectionElement {
34 | 		/// <summary>
35 | 		/// Copies the element's values from <paramref name="block"/>.
36 | 		/// </summary>
37 | 		/// <param name="block">Block to copy from; must not be null.</param>
38 | 		/// <param name="blocks">Block lookup by id; not used by this constructor.</param>
39 | 		public SelectionElement(Block block, Dictionary<string, Block> blocks) {
40 | 			Id = block.Id;
41 | 			Confidence = block.Confidence;
42 | 			Geometry = block.Geometry;
43 | 			SelectionStatus = block.SelectionStatus;
44 | 		}
45 | 
46 | 		public float Confidence { get; set; }
47 | 		public Geometry Geometry { get; set; }
48 | 		public string Id { get; set; }
49 | 		/// <summary>The block's SelectionStatus value as a string.</summary>
50 | 		public string SelectionStatus { get; set; }
51 | 	}
52 | }


--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/Table.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using System.Collections.Generic;
 3 | using Amazon.Textract.Model;
 4 | 
 5 | namespace Amazon.Textract.Model {
 6 | 	/// <summary>
 7 | 	/// A TABLE block with its child cells grouped into rows.
 8 | 	/// </summary>
 9 | 	public class Table {
10 | 		/// <summary>
11 | 		/// Builds the table and groups the cells referenced by its CHILD relationships into rows.
12 | 		/// </summary>
13 | 		/// <remarks>
14 | 		/// A new row starts whenever a cell's RowIndex is greater than the current row index,
15 | 		/// so the grouping relies on the child ids being listed row by row.
16 | 		/// </remarks>
17 | 		/// <param name="block">Block describing the table; must not be null.</param>
18 | 		/// <param name="blocks">Block lookup by id; every child id is looked up in it.</param>
19 | 		public Table(Block block, Dictionary<string, Block> blocks) {
20 | 			this.Block = block;
21 | 			this.Confidence = block.Confidence;
22 | 			this.Geometry = block.Geometry;
23 | 			this.Id = block.Id;
24 | 			this.Rows = new List<Row>();
25 | 
26 | 			var relationships = block.Relationships;
27 | 			if(relationships == null || relationships.Count == 0) {
28 | 				return;
29 | 			}
30 | 
31 | 			var currentRowIndex = 1;
32 | 			var currentRow = new Row();
33 | 			foreach(var relationship in relationships) {
34 | 				if(relationship.Type == "CHILD") {
35 | 					foreach(var id in relationship.Ids) {
36 | 						var cell = new Cell(blocks[id], blocks);
37 | 						if(cell.RowIndex > currentRowIndex) {
38 | 							this.Rows.Add(currentRow);
39 | 							currentRow = new Row();
40 | 							currentRowIndex = cell.RowIndex;
41 | 						}
42 | 						currentRow.Cells.Add(cell);
43 | 					}
44 | 				}
45 | 			}
46 | 
47 | 			// Flush the last row once, after every relationship has been read. Doing this inside
48 | 			// the loop would add the same Row instance again for each further CHILD relationship.
49 | 			if(currentRow.Cells.Count > 0) {
50 | 				this.Rows.Add(currentRow);
51 | 			}
52 | 		}
53 | 
54 | 		/// <summary>Rows in the order they were encountered.</summary>
55 | 		public List<Row> Rows { get; set; }
56 | 		public Block Block { get; set; }
57 | 		public float Confidence { get; set; }
58 | 		public Geometry Geometry { get; set; }
59 | 		public string Id { get; set; }
60 | 
61 | 		/// <summary>Returns a "Table" header followed by one "Row" section per row.</summary>
62 | 		public override string ToString() {
63 | 			var result = new List<string>();
64 | 			result.Add(string.Format("Table{0}===={0}", Environment.NewLine));
65 | 			foreach(var r in this.Rows) {
66 | 				result.Add(string.Format("Row{0}===={0}{1}{0}", Environment.NewLine, r));
67 | 			}
68 | 			return string.Join("", result);
69 | 		}
70 | 	}
71 | }


--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/TextractDocument.cs:
--------------------------------------------------------------------------------
 1 | ﻿using System.Collections.Generic;
 2 | using Amazon.Textract.Model;
 3 | 
 4 | namespace Amazon.Textract.Model {
 5 | 	/// <summary>
 6 | 	/// Splits the blocks of a document analysis response into pages and builds a Page for each.
 7 | 	/// </summary>
 8 | 	public class TextractDocument {
 9 | 		private Dictionary<string, Block> blockMap = new Dictionary<string, Block>();
10 | 		private List<List<Block>> documentPages = new List<List<Block>>();
11 | 
12 | 		/// <summary>
13 | 		/// Indexes the blocks of <paramref name="response"/> by id, groups them per page and
14 | 		/// parses every group into a <see cref="Page"/>.
15 | 		/// </summary>
16 | 		/// <param name="response">Response whose Blocks are parsed.</param>
17 | 		public TextractDocument(GetDocumentAnalysisResponse response) {
18 | 			this.Pages = new List<Page>();
19 | 			this.ResponsePages = new List<GetDocumentAnalysisResponse> { response };
20 | 			this.ParseDocumentPagesAndBlockMap();
21 | 			this.Parse();
22 | 		}
23 | 
24 | 		// Fills blockMap with every block and cuts the block stream into per-page lists:
25 | 		// each PAGE block opens a new list, and blocks seen before any PAGE block are
26 | 		// collected into a list of their own.
27 | 		private void ParseDocumentPagesAndBlockMap() {
28 | 			List<Block> current = null;
29 | 			foreach(var response in this.ResponsePages) {
30 | 				foreach(var block in response.Blocks) {
31 | 					this.blockMap.Add(block.Id, block);
32 | 
33 | 					var opensPage = block.BlockType == "PAGE";
34 | 					if(opensPage && current != null) {
35 | 						// Close the page collected so far before starting the next one.
36 | 						this.documentPages.Add(current);
37 | 					}
38 | 					if(opensPage || current == null) {
39 | 						current = new List<Block>();
40 | 					}
41 | 					current.Add(block);
42 | 				}
43 | 			}
44 | 
45 | 			// The last page is still open when the blocks run out.
46 | 			if(current != null) {
47 | 				this.documentPages.Add(current);
48 | 			}
49 | 		}
50 | 
51 | 		// Turns every per-page block list into a Page, in order.
52 | 		private void Parse() {
53 | 			foreach(var pageBlocks in this.documentPages) {
54 | 				this.Pages.Add(new Page(pageBlocks, this.blockMap));
55 | 			}
56 | 		}
57 | 
58 | 		/// <summary>Returns the block with the given id; the id must be present in the document.</summary>
59 | 		public Block GetBlockById(string blockId) {
60 | 			return this.blockMap[blockId];
61 | 		}
62 | 
63 | 		public List<GetDocumentAnalysisResponse> ResponsePages { get; set; }
64 | 		public List<Page> Pages { get; set; }
65 | 		/// <summary>The raw blocks of each page, in page order.</summary>
66 | 		public List<List<Block>> PageBlocks {
67 | 			get { return this.documentPages; }
68 | 		}
69 | 	}
70 | }
71 | 


--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/Word.cs:
--------------------------------------------------------------------------------
 1 | /*
 2 | class Word:
 3 |     def __init__(self, block, blockMap):
 4 |         self._block = block
 5 |         self._confidence = block['Confidence']
 6 |         self._geometry = Geometry(block['Geometry'])
 7 |         self._id = block['Id']
 8 |         self._text = ""
 9 |         if(block['Text']):
10 |             self._text = block['Text']
11 | 
12 |     def __str__(self):
13 |         return self._text
14 | 
15 |     @property
16 |     def confidence(self):
17 |         return self._confidence
18 | 
19 |     @property
20 |     def geometry(self):
21 |         return self._geometry
22 | 
23 |     @property
24 |     def id(self):
25 |         return self._id
26 | 
27 |     @property
28 |     def text(self):
29 |         return self._text
30 | 
31 |     @property
32 |     def block(self):
33 |         return self._block
34 |  */
35 | 
36 | using System.Collections.Generic;
37 | using Amazon.Textract.Model;
38 | 
39 | namespace Amazon.Textract.Model {
40 | 	/// <summary>
41 | 	/// Confidence, geometry, id and text copied from a block, with empty defaults for a null block.
42 | 	/// </summary>
43 | 	public class Word {
44 | 		/// <summary>
45 | 		/// Copies the word's values from <paramref name="block"/>.
46 | 		/// </summary>
47 | 		/// <param name="block">Block to copy from; null produces an empty placeholder word.</param>
48 | 		/// <param name="blocks">Block lookup by id; null is replaced by an empty dictionary.</param>
49 | 		public Word(Block block, Dictionary<string, Block> blocks) {
50 | 			Blocks = blocks ?? new Dictionary<string, Block>();
51 | 
52 | 			if(block == null) {
53 | 				// Placeholder: fresh empty objects, zero confidence and empty strings.
54 | 				this.Block = new Block();
55 | 				Confidence = 0;
56 | 				Geometry = new Geometry();
57 | 				Id = string.Empty;
58 | 				Text = string.Empty;
59 | 				return;
60 | 			}
61 | 
62 | 			this.Block = block;
63 | 			Confidence = block.Confidence;
64 | 			Geometry = block.Geometry;
65 | 			Id = block.Id;
66 | 			Text = block.Text;
67 | 		}
68 | 
69 | 		public Block Block { get; set; }
70 | 		public Dictionary<string, Block> Blocks { get; set; }
71 | 		public float Confidence { get; set; }
72 | 		public Geometry Geometry { get; set; }
73 | 		public string Id { get; set; }
74 | 		public string Text { get; set; }
75 | 
76 | 		/// <summary>Returns <see cref="Text"/>.</summary>
77 | 		public override string ToString() {
78 | 			return Text;
79 | 		}
80 | 	}
81 | }


--------------------------------------------------------------------------------
/src-csharp/appsettings.json:
--------------------------------------------------------------------------------
1 | {
2 |   "AWS": {
3 |     "Profile": "default",
4 |     "Region": "us-west-2"
5 |   }
6 | }
7 | 


--------------------------------------------------------------------------------
/src-csharp/dotnet-core.csproj:
--------------------------------------------------------------------------------
 1 | <Project Sdk="Microsoft.NET.Sdk">
 2 | 
 3 |   <PropertyGroup>
 4 |     <OutputType>Exe</OutputType>
 5 |     <TargetFramework>netcoreapp2.2</TargetFramework>
 6 |     <RootNamespace>Dotnet_Core</RootNamespace>
 7 |   </PropertyGroup>
 8 | 
 9 |   <ItemGroup>
10 |     <PackageReference Include="AWSSDK.Comprehend" Version="3.3.104.13" />
11 |     <PackageReference Include="AWSSDK.ComprehendMedical" Version="3.3.100.31" />
12 |     <PackageReference Include="AWSSDK.Extensions.NETCore.Setup" Version="3.3.100.1" />
13 |     <PackageReference Include="AWSSDK.S3" Version="3.3.102.12" />
14 |     <PackageReference Include="AWSSDK.Textract" Version="3.3.101.23" />
15 |     <PackageReference Include="AWSSDK.Translate" Version="3.3.100.28" />
16 |     <PackageReference Include="Microsoft.Extensions.Configuration" Version="2.2.0" />
17 |     <PackageReference Include="Microsoft.Extensions.Configuration.EnvironmentVariables" Version="2.2.4" />
18 |     <PackageReference Include="Microsoft.Extensions.Configuration.Json" Version="2.2.0" />
19 |     <PackageReference Include="NEST" Version="6.8.0" />
20 |     <PackageReference Include="System.Drawing.Common" Version="4.5.1" />
21 |   </ItemGroup>
22 | 
23 |   <ItemGroup>
24 |     <None Update="appsettings.json">
25 |       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
26 |     </None>
27 |   </ItemGroup>
28 | 
29 |   <ItemGroup>
30 |     <None Update="test-files\*">
31 |       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
32 |     </None>
33 |     
34 |   </ItemGroup>
35 |   
36 | 
37 | </Project>
38 | 


--------------------------------------------------------------------------------
/src-csharp/test-files/Amazon-Textract-Pdf.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/src-csharp/test-files/Amazon-Textract-Pdf.pdf


--------------------------------------------------------------------------------
/src-csharp/test-files/employmentapp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/src-csharp/test-files/employmentapp.png


--------------------------------------------------------------------------------
/src-csharp/test-files/expense.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/src-csharp/test-files/expense.png


--------------------------------------------------------------------------------
/src-csharp/test-files/medical-notes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/src-csharp/test-files/medical-notes.png


--------------------------------------------------------------------------------
/src-csharp/test-files/redacted-employmentapp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/src-csharp/test-files/redacted-employmentapp.png


--------------------------------------------------------------------------------
/src-csharp/test-files/simple-document-image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/src-csharp/test-files/simple-document-image.jpg


--------------------------------------------------------------------------------
/src-csharp/test-files/two-column-image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/src-csharp/test-files/two-column-image.jpg


--------------------------------------------------------------------------------