├── .DS_Store ├── .github └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── python ├── .DS_Store ├── 01-detect-text-local.py ├── 02-detect-text-s3.py ├── 03-reading-order.py ├── 04-nlp-comprehend.py ├── 05-nlp-medical.py ├── 06-translate.py ├── 07-search.py ├── 08-forms.py ├── 09-forms-redaction.py ├── 10-tables.py ├── 11-tables-expense.py ├── 12-pdf-text.py ├── 13-signature.py ├── Amazon-Textract-Pdf.pdf ├── Analyze_Lending_Sample.ipynb ├── OneKeyValue.png ├── OneLine.png ├── Textract-Analyze-ID.ipynb ├── Textract-MergeCell-Statement.pdf ├── Textract-Table-Merged-Cells-And-Headers.ipynb ├── Textract.ipynb ├── custom-queries │ ├── custom-queries-checks-blog.ipynb │ ├── samples │ │ ├── checks-annotations.zip │ │ └── checks-samples.zip │ └── screenshots │ │ ├── checks-notebook-step1.png │ │ ├── checks-notebook-step2.png │ │ ├── checks-notebook-step5_1.png │ │ ├── checks-notebook-step6.png │ │ ├── checks-notebook-step7.png │ │ └── checks-notebook-step8.png ├── employmentapp.png ├── expense.png ├── extraction-parsers │ ├── cms1500-parser.ipynb │ ├── samples │ │ ├── CMS1500-sample.png │ │ └── ub-04-Form-sample.png │ └── ub04-parser.ipynb ├── medical-notes.png ├── patient_intake_form_sample.jpg ├── queries │ ├── insurance-card.ipynb │ ├── insurance-card.png │ ├── mortgage-note.ipynb │ ├── mortgage-note.jpg │ ├── paystub-questions_full.csv │ ├── paystub-questions_subset.csv │ ├── paystub.ipynb │ ├── paystub.jpg │ ├── vaccination-card-s3-object.ipynb │ ├── vaccination-card.ipynb │ └── vaccination.png ├── simple-document-image.jpg ├── textract-textractor-tools.ipynb ├── two-column-image.jpg └── verification-of-employment.png └── src-csharp ├── .gitignore ├── ArgHandlers ├── DetectTextHandler.cs ├── DetectTextS3Handler.cs ├── FormsHandler.cs ├── FormsRedactionHandler.cs ├── NlpComprehendHandler.cs ├── NlpComprehendMedicalHandler.cs ├── PdfTextHandler.cs ├── ReadingOrderHandler.cs ├── SearchHandler.cs ├── 
TablesExpenseHandler.cs ├── TablesHandler.cs └── TranslateHandler.cs ├── Program.cs ├── Readme.md ├── Services ├── Column.cs ├── ComprehendService.cs ├── ElasticSearchService.cs ├── IndexedText.cs ├── TextractTextAnalysisService.cs ├── TextractTextDetectionService.cs └── TranslateService.cs ├── TextractExtensions ├── Cell.cs ├── Field.cs ├── FieldKey.cs ├── FieldValue.cs ├── Form.cs ├── Line.cs ├── NewBoundingBox.cs ├── NewGeometry.cs ├── Page.cs ├── Row.cs ├── SelectionElement.cs ├── Table.cs ├── TextractDocument.cs └── Word.cs ├── appsettings.json ├── dotnet-core.csproj └── test-files ├── Amazon-Textract-Pdf.pdf ├── employmentapp.png ├── expense.png ├── medical-notes.png ├── redacted-employmentapp.png ├── simple-document-image.jpg └── two-column-image.jpg /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/.DS_Store -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | 6 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | .python-version 3 | 4 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/aws-samples/amazon-textract-code-samples/issues), or [recently closed](https://github.com/aws-samples/amazon-textract-code-samples/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. 
You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws-samples/amazon-textract-code-samples/labels/help%20wanted) issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 
55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](https://github.com/aws-samples/amazon-textract-code-samples/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Amazon Textract Code Samples 2 | 3 | This repository contains example code snippets showing how Amazon Textract and other AWS services can be used to get insights from documents. 
4 | 5 | ## Usage 6 | 7 | python3 01-detect-text-local.py 8 | 9 | For examples that use S3 bucket, upload sample images to an S3 bucket and update variable "s3BucketName" in the example before running it. 10 | 11 | ## Python Samples 12 | 13 | | Argument | Description | 14 | | ----------------------------------------------------------- | ---------------------------------------------------------- | 15 | | [01-detect-text-local.py](./python/01-detect-text-local.py) | Example showing processing a document on local machine. | 16 | | [02-detect-text-s3.py](./python/02-detect-text-s3.py) | Example showing processing a document in Amazon S3 bucket. | 17 | | [03-reading-order.py](./python/03-reading-order.py) | Example showing printing document in reading order. | 18 | | [04-nlp-comprehend.py](./python/04-nlp-comprehend.py) | Example showing detecting entities and sentiment. | 19 | | [05-nlp-medical.py](./python/05-nlp-medical.py) | Example showing detecting medical entities. | 20 | | [06-translate.py](./python/06-translate.py) | Example showing translation of documents. | 21 | | [07-search.py](./python/07-search.py) | Example showing document indexing in Elasticsearch. | 22 | | [08-forms.py](./python/08-forms.py) | Example showing form (key/value) processing. | 23 | | [09-forms-redaction.py](./python/09-forms-redaction.py) | Example showing redacting information in document. | 24 | | [10-tables.py](./python/10-tables.py) | Example showing table processing. | 25 | | [11-tables-expense.py](./python/11-tables-expense.py) | Example showing validation of table data. | 26 | | [12-pdf-text.py](./python/12-pdf-text.py) | Example showing PDF document processing. 
| 27 | 28 | ## .NET Usage 29 | 30 | ``` 31 | Usage: dotnet run [--switch] 32 | To run this console app, use the following valid switches one at a time: 33 | --detect-text-local 34 | --detect-text-s3 35 | --pdf-text 36 | --forms 37 | --forms-redaction 38 | --tables 39 | --tables-expense 40 | --reading-order 41 | --nlp-comprehend 42 | --nlp-medical 43 | --translate 44 | --search 45 | e.g. dotnet run --detect-text-s3 46 | ``` 47 | 48 | ## .NET Samples 49 | 50 | Go to `src-csharp` folder for .NET samples 51 | 52 | | Argument | Description | 53 | | ------------------- | ---------------------------------------------------------- | 54 | | --detect-text-local | Example showing processing a document on local machine. | 55 | | --detect-text-s3 | Example showing processing a document in Amazon S3 bucket. | 56 | | --pdf-text | Example showing PDF document processing. | 57 | | --forms | Example showing form (key/value) processing. | 58 | | --forms-redaction | Example showing redacting information in document. | 59 | | --tables | Example showing table processing. | 60 | | --tables-expense | Example showing validation of table data. | 61 | | --reading-order | Example showing printing document in reading order. | 62 | | --nlp-comprehend | Example showing detecting entities and sentiment. | 63 | | --nlp-medical | Example showing detecting medical entities. | 64 | | --translate | Example showing translation of documents. | 65 | | --search | Example showing document indexing in Elasticsearch. | 66 | 67 | ## Other Resources 68 | 69 | - [Large scale document processing with Amazon Textract - Reference Architecture](https://github.com/aws-samples/amazon-textract-serverless-large-scale-document-processing) 70 | - [Batch processing tool](https://github.com/aws-samples/amazon-textract-textractor) 71 | - [JSON response parser](https://github.com/aws-samples/amazon-textract-response-parser) 72 | 73 | ## License Summary 74 | 75 | This sample code is made available under the MIT-0 license. 
See the LICENSE file. 76 | -------------------------------------------------------------------------------- /python/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/.DS_Store -------------------------------------------------------------------------------- /python/01-detect-text-local.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | # Document 4 | documentName = "simple-document-image.jpg" 5 | 6 | # Read document content 7 | with open(documentName, 'rb') as document: 8 | imageBytes = bytearray(document.read()) 9 | 10 | # Amazon Textract client 11 | textract = boto3.client('textract') 12 | 13 | # Call Amazon Textract 14 | response = textract.detect_document_text(Document={'Bytes': imageBytes}) 15 | 16 | #print(response) 17 | 18 | # Print detected text 19 | for item in response["Blocks"]: 20 | if item["BlockType"] == "LINE": 21 | print ('\033[94m' + item["Text"] + '\033[0m') 22 | -------------------------------------------------------------------------------- /python/02-detect-text-s3.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | # Document 4 | s3BucketName = "ki-textract-demo-docs" 5 | documentName = "simple-document-image.jpg" 6 | 7 | # Amazon Textract client 8 | textract = boto3.client('textract') 9 | 10 | # Call Amazon Textract 11 | response = textract.detect_document_text( 12 | Document={ 13 | 'S3Object': { 14 | 'Bucket': s3BucketName, 15 | 'Name': documentName 16 | } 17 | }) 18 | 19 | #print(response) 20 | 21 | # Print detected text 22 | for item in response["Blocks"]: 23 | if item["BlockType"] == "LINE": 24 | print ('\033[94m' + item["Text"] + '\033[0m') 25 | -------------------------------------------------------------------------------- /python/03-reading-order.py: 
-------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | # Document 4 | documentName = "two-column-image.jpg" 5 | 6 | # Amazon Textract client 7 | textract = boto3.client('textract') 8 | 9 | # Call Amazon Textract 10 | with open(documentName, "rb") as document: 11 | response = textract.detect_document_text( 12 | Document={ 13 | 'Bytes': document.read(), 14 | } 15 | ) 16 | 17 | #print(response) 18 | 19 | # Detect columns and print lines 20 | columns = [] 21 | lines = [] 22 | for item in response["Blocks"]: 23 | if item["BlockType"] == "LINE": 24 | column_found=False 25 | for index, column in enumerate(columns): 26 | bbox_left = item["Geometry"]["BoundingBox"]["Left"] 27 | bbox_right = item["Geometry"]["BoundingBox"]["Left"] + item["Geometry"]["BoundingBox"]["Width"] 28 | bbox_centre = item["Geometry"]["BoundingBox"]["Left"] + item["Geometry"]["BoundingBox"]["Width"]/2 29 | column_centre = column['left'] + column['right']/2 30 | 31 | if (bbox_centre > column['left'] and bbox_centre < column['right']) or (column_centre > bbox_left and column_centre < bbox_right): 32 | #Bbox appears inside the column 33 | lines.append([index, item["Text"]]) 34 | column_found=True 35 | break 36 | if not column_found: 37 | columns.append({'left':item["Geometry"]["BoundingBox"]["Left"], 'right':item["Geometry"]["BoundingBox"]["Left"] + item["Geometry"]["BoundingBox"]["Width"]}) 38 | lines.append([len(columns)-1, item["Text"]]) 39 | 40 | lines.sort(key=lambda x: x[0]) 41 | for line in lines: 42 | print (line[1]) 43 | -------------------------------------------------------------------------------- /python/04-nlp-comprehend.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | # Document 4 | documentName = "simple-document-image.jpg" 5 | 6 | # Amazon Textract client 7 | textract = boto3.client('textract') 8 | 9 | # Call Amazon Textract 10 | with open(documentName, "rb") as document: 11 | 
response = textract.detect_document_text( 12 | Document={ 13 | 'Bytes': document.read(), 14 | } 15 | ) 16 | 17 | #print(response) 18 | 19 | # Print text 20 | print("\nText\n========") 21 | text = "" 22 | for item in response["Blocks"]: 23 | if item["BlockType"] == "LINE": 24 | print ('\033[94m' + item["Text"] + '\033[0m') 25 | text = text + " " + item["Text"] 26 | 27 | # Amazon Comprehend client 28 | comprehend = boto3.client('comprehend') 29 | 30 | # Detect sentiment 31 | sentiment = comprehend.detect_sentiment(LanguageCode="en", Text=text) 32 | print ("\nSentiment\n========\n{}".format(sentiment.get('Sentiment'))) 33 | 34 | # Detect entities 35 | entities = comprehend.detect_entities(LanguageCode="en", Text=text) 36 | print("\nEntities\n========") 37 | for entity in entities["Entities"]: 38 | print ("{}\t=>\t{}".format(entity["Type"], entity["Text"])) 39 | -------------------------------------------------------------------------------- /python/05-nlp-medical.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | # Document 4 | documentName = "medical-notes.png" 5 | 6 | # Amazon Textract client 7 | textract = boto3.client('textract') 8 | 9 | # Call Amazon Textract 10 | with open(documentName, "rb") as document: 11 | response = textract.detect_document_text( 12 | Document={ 13 | 'Bytes': document.read(), 14 | } 15 | ) 16 | 17 | #print(response) 18 | 19 | # Print text 20 | print("\nText\n========") 21 | text = "" 22 | for item in response["Blocks"]: 23 | if item["BlockType"] == "LINE": 24 | print ('\033[94m' + item["Text"] + '\033[0m') 25 | text = text + " " + item["Text"] 26 | 27 | # Amazon Comprehend client 28 | comprehend = boto3.client('comprehendmedical') 29 | 30 | # Detect medical entities 31 | entities = comprehend.detect_entities(Text=text) 32 | print("\nMedical Entities\n========") 33 | for entity in entities["Entities"]: 34 | print("- {}".format(entity["Text"])) 35 | print (" Type: 
{}".format(entity["Type"])) 36 | print (" Category: {}".format(entity["Category"])) 37 | if(entity["Traits"]): 38 | print(" Traits:") 39 | for trait in entity["Traits"]: 40 | print (" - {}".format(trait["Name"])) 41 | print("\n") 42 | -------------------------------------------------------------------------------- /python/06-translate.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | # Document 4 | documentName = "simple-document-image.jpg" 5 | 6 | # Amazon Textract client 7 | textract = boto3.client('textract') 8 | 9 | # Call Amazon Textract 10 | with open(documentName, "rb") as document: 11 | response = textract.detect_document_text( 12 | Document={ 13 | 'Bytes': document.read(), 14 | } 15 | ) 16 | 17 | #print(response) 18 | 19 | # Amazon Translate client 20 | translate = boto3.client('translate') 21 | 22 | print ('') 23 | for item in response["Blocks"]: 24 | if item["BlockType"] == "LINE": 25 | print ('\033[94m' + item["Text"] + '\033[0m') 26 | result = translate.translate_text(Text=item["Text"], SourceLanguageCode="en", TargetLanguageCode="de") 27 | print ('\033[92m' + result.get('TranslatedText') + '\033[0m') 28 | print ('') 29 | -------------------------------------------------------------------------------- /python/07-search.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from elasticsearch import Elasticsearch, RequestsHttpConnection 3 | from requests_aws4auth import AWS4Auth 4 | 5 | def indexDocument(bucketName, objectName, text): 6 | 7 | # Update host with endpoint of your Elasticsearch cluster 8 | #host = "search--xxxxxxxxxxxxxx.us-east-1.es.amazonaws.com 9 | host = "searchxxxxxxxxxxxxxxxx.us-east-1.es.amazonaws.com" 10 | region = 'us-east-1' 11 | 12 | if(text): 13 | service = 'es' 14 | ss = boto3.Session() 15 | credentials = ss.get_credentials() 16 | region = ss.region_name 17 | 18 | awsauth = AWS4Auth(credentials.access_key, 
credentials.secret_key, region, service, session_token=credentials.token) 19 | 20 | es = Elasticsearch( 21 | hosts = [{'host': host, 'port': 443}], 22 | http_auth = awsauth, 23 | use_ssl = True, 24 | verify_certs = True, 25 | connection_class = RequestsHttpConnection 26 | ) 27 | 28 | document = { 29 | "name": "{}".format(objectName), 30 | "bucket" : "{}".format(bucketName), 31 | "content" : text 32 | } 33 | 34 | es.index(index="textract", doc_type="document", id=objectName, body=document) 35 | 36 | print("Indexed document: {}".format(objectName)) 37 | 38 | # Document 39 | s3BucketName = "ki-textract-demo-docs" 40 | documentName = "simple-document-image.jpg" 41 | 42 | # Amazon Textract client 43 | textract = boto3.client('textract') 44 | 45 | # Call Amazon Textract 46 | response = textract.detect_document_text( 47 | Document={ 48 | 'S3Object': { 49 | 'Bucket': s3BucketName, 50 | 'Name': documentName 51 | } 52 | }) 53 | 54 | #print(response) 55 | 56 | # Print detected text 57 | text = "" 58 | for item in response["Blocks"]: 59 | if item["BlockType"] == "LINE": 60 | print ('\033[94m' + item["Text"] + '\033[0m') 61 | text += item["Text"] 62 | 63 | indexDocument(s3BucketName, documentName, text) 64 | 65 | # You can view index documents in Kibana Dashboard -------------------------------------------------------------------------------- /python/08-forms.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from trp import Document 3 | 4 | # Document 5 | documentName = "employmentapp.png" 6 | 7 | # Amazon Textract client 8 | textract = boto3.client('textract') 9 | 10 | # Call Amazon Textract 11 | with open(documentName, "rb") as document: 12 | response = textract.analyze_document( 13 | Document={ 14 | 'Bytes': document.read(), 15 | }, 16 | FeatureTypes=["FORMS"]) 17 | 18 | #print(response) 19 | 20 | doc = Document(response) 21 | 22 | for page in doc.pages: 23 | # Print fields 24 | print("Fields:") 25 | for field in 
page.form.fields: 26 | print("Key: {}, Value: {}".format(field.key, field.value)) 27 | 28 | # Get field by key 29 | print("\nGet Field by Key:") 30 | key = "Phone Number:" 31 | field = page.form.getFieldByKey(key) 32 | if(field): 33 | print("Key: {}, Value: {}".format(field.key, field.value)) 34 | 35 | # Search fields by key 36 | print("\nSearch Fields:") 37 | key = "address" 38 | fields = page.form.searchFieldsByKey(key) 39 | for field in fields: 40 | print("Key: {}, Value: {}".format(field.key, field.value)) 41 | -------------------------------------------------------------------------------- /python/09-forms-redaction.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from trp import Document 3 | from PIL import Image, ImageDraw 4 | 5 | # Document 6 | documentName = "employmentapp.png" 7 | 8 | # Amazon Textract client 9 | textract = boto3.client('textract') 10 | 11 | # Call Amazon Textract 12 | with open(documentName, "rb") as document: 13 | response = textract.analyze_document( 14 | Document={ 15 | 'Bytes': document.read(), 16 | }, 17 | FeatureTypes=["FORMS"]) 18 | 19 | #print(response) 20 | 21 | doc = Document(response) 22 | 23 | # Redact document 24 | img = Image.open(documentName) 25 | 26 | width, height = img.size 27 | 28 | if(doc.pages): 29 | page = doc.pages[0] 30 | for field in page.form.fields: 31 | if(field.key and field.value and "address" in field.key.text.lower()): 32 | #if(field.key and field.value): 33 | print("Redacting => Key: {}, Value: {}".format(field.key.text, field.value.text)) 34 | 35 | x1 = field.value.geometry.boundingBox.left*width 36 | y1 = field.value.geometry.boundingBox.top*height-2 37 | x2 = x1 + (field.value.geometry.boundingBox.width*width)+5 38 | y2 = y1 + (field.value.geometry.boundingBox.height*height)+2 39 | 40 | draw = ImageDraw.Draw(img) 41 | draw.rectangle([x1, y1, x2, y2], fill="Black") 42 | 43 | img.save("redacted-{}".format(documentName)) 44 | 
-------------------------------------------------------------------------------- /python/10-tables.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from trp import Document 3 | 4 | # Document 5 | documentName = "employmentapp.png" 6 | 7 | # Amazon Textract client 8 | textract = boto3.client('textract') 9 | 10 | # Call Amazon Textract 11 | with open(documentName, "rb") as document: 12 | response = textract.analyze_document( 13 | Document={ 14 | 'Bytes': document.read(), 15 | }, 16 | FeatureTypes=["TABLES"]) 17 | 18 | #print(response) 19 | 20 | doc = Document(response) 21 | 22 | for page in doc.pages: 23 | # Print tables 24 | for table in page.tables: 25 | for r, row in enumerate(table.rows): 26 | for c, cell in enumerate(row.cells): 27 | print("Table[{}][{}] = {}".format(r, c, cell.text)) 28 | -------------------------------------------------------------------------------- /python/11-tables-expense.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from trp import Document 3 | 4 | # Document 5 | documentName = "expense.png" 6 | 7 | # Amazon Textract client 8 | textract = boto3.client('textract') 9 | 10 | # Call Amazon Textract 11 | with open(documentName, "rb") as document: 12 | response = textract.analyze_document( 13 | Document={ 14 | 'Bytes': document.read(), 15 | }, 16 | FeatureTypes=["TABLES"]) 17 | 18 | #print(response) 19 | 20 | doc = Document(response) 21 | 22 | def isFloat(input): 23 | try: 24 | float(input) 25 | except ValueError: 26 | return False 27 | return True 28 | 29 | warning = "" 30 | for page in doc.pages: 31 | # Print tables 32 | for table in page.tables: 33 | for r, row in enumerate(table.rows): 34 | itemName = "" 35 | for c, cell in enumerate(row.cells): 36 | print("Table[{}][{}] = {}".format(r, c, cell.text)) 37 | if(c == 0): 38 | itemName = cell.text 39 | elif(c == 4 and isFloat(cell.text)): 40 | value = float(cell.text) 41 | 
if(value > 1000): 42 | warning += "{} is greater than $1000.".format(itemName) 43 | if(warning): 44 | print("\nReview needed:\n====================\n" + warning) 45 | -------------------------------------------------------------------------------- /python/12-pdf-text.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import time 3 | 4 | 5 | def start_job(client, s3_bucket_name, object_name): 6 | response = None 7 | response = client.start_document_text_detection( 8 | DocumentLocation={ 9 | 'S3Object': { 10 | 'Bucket': s3_bucket_name, 11 | 'Name': object_name 12 | }}) 13 | 14 | return response["JobId"] 15 | 16 | 17 | def is_job_complete(client, job_id): 18 | time.sleep(1) 19 | response = client.get_document_text_detection(JobId=job_id) 20 | status = response["JobStatus"] 21 | print("Job status: {}".format(status)) 22 | 23 | while(status == "IN_PROGRESS"): 24 | time.sleep(1) 25 | response = client.get_document_text_detection(JobId=job_id) 26 | status = response["JobStatus"] 27 | print("Job status: {}".format(status)) 28 | 29 | return status 30 | 31 | 32 | def get_job_results(client, job_id): 33 | pages = [] 34 | time.sleep(1) 35 | response = client.get_document_text_detection(JobId=job_id) 36 | pages.append(response) 37 | print("Resultset page received: {}".format(len(pages))) 38 | next_token = None 39 | if 'NextToken' in response: 40 | next_token = response['NextToken'] 41 | 42 | while next_token: 43 | time.sleep(1) 44 | response = client.\ 45 | get_document_text_detection(JobId=job_id, NextToken=next_token) 46 | pages.append(response) 47 | print("Resultset page received: {}".format(len(pages))) 48 | next_token = None 49 | if 'NextToken' in response: 50 | next_token = response['NextToken'] 51 | 52 | return pages 53 | 54 | 55 | if __name__ == "__main__": 56 | # Document 57 | s3_bucket_name = "ki-textract-demo-docs" 58 | document_name = "Amazon-Textract-Pdf.pdf" 59 | region = "us-east-1" 60 | client = 
boto3.client('textract', region_name=region) 61 | 62 | job_id = start_job(client, s3_bucket_name, document_name) 63 | print("Started job with id: {}".format(job_id)) 64 | if is_job_complete(client, job_id): 65 | response = get_job_results(client, job_id) 66 | 67 | # print(response) 68 | 69 | # Print detected text 70 | for result_page in response: 71 | for item in result_page["Blocks"]: 72 | if item["BlockType"] == "LINE": 73 | print('\033[94m' + item["Text"] + '\033[0m') 74 | -------------------------------------------------------------------------------- /python/13-signature.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import json 3 | from trp import Document 4 | from tabulate import tabulate 5 | 6 | #create a Textract Client 7 | textract = boto3.client('textract') 8 | #Document 9 | documentName = image_filename 10 | 11 | response = None 12 | with open(image_filename, 'rb') as document: 13 | imageBytes = bytearray(document.read()) 14 | 15 | # Call Textract AnalyzeDocument by passing a document from local disk 16 | response = textract.analyze_document( 17 | Document={'Bytes': imageBytes}, 18 | FeatureTypes=["FORMS",'SIGNATURES'] 19 | ) 20 | 21 | #print detected text 22 | d = [] 23 | for item in response["Blocks"]: 24 | if item["BlockType"] == "SIGNATURE": 25 | d.append([item["Id"],item["Geometry"]]) 26 | 27 | print(tabulate(d, headers=["Id", "Geometry"],tablefmt="grid", maxcolwidths= [None,100])) 28 | 29 | 30 | doc = Document(response) 31 | d = [] 32 | 33 | for page in doc.pages: 34 | # Search fields by key 35 | print("\nSearch Fields:") 36 | key = "Signature" 37 | fields = page.form.searchFieldsByKey(key) 38 | for field in fields: 39 | d.append([field.key, field.value]) 40 | 41 | print(tabulate(d, headers=["Key", "Value"])) 42 | -------------------------------------------------------------------------------- /python/Amazon-Textract-Pdf.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/Amazon-Textract-Pdf.pdf -------------------------------------------------------------------------------- /python/OneKeyValue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/OneKeyValue.png -------------------------------------------------------------------------------- /python/OneLine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/OneLine.png -------------------------------------------------------------------------------- /python/Textract-Analyze-ID.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "602739d2", 6 | "metadata": {}, 7 | "source": [ 8 | "### Amazon Textract Analyze ID\n", 9 | "\n", 10 | "Amazon Textract Analyze ID will help you automatically extract information from identification documents, such as driver’s licenses and passports. Amazon Textract uses AI and ML technologies to extract information from identity documents, such as U.S. passports and driver’s licenses, without the need for templates or configuration. You can automatically extract specific information, such as date of expiry and date of birth, as well as intelligently identify and extract implied information, such as name and address." 
11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "f1cc2940", 16 | "metadata": {}, 17 | "source": [ 18 | "Installing the caller to simplify calling Analyze ID" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "id": "107b34fb", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "!python -m pip install -q amazon-textract-caller --upgrade" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "10c0b980", 34 | "metadata": {}, 35 | "source": [ 36 | "Also upgrade boto3 to make sure we are on the latest boto3 that includes Analyze ID" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "id": "cc280ce2", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "!python -m pip install -q boto3 botocore --upgrade" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "id": "cd3d8238", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "import boto3\n", 57 | "import botocore\n", 58 | "from textractcaller import call_textract_analyzeid" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "5cb62607", 64 | "metadata": {}, 65 | "source": [ 66 | "The sample driver's license image is located in an S3 bucket in us-east-2, so we pass in that region to the boto3 client" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "id": "f85fc212", 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "textract_client = boto3.client('textract', region_name='us-east-2')\n", 77 | "j = call_textract_analyzeid(document_pages=[\"s3://amazon-textract-public-content/analyzeid/driverlicense.png\"], \n", 78 | " boto3_textract_client=textract_client)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "ee7dc4e8", 84 | "metadata": {}, 85 | "source": [ 86 | "printing out the JSON response" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "id": "d5417d43", 93 | "metadata": {}, 94 | 
"outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "{\n", 100 | " \"IdentityDocuments\": [\n", 101 | " {\n", 102 | " \"DocumentIndex\": 1,\n", 103 | " \"IdentityDocumentFields\": [\n", 104 | " {\n", 105 | " \"Type\": {\n", 106 | " \"Text\": \"FIRST_NAME\"\n", 107 | " },\n", 108 | " \"ValueDetection\": {\n", 109 | " \"Text\": \"JORGE\",\n", 110 | " \"Confidence\": 98.78211975097656\n", 111 | " }\n", 112 | " },\n", 113 | " {\n", 114 | " \"Type\": {\n", 115 | " \"Text\": \"LAST_NAME\"\n", 116 | " },\n", 117 | " \"ValueDetection\": {\n", 118 | " \"Text\": \"SOUZA\",\n", 119 | " \"Confidence\": 98.82009887695312\n", 120 | " }\n", 121 | " },\n", 122 | " {\n", 123 | " \"Type\": {\n", 124 | " \"Text\": \"MIDDLE_NAME\"\n", 125 | " },\n", 126 | " \"ValueDetection\": {\n", 127 | " \"Text\": \"\",\n", 128 | " \"Confidence\": 99.39620208740234\n", 129 | " }\n", 130 | " },\n", 131 | " {\n", 132 | " \"Type\": {\n", 133 | " \"Text\": \"SUFFIX\"\n", 134 | " },\n", 135 | " \"ValueDetection\": {\n", 136 | " \"Text\": \"\",\n", 137 | " \"Confidence\": 99.65946960449219\n", 138 | " }\n", 139 | " },\n", 140 | " {\n", 141 | " \"Type\": {\n", 142 | " \"Text\": \"CITY_IN_ADDRESS\"\n", 143 | " },\n", 144 | " \"ValueDetection\": {\n", 145 | " \"Text\": \"ANYTOWN\",\n", 146 | " \"Confidence\": 98.8210220336914\n", 147 | " }\n", 148 | " },\n", 149 | " {\n", 150 | " \"Type\": {\n", 151 | " \"Text\": \"ZIP_CODE_IN_ADDRESS\"\n", 152 | " },\n", 153 | " \"ValueDetection\": {\n", 154 | " \"Text\": \"02127\",\n", 155 | " \"Confidence\": 99.0246353149414\n", 156 | " }\n", 157 | " },\n", 158 | " {\n", 159 | " \"Type\": {\n", 160 | " \"Text\": \"STATE_IN_ADDRESS\"\n", 161 | " },\n", 162 | " \"ValueDetection\": {\n", 163 | " \"Text\": \"MA\",\n", 164 | " \"Confidence\": 99.53130340576172\n", 165 | " }\n", 166 | " },\n", 167 | " {\n", 168 | " \"Type\": {\n", 169 | " \"Text\": \"STATE_NAME\"\n", 170 | " },\n", 171 | " \"ValueDetection\": {\n", 172 | " \"Text\": 
\"MASSACHUSETTS\",\n", 173 | " \"Confidence\": 98.22105407714844\n", 174 | " }\n", 175 | " },\n", 176 | " {\n", 177 | " \"Type\": {\n", 178 | " \"Text\": \"DOCUMENT_NUMBER\"\n", 179 | " },\n", 180 | " \"ValueDetection\": {\n", 181 | " \"Text\": \"820BAC729CBAC\",\n", 182 | " \"Confidence\": 96.05117797851562\n", 183 | " }\n", 184 | " },\n", 185 | " {\n", 186 | " \"Type\": {\n", 187 | " \"Text\": \"EXPIRATION_DATE\"\n", 188 | " },\n", 189 | " \"ValueDetection\": {\n", 190 | " \"Text\": \"01/20/2020\",\n", 191 | " \"NormalizedValue\": {\n", 192 | " \"Value\": \"2020-01-20T00:00:00\",\n", 193 | " \"ValueType\": \"Date\"\n", 194 | " },\n", 195 | " \"Confidence\": 98.38336944580078\n", 196 | " }\n", 197 | " },\n", 198 | " {\n", 199 | " \"Type\": {\n", 200 | " \"Text\": \"DATE_OF_BIRTH\"\n", 201 | " },\n", 202 | " \"ValueDetection\": {\n", 203 | " \"Text\": \"03/18/1978\",\n", 204 | " \"NormalizedValue\": {\n", 205 | " \"Value\": \"1978-03-18T00:00:00\",\n", 206 | " \"ValueType\": \"Date\"\n", 207 | " },\n", 208 | " \"Confidence\": 98.17178344726562\n", 209 | " }\n", 210 | " },\n", 211 | " {\n", 212 | " \"Type\": {\n", 213 | " \"Text\": \"DATE_OF_ISSUE\"\n", 214 | " },\n", 215 | " \"ValueDetection\": {\n", 216 | " \"Text\": \"\",\n", 217 | " \"Confidence\": 89.29450988769531\n", 218 | " }\n", 219 | " },\n", 220 | " {\n", 221 | " \"Type\": {\n", 222 | " \"Text\": \"ID_TYPE\"\n", 223 | " },\n", 224 | " \"ValueDetection\": {\n", 225 | " \"Text\": \"DRIVER LICENSE FRONT\",\n", 226 | " \"Confidence\": 98.81443786621094\n", 227 | " }\n", 228 | " },\n", 229 | " {\n", 230 | " \"Type\": {\n", 231 | " \"Text\": \"ENDORSEMENTS\"\n", 232 | " },\n", 233 | " \"ValueDetection\": {\n", 234 | " \"Text\": \"NONE\",\n", 235 | " \"Confidence\": 99.27168273925781\n", 236 | " }\n", 237 | " },\n", 238 | " {\n", 239 | " \"Type\": {\n", 240 | " \"Text\": \"VETERAN\"\n", 241 | " },\n", 242 | " \"ValueDetection\": {\n", 243 | " \"Text\": \"\",\n", 244 | " \"Confidence\": 99.62979125976562\n", 245 
| " }\n", 246 | " },\n", 247 | " {\n", 248 | " \"Type\": {\n", 249 | " \"Text\": \"RESTRICTIONS\"\n", 250 | " },\n", 251 | " \"ValueDetection\": {\n", 252 | " \"Text\": \"NONE\",\n", 253 | " \"Confidence\": 99.41033935546875\n", 254 | " }\n", 255 | " },\n", 256 | " {\n", 257 | " \"Type\": {\n", 258 | " \"Text\": \"CLASS\"\n", 259 | " },\n", 260 | " \"ValueDetection\": {\n", 261 | " \"Text\": \"D\",\n", 262 | " \"Confidence\": 99.05763244628906\n", 263 | " }\n", 264 | " },\n", 265 | " {\n", 266 | " \"Type\": {\n", 267 | " \"Text\": \"ADDRESS\"\n", 268 | " },\n", 269 | " \"ValueDetection\": {\n", 270 | " \"Text\": \"100 MAIN STREET\",\n", 271 | " \"Confidence\": 99.24053192138672\n", 272 | " }\n", 273 | " },\n", 274 | " {\n", 275 | " \"Type\": {\n", 276 | " \"Text\": \"COUNTY\"\n", 277 | " },\n", 278 | " \"ValueDetection\": {\n", 279 | " \"Text\": \"\",\n", 280 | " \"Confidence\": 99.59503173828125\n", 281 | " }\n", 282 | " },\n", 283 | " {\n", 284 | " \"Type\": {\n", 285 | " \"Text\": \"PLACE_OF_BIRTH\"\n", 286 | " },\n", 287 | " \"ValueDetection\": {\n", 288 | " \"Text\": \"\",\n", 289 | " \"Confidence\": 99.64707946777344\n", 290 | " }\n", 291 | " }\n", 292 | " ]\n", 293 | " }\n", 294 | " ],\n", 295 | " \"DocumentMetadata\": {\n", 296 | " \"Pages\": 1\n", 297 | " },\n", 298 | " \"AnalyzeIDModelVersion\": \"1.0\",\n", 299 | " \"ResponseMetadata\": {\n", 300 | " \"RequestId\": \"e7437df8-5c35-47a3-a24d-ee8436f18d1d\",\n", 301 | " \"HTTPStatusCode\": 200,\n", 302 | " \"HTTPHeaders\": {\n", 303 | " \"x-amzn-requestid\": \"e7437df8-5c35-47a3-a24d-ee8436f18d1d\",\n", 304 | " \"content-type\": \"application/x-amz-json-1.1\",\n", 305 | " \"content-length\": \"2223\",\n", 306 | " \"date\": \"Fri, 03 Dec 2021 18:56:24 GMT\"\n", 307 | " },\n", 308 | " \"RetryAttempts\": 0\n", 309 | " }\n", 310 | "}\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "import json\n", 316 | "print(json.dumps(j, indent=2))" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "id": 
"c2a00b42", 322 | "metadata": {}, 323 | "source": [ 324 | "Textract Response Parser makes it easier to get values from the JSON response" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 7, 330 | "id": "e8947a56", 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "!python -m pip install -q amazon-textract-response-parser tabulate --upgrade" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "id": "a2f8e820", 340 | "metadata": {}, 341 | "source": [ 342 | "The get_values_as_list() function returns the values as a list of list of str in the following format\n", 343 | "[[\"doc_number\", \"type\", \"value\", \"confidence\", \"normalized_value\", \"normalized_value_type\"]]\n" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 11, 349 | "id": "e4b8c205", 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "data": { 354 | "text/plain": [ 355 | "[['1', 'FIRST_NAME', 'JORGE', '98.78211975097656', '', ''],\n", 356 | " ['1', 'LAST_NAME', 'SOUZA', '98.82009887695312', '', ''],\n", 357 | " ['1', 'MIDDLE_NAME', '', '99.39620208740234', '', ''],\n", 358 | " ['1', 'SUFFIX', '', '99.65946960449219', '', ''],\n", 359 | " ['1', 'CITY_IN_ADDRESS', 'ANYTOWN', '98.8210220336914', '', ''],\n", 360 | " ['1', 'ZIP_CODE_IN_ADDRESS', '02127', '99.0246353149414', '', ''],\n", 361 | " ['1', 'STATE_IN_ADDRESS', 'MA', '99.53130340576172', '', ''],\n", 362 | " ['1', 'STATE_NAME', 'MASSACHUSETTS', '98.22105407714844', '', ''],\n", 363 | " ['1', 'DOCUMENT_NUMBER', '820BAC729CBAC', '96.05117797851562', '', ''],\n", 364 | " ['1',\n", 365 | " 'EXPIRATION_DATE',\n", 366 | " '01/20/2020',\n", 367 | " '98.38336944580078',\n", 368 | " '2020-01-20T00:00:00',\n", 369 | " 'Date'],\n", 370 | " ['1',\n", 371 | " 'DATE_OF_BIRTH',\n", 372 | " '03/18/1978',\n", 373 | " '98.17178344726562',\n", 374 | " '1978-03-18T00:00:00',\n", 375 | " 'Date'],\n", 376 | " ['1', 'DATE_OF_ISSUE', '', '89.29450988769531', '', ''],\n", 377 | " ['1', 
'ID_TYPE', 'DRIVER LICENSE FRONT', '98.81443786621094', '', ''],\n", 378 | " ['1', 'ENDORSEMENTS', 'NONE', '99.27168273925781', '', ''],\n", 379 | " ['1', 'VETERAN', '', '99.62979125976562', '', ''],\n", 380 | " ['1', 'RESTRICTIONS', 'NONE', '99.41033935546875', '', ''],\n", 381 | " ['1', 'CLASS', 'D', '99.05763244628906', '', ''],\n", 382 | " ['1', 'ADDRESS', '100 MAIN STREET', '99.24053192138672', '', ''],\n", 383 | " ['1', 'COUNTY', '', '99.59503173828125', '', ''],\n", 384 | " ['1', 'PLACE_OF_BIRTH', '', '99.64707946777344', '', '']]" 385 | ] 386 | }, 387 | "execution_count": 11, 388 | "metadata": {}, 389 | "output_type": "execute_result" 390 | } 391 | ], 392 | "source": [ 393 | "import trp.trp2_analyzeid as t2id\n", 394 | "\n", 395 | "doc: t2id.TAnalyzeIdDocument = t2id.TAnalyzeIdDocumentSchema().load(j)\n", 396 | "result = doc.get_values_as_list()\n", 397 | "result" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "id": "cb963302", 403 | "metadata": {}, 404 | "source": [ 405 | "using tabulate we get a pretty printed output" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 13, 411 | "id": "6c4fdfef", 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "name": "stdout", 416 | "output_type": "stream", 417 | "text": [ 418 | "------------------- --------------------\n", 419 | "FIRST_NAME JORGE\n", 420 | "LAST_NAME SOUZA\n", 421 | "MIDDLE_NAME\n", 422 | "SUFFIX\n", 423 | "CITY_IN_ADDRESS ANYTOWN\n", 424 | "ZIP_CODE_IN_ADDRESS 02127\n", 425 | "STATE_IN_ADDRESS MA\n", 426 | "STATE_NAME MASSACHUSETTS\n", 427 | "DOCUMENT_NUMBER 820BAC729CBAC\n", 428 | "EXPIRATION_DATE 01/20/2020\n", 429 | "DATE_OF_BIRTH 03/18/1978\n", 430 | "DATE_OF_ISSUE\n", 431 | "ID_TYPE DRIVER LICENSE FRONT\n", 432 | "ENDORSEMENTS NONE\n", 433 | "VETERAN\n", 434 | "RESTRICTIONS NONE\n", 435 | "CLASS D\n", 436 | "ADDRESS 100 MAIN STREET\n", 437 | "COUNTY\n", 438 | "PLACE_OF_BIRTH\n", 439 | "------------------- --------------------\n" 440 | ] 441 | } 442 | 
], 443 | "source": [ 444 | "from tabulate import tabulate\n", 445 | "print(tabulate([x[1:3] for x in result]))" 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "id": "2d09fc61", 451 | "metadata": {}, 452 | "source": [ 453 | "Just getting the FIRST_NAME" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 14, 459 | "id": "3730f49e", 460 | "metadata": {}, 461 | "outputs": [ 462 | { 463 | "data": { 464 | "text/plain": [ 465 | "['JORGE']" 466 | ] 467 | }, 468 | "execution_count": 14, 469 | "metadata": {}, 470 | "output_type": "execute_result" 471 | } 472 | ], 473 | "source": [ 474 | "[x[2] for x in result if x[1]=='FIRST_NAME']" 475 | ] 476 | } 477 | ], 478 | "metadata": { 479 | "kernelspec": { 480 | "display_name": "Python 3 (ipykernel)", 481 | "language": "python", 482 | "name": "python3" 483 | }, 484 | "language_info": { 485 | "codemirror_mode": { 486 | "name": "ipython", 487 | "version": 3 488 | }, 489 | "file_extension": ".py", 490 | "mimetype": "text/x-python", 491 | "name": "python", 492 | "nbconvert_exporter": "python", 493 | "pygments_lexer": "ipython3", 494 | "version": "3.9.6" 495 | } 496 | }, 497 | "nbformat": 4, 498 | "nbformat_minor": 5 499 | } 500 | -------------------------------------------------------------------------------- /python/Textract-MergeCell-Statement.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/Textract-MergeCell-Statement.pdf -------------------------------------------------------------------------------- /python/Textract-Table-Merged-Cells-And-Headers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4c2f249b", 6 | "metadata": {}, 7 | "source": [ 8 | "## Detecting Merged Cells And Headers on fictitious bank statement" 9 | ] 10 | }, 11 
| { 12 | "cell_type": "markdown", 13 | "id": "5b16cdb3", 14 | "metadata": {}, 15 | "source": [ 16 | "We will be using the modules below:\n", 17 | "* amazon-textract-caller (https://pypi.org/project/amazon-textract-caller/) to invoke Amazon Textract API on our behalf\n", 18 | "* amazon-textract-response-parser (https://pypi.org/project/amazon-textract-response-parser/) to parse the response payload\n", 19 | "* amazon-textract-prettyprinter (https://pypi.org/project/amazon-textract-prettyprinter/) to \"pretty-print\" tables" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "id": "a6e6c072", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "!pip install boto3\n", 30 | "!pip install amazon-textract-caller\n", 31 | "!pip install amazon-textract-prettyprinter\n", 32 | "!pip install trp" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "2bd23c9d", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import boto3\n", 43 | "import json\n", 44 | "import pandas as pd\n", 45 | "from textractcaller import call_textract, Textract_Features\n", 46 | "from textractprettyprinter.t_pretty_print import Pretty_Print_Table_Format, Textract_Pretty_Print, get_string, get_tables_string\n", 47 | "from trp import Document\n", 48 | "from trp.trp2 import TDocument, TDocumentSchema\n", 49 | "from trp.t_pipeline import order_blocks_by_geo\n", 50 | "from IPython.display import display" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "f40c07e6", 56 | "metadata": {}, 57 | "source": [ 58 | "Let's initialize the boto3 session and then invoke textract_caller to perform the document processing API call and collect the response back on our behalf." 
59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "34479535", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "session = boto3.Session(profile_name='')\n", 69 | "documentName = \"s3://textract-table-merged-cells-data-sample/Textract-MergeCell-Statement.pdf\"\n", 70 | "textract_json = call_textract(input_document=documentName, features = [Textract_Features.TABLES])" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "4dd698d3", 76 | "metadata": {}, 77 | "source": [ 78 | "Let's pretty-print the response payload. As you can see, by default the date is not populated across all rows." 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "3e0fb7fb", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "print(get_string(textract_json=textract_json, output_type=[Textract_Pretty_Print.TABLES]))" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "id": "36cedb05", 94 | "metadata": {}, 95 | "source": [ 96 | "Now let's load the response into an ordered document and scan the statement's table." 
97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "50846117", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "t_doc = TDocumentSchema().load(textract_json)\n", 107 | "ordered_doc = order_blocks_by_geo(t_doc)\n", 108 | "trp_doc = Document(TDocumentSchema().dump(ordered_doc))\n", 109 | "\n", 110 | "table_index = 1\n", 111 | "dataframes = []\n", 112 | "\n", 113 | "def combine_headers(top_h, bottom_h):\n", 114 | " bottom_h[3] = top_h[2] + \" \" + bottom_h[3]\n", 115 | " bottom_h[4] = top_h[2] + \" \" + bottom_h[4]\n", 116 | "\n", 117 | "for page in trp_doc.pages:\n", 118 | " for table in page.tables:\n", 119 | " table_data = []\n", 120 | " headers = table.get_header_field_names()\n", 121 | " if(len(headers)>0): #Let's retain the only table with headers\n", 122 | " print(\"Statememt headers: \"+ repr(headers))\n", 123 | " top_header= headers[0]\n", 124 | " bottom_header = headers[1]\n", 125 | " combine_headers(top_header, bottom_header) #The statement has two headers. let's combine them\n", 126 | " for r, row in enumerate(table.rows_without_header): #New Table attribute returning rows without headers\n", 127 | " table_data.append([])\n", 128 | " for c, cell in enumerate(row.cells):\n", 129 | " table_data[r].append(cell.mergedText) #New Cell attribute returning merged cells common values\n", 130 | " \n", 131 | " if len(table_data)>0:\n", 132 | " df = pd.DataFrame(table_data, columns=bottom_header)\n", 133 | "\n", 134 | "df" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "77d05bcd", 140 | "metadata": {}, 141 | "source": [ 142 | "Now we can even use multi level indexing and reproduce the table's initial structure." 
143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "e21b6df1", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "multi = df.set_index(['Date', 'Details'])\n", 153 | "display(multi)" 154 | ] 155 | } 156 | ], 157 | "metadata": { 158 | "kernelspec": { 159 | "display_name": "Python 3 (ipykernel)", 160 | "language": "python", 161 | "name": "python3" 162 | }, 163 | "language_info": { 164 | "codemirror_mode": { 165 | "name": "ipython", 166 | "version": 3 167 | }, 168 | "file_extension": ".py", 169 | "mimetype": "text/x-python", 170 | "name": "python", 171 | "nbconvert_exporter": "python", 172 | "pygments_lexer": "ipython3", 173 | "version": "3.9.7" 174 | } 175 | }, 176 | "nbformat": 4, 177 | "nbformat_minor": 5 178 | } 179 | -------------------------------------------------------------------------------- /python/custom-queries/custom-queries-checks-blog.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "80051b34-8b9b-4b00-845c-c67ed070dc91", 6 | "metadata": {}, 7 | "source": [ 8 | "# Custom Queries Launch Blogpost: Checks processing\n", 9 | "This notebook will walk you through how to annotate and train Custom Queries.\n", 10 | "1. Option 1: Creating an adapter via the console \n", 11 | " This walkthrough covers the process of creating an adapter and then copying pre-annotated check samples to fast-track your testing.\n", 12 | "2. Option 2: Creating an adapter programmatically via the API \n", 13 | " This is identical to option 1, however, uses python boto3 to programmatically create a Custom Adapter and use it for testing." 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "0a4076bf-f2f9-40de-83d8-2b2b46d60eb8", 19 | "metadata": { 20 | "tags": [] 21 | }, 22 | "source": [ 23 | "## Option 1. 
Create an adapter via the console and copy pre-annotated check samples\n", 24 | "Refer to the [Custom Queries Tutorial](https://docs.aws.amazon.com/textract/latest/dg/textract-adapters-tutorial.html) if you want to upload your own documents and annotate them." 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "aa029227-ae38-4ffc-b43b-6597929e0355", 30 | "metadata": {}, 31 | "source": [ 32 | "### Step 1.1: Create an adapter via console\n", 33 | "Navigate to the Textract console → Click on the Custom Queries button located in the sidebar → Click the Create Adapter button\n", 34 | "" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "69db73b8-deea-4a69-8812-f91a6186da54", 40 | "metadata": {}, 41 | "source": [ 42 | "### Step 1.2: Copy the adapter ID and dataset S3 bucket location from Adapter Details page.\n", 43 | "" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "9c6a8f6d-6c4d-404c-9ba5-764f90bed7f3", 50 | "metadata": { 51 | "tags": [] 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "adapter_id=\"111111111111\"\n", 56 | "dataset_s3_bucket=\"textract-adapters-us-east-1-1111\"" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "504bf278-28dc-4899-bfc2-68618a54a96e", 62 | "metadata": {}, 63 | "source": [ 64 | "### Step 1.3: Update the manifest file with the adapter details\n", 65 | "Run the below cell to programmatically extract the pre-annotations and update the manifest file with your adapter ID.\n", 66 | "You will see a new folder created named as your adapter Id." 
67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "id": "75d97d06-042f-46a3-81ae-2b3357ebea1f", 73 | "metadata": { 74 | "tags": [] 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "import shutil\n", 79 | "shutil.unpack_archive(\"./samples/checks-annotations.zip\", extract_dir=adapter_id)\n", 80 | "print(f\"Check samples archive extracted successfully to folder {adapter_id}\")\n", 81 | "\n", 82 | "!sed -i -e \"s//$dataset_s3_bucket/g;s//$adapter_id/g\" \"./$adapter_id/checks-annotations/manifest.jsonl\"\n", 83 | "print(f\"Replaced all instances of the adapter ID with {adapter_id} and S3 BUCKET with {dataset_s3_bucket}\")" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "id": "38f46be5-68d1-42d0-9628-913ece3cd521", 89 | "metadata": { 90 | "tags": [] 91 | }, 92 | "source": [ 93 | "### Step 1.4: Copy the pre-annotations to the data set location" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "0a8c337c-32d1-4b41-8397-de541e5131c9", 100 | "metadata": { 101 | "tags": [] 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "!aws s3 cp \"./$adapter_id/checks-annotations\" \"s3://$dataset_s3_bucket/adapters/$adapter_id\" --recursive\n", 106 | "print(\"\\nSuccessfully copied all files\")" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "33991daf-71de-4cf2-9ea4-9bd23bf5aa92", 112 | "metadata": { 113 | "tags": [] 114 | }, 115 | "source": [ 116 | "### Step 1.5: Refresh the adapter details page\n", 117 | "Return back to the Textract console and refresh the adapter details page. You should see the following\n", 118 | "1. The dataset is created successfully\n", 119 | "2. Queries have been created\n", 120 | "3. 
Documents have been verified\n", 121 | "\n", 122 | "Note: if you cannot see your adapter updated like the screenshot below, please check if the adapter ID and S3 bucket you entered in Step 1.2 are correct.\n", 123 | "\n", 124 | "" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "77694d2a-88b1-432c-a3c8-e2f2fcfabc56", 130 | "metadata": {}, 131 | "source": [ 132 | "### Step 1.6: View the pre-annotated samples \n", 133 | "Click on the Verify Documents button to open the dataset page. Once open, select the files and click review annotations.\n", 134 | "" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "2edde5a3-6254-475f-8354-c6055f2adccd", 140 | "metadata": {}, 141 | "source": [ 142 | "### Step 1.7: Train the Adapter\n", 143 | "Click on the Train Adapter button to initiate training. Training can take 1 to 30 hours to complete, however, given our dataset is small, it should complete in an hour or so.\n", 144 | "" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "id": "34faecaf-0568-428d-b18f-fa0ab51f3298", 150 | "metadata": {}, 151 | "source": [ 152 | "### Step 1.8: Evaluate the adapter (console)\n", 153 | "Once the training completes, click the Evaluate Adapter button on the Adapter Details page to review the adapter performance metrics. 
\n", 154 | "You can also test samples in the console by clicking on the Try Adapter button and uploading a sample document.\n", 155 | "" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "id": "a7eed7da-6134-47f8-a0e7-ec83c3573e39", 161 | "metadata": {}, 162 | "source": [ 163 | "### Step 1.9: Test the adapter programmatically (API)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "bd77ef99-512e-4d0f-9a9a-5790f1b41ec7", 170 | "metadata": { 171 | "tags": [] 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "from IPython.display import Image\n", 176 | "\n", 177 | "document_name = f\"{adapter_id}/checks-annotations/original_assets/31eb3f65-babd-4410-b9ea-596c7b35989d.jpg\"\n", 178 | "Image(filename=document_name) " 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "id": "a215913b-8647-47a9-bd29-c756c372f2b3", 185 | "metadata": { 186 | "tags": [] 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "!python -m pip install amazon-textract-caller --upgrade\n", 191 | "!python -m pip install amazon-textract-response-parser --upgrade" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "id": "00c7d85d-11fb-4178-9114-319a922f2f1d", 198 | "metadata": { 199 | "tags": [] 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "import boto3\n", 204 | "from textractcaller.t_call import call_textract, Textract_Features, Query, QueriesConfig, Adapter, AdaptersConfig\n", 205 | "import trp.trp2 as t2\n", 206 | "import pandas as pd\n", 207 | "\n", 208 | "textract_client = boto3.client('textract')\n", 209 | "\n", 210 | "def tabulate_query_answers(textract_json):\n", 211 | " d = t2.TDocumentSchema().load(textract_json)\n", 212 | " for page in d.pages:\n", 213 | " query_answers = d.get_query_answers(page=page)\n", 214 | " display(pd.DataFrame(query_answers))\n", 215 | "\n", 216 | "queries = []\n", 217 | "queries.append(Query(text=\"What is the 
check#?\", alias=\"CHECK_NUMBER\", pages=[\"*\"]))\n", 218 | "queries.append(Query(text=\"What is the date?\", alias=\"DATE\", pages=[\"*\"]))\n", 219 | "queries.append(Query(text=\"What is the check amount in words?\", alias=\"CHECK_AMOUNT_WORDS\", pages=[\"*\"]))\n", 220 | "queries.append(Query(text=\"What is the dollar amount?\", alias=\"DOLLAR_AMOUNT\", pages=[\"*\"]))\n", 221 | "queries.append(Query(text=\"Who is the payee?\", alias=\"PAYEE_NAME\", pages=[\"*\"]))\n", 222 | "queries.append(Query(text=\"What is the customer account#\", alias=\"ACCOUNT_NUMBER\", pages=[\"*\"]))\n", 223 | "queries.append(Query(text=\"what is the payee address?\", alias=\"PAYEE_ADDRESS\", pages=[\"*\"]))\n", 224 | "queries.append(Query(text=\"What is the bank routing number?\", alias=\"BANK_ROUTING_NUMBER\", pages=[\"*\"]))\n", 225 | "queries.append(Query(text=\"What is the memo\", alias=\"MEMO\", pages=[\"*\"]))\n", 226 | "queries.append(Query(text=\"What is the account name/payer/drawer name?\", alias=\"ACCOUNT_NAME\", pages=[\"*\"]))\n", 227 | "queries.append(Query(text=\"What is the bank name/drawee name?\", alias=\"BANK_NAME\", pages=[\"*\"]))\n", 228 | "queries.append(Query(text=\"What is the MICR line?\", alias=\"MICR_LINE\", pages=[\"*\"]))\n", 229 | "\n", 230 | "\n", 231 | "queries_config = QueriesConfig(queries=queries)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "id": "6055c0fa-66ad-4923-9f51-1cedf53df6bf", 238 | "metadata": { 239 | "tags": [] 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "print(\"Calling Pre-built Textract Queries\")\n", 244 | "\n", 245 | "textract_json_prebuilt = call_textract(input_document=document_name,\n", 246 | " boto3_textract_client=textract_client,\n", 247 | " features=[Textract_Features.QUERIES],\n", 248 | " queries_config=queries_config)\n", 249 | "\n", 250 | "tabulate_query_answers(textract_json_prebuilt)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 
256 | "id": "4bb64fd4-35f5-43a5-bba6-c98912de6d90", 257 | "metadata": { 258 | "tags": [] 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "adapter1 = Adapter(adapter_id=adapter_id, version=\"1\", pages=[\"*\"])\n", 263 | "adapters_config = AdaptersConfig(adapters=[adapter1])\n", 264 | "print(f\"Calling Custom Queries with Adapter:{adapter_id}\")\n", 265 | "\n", 266 | "textract_json_with_adapter = call_textract(input_document=document_name,\n", 267 | " boto3_textract_client=textract_client,\n", 268 | " features=[Textract_Features.QUERIES],\n", 269 | " queries_config=queries_config,\n", 270 | " adapters_config=adapters_config)\n", 271 | "\n", 272 | "tabulate_query_answers(textract_json_with_adapter)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "id": "11215483-730e-4b87-b926-9ff7422a72b8", 278 | "metadata": {}, 279 | "source": [ 280 | "## Option 2. Create an adapter programmatically via the API\n", 281 | "We use the Textract Boto3 client to create an adapter. See [Textract boto3 documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/textract.html) for details. \n", 282 | "Alternately, you can use the CLI or a language of your choice. See \n", 283 | "[CLI Documentation](https://docs.aws.amazon.com/textract/latest/dg/textract-create-adapter.html) for details." 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "id": "0b78fde3-c325-4c07-a41c-c409f47b91aa", 289 | "metadata": {}, 290 | "source": [ 291 | "### Step 2.1: Create an adapter using the CreateAdapter API\n", 292 | "1. On calling the CreateAdapter API, the API returns the created AdapterId. We will use this ID in subsequent steps. \n", 293 | "2. 
We will also use the ListAdapters API to view all the adapters in the AWS account" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "id": "5fe0aac8-6f4b-41d8-aba5-4ae67d8ad9ca", 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "import boto3\n", 304 | "import pandas as pd\n", 305 | "from IPython.display import display, HTML \n", 306 | "\n", 307 | "textract_client = boto3.client('textract')\n", 308 | "\n", 309 | "response = textract_client.create_adapter(\n", 310 | " AdapterName='checks-adapter-api',\n", 311 | " Description='Adapter for checks processing created via the API',\n", 312 | " FeatureTypes=['QUERIES'],\n", 313 | " AutoUpdate='ENABLED',\n", 314 | " Tags={\n", 315 | " 'project': 'checks-automation'\n", 316 | " }\n", 317 | ")\n", 318 | "\n", 319 | "adapter_id = response[\"AdapterId\"]\n", 320 | "print(f\"Adapter created with adapter id: {adapter_id}\")" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "id": "16dd16ab-9f5f-46e1-aeee-5c2fc80b824f", 327 | "metadata": { 328 | "tags": [] 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "response = textract_client.list_adapters()\n", 333 | "display(pd.DataFrame(response[\"Adapters\"]))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "id": "9189f533-0fa3-4517-a733-48b8757589d9", 339 | "metadata": {}, 340 | "source": [ 341 | "### Step 2.2: Update and copy the document samples, manifest file and annotations to S3\n", 342 | "1. Provide the S3 bucket where you would like to store the test and train datasets \n", 343 | "2. We copy the manifest file, annotations and samples to the bucket with the newly created Adapter ID as the key prefix." 
344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "id": "105e8ee3-1587-4a79-ae46-2042a8d54ad4", 350 | "metadata": { 351 | "tags": [] 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "dataset_s3_bucket = \"enter-s3-bucket\"\n", 356 | "\n", 357 | "# We use the same bucket for the output as the dataset bucket, with a different prefix. You can change this as required\n", 358 | "output_s3_bucket = dataset_s3_bucket" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "id": "9417c052-d188-4e16-8250-6dc2069407a4", 365 | "metadata": { 366 | "tags": [] 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "import shutil\n", 371 | "shutil.unpack_archive(\"./samples/checks-annotations.zip\", extract_dir=adapter_id)\n", 372 | "print(f\"Check samples archive extracted successfully to folder {adapter_id}\")\n", 373 | "\n", 374 | "!sed -i -e \"s/<S3_BUCKET>/$dataset_s3_bucket/g;s/<ADAPTER_ID>/$adapter_id/g\" \"./$adapter_id/checks-annotations/manifest.jsonl\"\n", 375 | "print(f\"Replaced all instances of the adapter ID with {adapter_id} and S3 BUCKET with {dataset_s3_bucket}\")" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "id": "87575166-5842-4d34-8141-cba15d0f81be", 382 | "metadata": { 383 | "tags": [] 384 | }, 385 | "outputs": [], 386 | "source": [ 387 | "!aws s3 cp \"./$adapter_id/checks-annotations\" \"s3://$dataset_s3_bucket/adapters/$adapter_id\" --recursive\n", 388 | "print(\"\\nSuccessfully copied all files\")" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "id": "ab09eaef-d7ca-44ad-9813-ac6f77179cd4", 394 | "metadata": {}, 395 | "source": [ 396 | "### Step 2.3: Begin training the Adapter by calling CreateAdapterVersion\n", 397 | "To begin training, we call the CreateAdapterVersion API" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "id": "daab8a79-2d4b-4686-8b7d-d5dac64637c0", 404 | "metadata": {}, 405 | 
"outputs": [], 406 | "source": [ 407 | "manifest_file_name=f\"adapters/{adapter_id}/manifest.jsonl\"\n", 408 | "output_config_prefix=f\"adapters-output/{adapter_id}/\"\n", 409 | "\n", 410 | "response = textract_client.create_adapter_version(\n", 411 | " AdapterId=adapter_id,\n", 412 | " DatasetConfig={\n", 413 | " 'ManifestS3Object': {\n", 414 | " 'Bucket': dataset_s3_bucket,\n", 415 | " 'Name': manifest_file_name\n", 416 | " }\n", 417 | " },\n", 418 | " OutputConfig={\n", 419 | " 'S3Bucket': output_s3_bucket,\n", 420 | " 'S3Prefix': output_config_prefix\n", 421 | " },\n", 422 | " Tags={\n", 423 | " 'project': 'checks-automation'\n", 424 | " }\n", 425 | ")\n", 426 | "\n", 427 | "adapter_version = response[\"AdapterVersion\"]\n", 428 | "print(f\"Started training AdapterVersion: {adapter_version} for AdapterId: {adapter_id}\")" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "id": "6eb56c5f-c3ea-4ce7-b920-29d3c49b7e3c", 434 | "metadata": {}, 435 | "source": [ 436 | "### Step 2.4: List all the adapter versions in your AWS Account\n", 437 | "You will see a new Adapter ID and Version in the list with the Status as \"CREATION_IN_PROGRESS\". \n", 438 | "Training can take 1 to 30 hours to complete, however, given our dataset is small, it should complete in an hour or so." 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "id": "69734f55-7255-42de-bea0-b2736e95f6d3", 445 | "metadata": { 446 | "tags": [] 447 | }, 448 | "outputs": [], 449 | "source": [ 450 | "response = textract_client.list_adapter_versions()\n", 451 | "display(pd.DataFrame(response[\"AdapterVersions\"]))" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "id": "ad80a635-9e86-420d-b839-8f8f1c398077", 457 | "metadata": { 458 | "tags": [] 459 | }, 460 | "source": [ 461 | "### Step 2.5: View details of the adapter you just created using GetAdapterVersion\n", 462 | "1. 
This provides you with all the details for the adapter - from the dataset and output config to the evaluation metrics. \n", 463 | "2. As the adapter creation and training is still in progress, you will not see **\"EvaluationMetrics\"** yet. Come back once the training is complete" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "id": "d380157e-3f43-4cea-aa6d-7e29883f2d10", 470 | "metadata": { 471 | "tags": [] 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "import json\n", 476 | "response = textract_client.get_adapter_version(\n", 477 | " AdapterId=adapter_id,\n", 478 | " AdapterVersion=adapter_version\n", 479 | ")\n", 480 | "print(json.dumps(response, indent=4, default=str))" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "id": "79b615e1-4264-4c2b-b443-c0bcdc44be70", 486 | "metadata": {}, 487 | "source": [ 488 | "### Step 2.6: Test the adapter programmatically (API)" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "id": "ba97ad85-cd72-4886-aae7-e81ebf3eeb98", 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "from IPython.display import Image\n", 499 | "\n", 500 | "document_name = f\"{adapter_id}/checks-annotations/original_assets/31eb3f65-babd-4410-b9ea-596c7b35989d.jpg\"\n", 501 | "Image(filename=document_name) " 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "id": "72ce03a5-9612-4138-b917-59406d0c66c0", 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "import boto3\n", 512 | "from textractcaller.t_call import call_textract, Textract_Features, Query, QueriesConfig, Adapter, AdaptersConfig\n", 513 | "import trp.trp2 as t2\n", 514 | "\n", 515 | "textract_client = boto3.client('textract')\n", 516 | "\n", 517 | "def tabulate_query_answers(textract_json):\n", 518 | " d = t2.TDocumentSchema().load(textract_json)\n", 519 | " for page in d.pages:\n", 520 | " query_answers = 
d.get_query_answers(page=page)\n", 521 | " display(pd.DataFrame(query_answers))\n", 522 | "\n", 523 | "queries = []\n", 524 | "queries.append(Query(text=\"What is the check#?\", alias=\"CHECK_NUMBER\", pages=[\"*\"]))\n", 525 | "queries.append(Query(text=\"What is the date?\", alias=\"DATE\", pages=[\"*\"]))\n", 526 | "queries.append(Query(text=\"What is the check amount in words?\", alias=\"CHECK_AMOUNT_WORDS\", pages=[\"*\"]))\n", 527 | "queries.append(Query(text=\"What is the dollar amount?\", alias=\"DOLLAR_AMOUNT\", pages=[\"*\"]))\n", 528 | "queries.append(Query(text=\"Who is the payee?\", alias=\"PAYEE_NAME\", pages=[\"*\"]))\n", 529 | "queries.append(Query(text=\"What is the customer account#\", alias=\"ACCOUNT_NUMBER\", pages=[\"*\"]))\n", 530 | "queries.append(Query(text=\"what is the payee address?\", alias=\"PAYEE_ADDRESS\", pages=[\"*\"]))\n", 531 | "queries.append(Query(text=\"What is the bank routing number?\", alias=\"BANK_ROUTING_NUMBER\", pages=[\"*\"]))\n", 532 | "queries.append(Query(text=\"What is the memo\", alias=\"MEMO\", pages=[\"*\"]))\n", 533 | "queries.append(Query(text=\"What is the account name/payer/drawer name?\", alias=\"ACCOUNT_NAME\", pages=[\"*\"]))\n", 534 | "queries.append(Query(text=\"What is the bank name/drawee name?\", alias=\"BANK_NAME\", pages=[\"*\"]))\n", 535 | "queries.append(Query(text=\"What is the MICR line?\", alias=\"MICR_LINE\", pages=[\"*\"]))\n", 536 | "\n", 537 | "\n", 538 | "queries_config = QueriesConfig(queries=queries)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "id": "44d2af5b-57af-4f82-af0b-9efcb4a05767", 545 | "metadata": {}, 546 | "outputs": [], 547 | "source": [ 548 | "print(\"Calling Pre-built Textract Queries\")\n", 549 | "\n", 550 | "textract_json_prebuilt = call_textract(input_document=document_name,\n", 551 | " boto3_textract_client=textract_client,\n", 552 | " features=[Textract_Features.QUERIES],\n", 553 | " queries_config=queries_config)\n", 554 | 
"\n", 555 | "tabulate_query_answers(textract_json_prebuilt)" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "id": "217106a3-15e8-4913-a8a9-ddc7307290c7", 562 | "metadata": { 563 | "tags": [] 564 | }, 565 | "outputs": [], 566 | "source": [ 567 | "adapter1 = Adapter(adapter_id=adapter_id, version=adapter_version, pages=[\"*\"])\n", 568 | "adapters_config = AdaptersConfig(adapters=[adapter1])\n", 569 | "print(f\"Calling Custom Queries with Adapter: {adapter_id} and AdapterVersion: {adapter_version}\")\n", 570 | "\n", 571 | "textract_json_with_adapter = call_textract(input_document=document_name,\n", 572 | " boto3_textract_client=textract_client,\n", 573 | " features=[Textract_Features.QUERIES],\n", 574 | " queries_config=queries_config,\n", 575 | " adapters_config=adapters_config)\n", 576 | "\n", 577 | "tabulate_query_answers(textract_json_with_adapter)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "id": "4e253825-5e92-4ee9-bda3-4cd005247851", 583 | "metadata": {}, 584 | "source": [ 585 | "### Step 2.7 : Clean-up resources\n", 586 | "You can choose to delete the adapter version or the adapter. \n", 587 | "When deleting the entire adapter, you must delete all adapter versions first and then proceed to delete the adapter. 
" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "id": "08ae063e-b992-4816-b475-d39a8595dc65", 594 | "metadata": { 595 | "tags": [] 596 | }, 597 | "outputs": [], 598 | "source": [ 599 | "response = textract_client.delete_adapter_version(\n", 600 | " AdapterId=adapter_id,\n", 601 | " AdapterVersion=adapter_version\n", 602 | ")\n", 603 | "if response[\"ResponseMetadata\"][\"HTTPStatusCode\"] == 200:\n", 604 | " print(f\"Adapter Version: {adapter_version} successfully deleted\")\n", 605 | "\n", 606 | "\n", 607 | "response = textract_client.delete_adapter(\n", 608 | " AdapterId=adapter_id\n", 609 | ")\n", 610 | "if response[\"ResponseMetadata\"][\"HTTPStatusCode\"] == 200:\n", 611 | " print(f\"Adapter ID: {adapter_id} successfully deleted\")\n" 612 | ] 613 | } 614 | ], 615 | "metadata": { 616 | "availableInstances": [ 617 | { 618 | "_defaultOrder": 0, 619 | "_isFastLaunch": true, 620 | "category": "General purpose", 621 | "gpuNum": 0, 622 | "hideHardwareSpecs": false, 623 | "memoryGiB": 4, 624 | "name": "ml.t3.medium", 625 | "vcpuNum": 2 626 | }, 627 | { 628 | "_defaultOrder": 1, 629 | "_isFastLaunch": false, 630 | "category": "General purpose", 631 | "gpuNum": 0, 632 | "hideHardwareSpecs": false, 633 | "memoryGiB": 8, 634 | "name": "ml.t3.large", 635 | "vcpuNum": 2 636 | }, 637 | { 638 | "_defaultOrder": 2, 639 | "_isFastLaunch": false, 640 | "category": "General purpose", 641 | "gpuNum": 0, 642 | "hideHardwareSpecs": false, 643 | "memoryGiB": 16, 644 | "name": "ml.t3.xlarge", 645 | "vcpuNum": 4 646 | }, 647 | { 648 | "_defaultOrder": 3, 649 | "_isFastLaunch": false, 650 | "category": "General purpose", 651 | "gpuNum": 0, 652 | "hideHardwareSpecs": false, 653 | "memoryGiB": 32, 654 | "name": "ml.t3.2xlarge", 655 | "vcpuNum": 8 656 | }, 657 | { 658 | "_defaultOrder": 4, 659 | "_isFastLaunch": true, 660 | "category": "General purpose", 661 | "gpuNum": 0, 662 | "hideHardwareSpecs": false, 663 | "memoryGiB": 8, 664 | "name": 
"ml.m5.large", 665 | "vcpuNum": 2 666 | }, 667 | { 668 | "_defaultOrder": 5, 669 | "_isFastLaunch": false, 670 | "category": "General purpose", 671 | "gpuNum": 0, 672 | "hideHardwareSpecs": false, 673 | "memoryGiB": 16, 674 | "name": "ml.m5.xlarge", 675 | "vcpuNum": 4 676 | }, 677 | { 678 | "_defaultOrder": 6, 679 | "_isFastLaunch": false, 680 | "category": "General purpose", 681 | "gpuNum": 0, 682 | "hideHardwareSpecs": false, 683 | "memoryGiB": 32, 684 | "name": "ml.m5.2xlarge", 685 | "vcpuNum": 8 686 | }, 687 | { 688 | "_defaultOrder": 7, 689 | "_isFastLaunch": false, 690 | "category": "General purpose", 691 | "gpuNum": 0, 692 | "hideHardwareSpecs": false, 693 | "memoryGiB": 64, 694 | "name": "ml.m5.4xlarge", 695 | "vcpuNum": 16 696 | }, 697 | { 698 | "_defaultOrder": 8, 699 | "_isFastLaunch": false, 700 | "category": "General purpose", 701 | "gpuNum": 0, 702 | "hideHardwareSpecs": false, 703 | "memoryGiB": 128, 704 | "name": "ml.m5.8xlarge", 705 | "vcpuNum": 32 706 | }, 707 | { 708 | "_defaultOrder": 9, 709 | "_isFastLaunch": false, 710 | "category": "General purpose", 711 | "gpuNum": 0, 712 | "hideHardwareSpecs": false, 713 | "memoryGiB": 192, 714 | "name": "ml.m5.12xlarge", 715 | "vcpuNum": 48 716 | }, 717 | { 718 | "_defaultOrder": 10, 719 | "_isFastLaunch": false, 720 | "category": "General purpose", 721 | "gpuNum": 0, 722 | "hideHardwareSpecs": false, 723 | "memoryGiB": 256, 724 | "name": "ml.m5.16xlarge", 725 | "vcpuNum": 64 726 | }, 727 | { 728 | "_defaultOrder": 11, 729 | "_isFastLaunch": false, 730 | "category": "General purpose", 731 | "gpuNum": 0, 732 | "hideHardwareSpecs": false, 733 | "memoryGiB": 384, 734 | "name": "ml.m5.24xlarge", 735 | "vcpuNum": 96 736 | }, 737 | { 738 | "_defaultOrder": 12, 739 | "_isFastLaunch": false, 740 | "category": "General purpose", 741 | "gpuNum": 0, 742 | "hideHardwareSpecs": false, 743 | "memoryGiB": 8, 744 | "name": "ml.m5d.large", 745 | "vcpuNum": 2 746 | }, 747 | { 748 | "_defaultOrder": 13, 749 | 
"_isFastLaunch": false, 750 | "category": "General purpose", 751 | "gpuNum": 0, 752 | "hideHardwareSpecs": false, 753 | "memoryGiB": 16, 754 | "name": "ml.m5d.xlarge", 755 | "vcpuNum": 4 756 | }, 757 | { 758 | "_defaultOrder": 14, 759 | "_isFastLaunch": false, 760 | "category": "General purpose", 761 | "gpuNum": 0, 762 | "hideHardwareSpecs": false, 763 | "memoryGiB": 32, 764 | "name": "ml.m5d.2xlarge", 765 | "vcpuNum": 8 766 | }, 767 | { 768 | "_defaultOrder": 15, 769 | "_isFastLaunch": false, 770 | "category": "General purpose", 771 | "gpuNum": 0, 772 | "hideHardwareSpecs": false, 773 | "memoryGiB": 64, 774 | "name": "ml.m5d.4xlarge", 775 | "vcpuNum": 16 776 | }, 777 | { 778 | "_defaultOrder": 16, 779 | "_isFastLaunch": false, 780 | "category": "General purpose", 781 | "gpuNum": 0, 782 | "hideHardwareSpecs": false, 783 | "memoryGiB": 128, 784 | "name": "ml.m5d.8xlarge", 785 | "vcpuNum": 32 786 | }, 787 | { 788 | "_defaultOrder": 17, 789 | "_isFastLaunch": false, 790 | "category": "General purpose", 791 | "gpuNum": 0, 792 | "hideHardwareSpecs": false, 793 | "memoryGiB": 192, 794 | "name": "ml.m5d.12xlarge", 795 | "vcpuNum": 48 796 | }, 797 | { 798 | "_defaultOrder": 18, 799 | "_isFastLaunch": false, 800 | "category": "General purpose", 801 | "gpuNum": 0, 802 | "hideHardwareSpecs": false, 803 | "memoryGiB": 256, 804 | "name": "ml.m5d.16xlarge", 805 | "vcpuNum": 64 806 | }, 807 | { 808 | "_defaultOrder": 19, 809 | "_isFastLaunch": false, 810 | "category": "General purpose", 811 | "gpuNum": 0, 812 | "hideHardwareSpecs": false, 813 | "memoryGiB": 384, 814 | "name": "ml.m5d.24xlarge", 815 | "vcpuNum": 96 816 | }, 817 | { 818 | "_defaultOrder": 20, 819 | "_isFastLaunch": false, 820 | "category": "General purpose", 821 | "gpuNum": 0, 822 | "hideHardwareSpecs": true, 823 | "memoryGiB": 0, 824 | "name": "ml.geospatial.interactive", 825 | "supportedImageNames": [ 826 | "sagemaker-geospatial-v1-0" 827 | ], 828 | "vcpuNum": 0 829 | }, 830 | { 831 | "_defaultOrder": 21, 832 | 
"_isFastLaunch": true, 833 | "category": "Compute optimized", 834 | "gpuNum": 0, 835 | "hideHardwareSpecs": false, 836 | "memoryGiB": 4, 837 | "name": "ml.c5.large", 838 | "vcpuNum": 2 839 | }, 840 | { 841 | "_defaultOrder": 22, 842 | "_isFastLaunch": false, 843 | "category": "Compute optimized", 844 | "gpuNum": 0, 845 | "hideHardwareSpecs": false, 846 | "memoryGiB": 8, 847 | "name": "ml.c5.xlarge", 848 | "vcpuNum": 4 849 | }, 850 | { 851 | "_defaultOrder": 23, 852 | "_isFastLaunch": false, 853 | "category": "Compute optimized", 854 | "gpuNum": 0, 855 | "hideHardwareSpecs": false, 856 | "memoryGiB": 16, 857 | "name": "ml.c5.2xlarge", 858 | "vcpuNum": 8 859 | }, 860 | { 861 | "_defaultOrder": 24, 862 | "_isFastLaunch": false, 863 | "category": "Compute optimized", 864 | "gpuNum": 0, 865 | "hideHardwareSpecs": false, 866 | "memoryGiB": 32, 867 | "name": "ml.c5.4xlarge", 868 | "vcpuNum": 16 869 | }, 870 | { 871 | "_defaultOrder": 25, 872 | "_isFastLaunch": false, 873 | "category": "Compute optimized", 874 | "gpuNum": 0, 875 | "hideHardwareSpecs": false, 876 | "memoryGiB": 72, 877 | "name": "ml.c5.9xlarge", 878 | "vcpuNum": 36 879 | }, 880 | { 881 | "_defaultOrder": 26, 882 | "_isFastLaunch": false, 883 | "category": "Compute optimized", 884 | "gpuNum": 0, 885 | "hideHardwareSpecs": false, 886 | "memoryGiB": 96, 887 | "name": "ml.c5.12xlarge", 888 | "vcpuNum": 48 889 | }, 890 | { 891 | "_defaultOrder": 27, 892 | "_isFastLaunch": false, 893 | "category": "Compute optimized", 894 | "gpuNum": 0, 895 | "hideHardwareSpecs": false, 896 | "memoryGiB": 144, 897 | "name": "ml.c5.18xlarge", 898 | "vcpuNum": 72 899 | }, 900 | { 901 | "_defaultOrder": 28, 902 | "_isFastLaunch": false, 903 | "category": "Compute optimized", 904 | "gpuNum": 0, 905 | "hideHardwareSpecs": false, 906 | "memoryGiB": 192, 907 | "name": "ml.c5.24xlarge", 908 | "vcpuNum": 96 909 | }, 910 | { 911 | "_defaultOrder": 29, 912 | "_isFastLaunch": true, 913 | "category": "Accelerated computing", 914 | "gpuNum": 
1, 915 | "hideHardwareSpecs": false, 916 | "memoryGiB": 16, 917 | "name": "ml.g4dn.xlarge", 918 | "vcpuNum": 4 919 | }, 920 | { 921 | "_defaultOrder": 30, 922 | "_isFastLaunch": false, 923 | "category": "Accelerated computing", 924 | "gpuNum": 1, 925 | "hideHardwareSpecs": false, 926 | "memoryGiB": 32, 927 | "name": "ml.g4dn.2xlarge", 928 | "vcpuNum": 8 929 | }, 930 | { 931 | "_defaultOrder": 31, 932 | "_isFastLaunch": false, 933 | "category": "Accelerated computing", 934 | "gpuNum": 1, 935 | "hideHardwareSpecs": false, 936 | "memoryGiB": 64, 937 | "name": "ml.g4dn.4xlarge", 938 | "vcpuNum": 16 939 | }, 940 | { 941 | "_defaultOrder": 32, 942 | "_isFastLaunch": false, 943 | "category": "Accelerated computing", 944 | "gpuNum": 1, 945 | "hideHardwareSpecs": false, 946 | "memoryGiB": 128, 947 | "name": "ml.g4dn.8xlarge", 948 | "vcpuNum": 32 949 | }, 950 | { 951 | "_defaultOrder": 33, 952 | "_isFastLaunch": false, 953 | "category": "Accelerated computing", 954 | "gpuNum": 4, 955 | "hideHardwareSpecs": false, 956 | "memoryGiB": 192, 957 | "name": "ml.g4dn.12xlarge", 958 | "vcpuNum": 48 959 | }, 960 | { 961 | "_defaultOrder": 34, 962 | "_isFastLaunch": false, 963 | "category": "Accelerated computing", 964 | "gpuNum": 1, 965 | "hideHardwareSpecs": false, 966 | "memoryGiB": 256, 967 | "name": "ml.g4dn.16xlarge", 968 | "vcpuNum": 64 969 | }, 970 | { 971 | "_defaultOrder": 35, 972 | "_isFastLaunch": false, 973 | "category": "Accelerated computing", 974 | "gpuNum": 1, 975 | "hideHardwareSpecs": false, 976 | "memoryGiB": 61, 977 | "name": "ml.p3.2xlarge", 978 | "vcpuNum": 8 979 | }, 980 | { 981 | "_defaultOrder": 36, 982 | "_isFastLaunch": false, 983 | "category": "Accelerated computing", 984 | "gpuNum": 4, 985 | "hideHardwareSpecs": false, 986 | "memoryGiB": 244, 987 | "name": "ml.p3.8xlarge", 988 | "vcpuNum": 32 989 | }, 990 | { 991 | "_defaultOrder": 37, 992 | "_isFastLaunch": false, 993 | "category": "Accelerated computing", 994 | "gpuNum": 8, 995 | "hideHardwareSpecs": 
false, 996 | "memoryGiB": 488, 997 | "name": "ml.p3.16xlarge", 998 | "vcpuNum": 64 999 | }, 1000 | { 1001 | "_defaultOrder": 38, 1002 | "_isFastLaunch": false, 1003 | "category": "Accelerated computing", 1004 | "gpuNum": 8, 1005 | "hideHardwareSpecs": false, 1006 | "memoryGiB": 768, 1007 | "name": "ml.p3dn.24xlarge", 1008 | "vcpuNum": 96 1009 | }, 1010 | { 1011 | "_defaultOrder": 39, 1012 | "_isFastLaunch": false, 1013 | "category": "Memory Optimized", 1014 | "gpuNum": 0, 1015 | "hideHardwareSpecs": false, 1016 | "memoryGiB": 16, 1017 | "name": "ml.r5.large", 1018 | "vcpuNum": 2 1019 | }, 1020 | { 1021 | "_defaultOrder": 40, 1022 | "_isFastLaunch": false, 1023 | "category": "Memory Optimized", 1024 | "gpuNum": 0, 1025 | "hideHardwareSpecs": false, 1026 | "memoryGiB": 32, 1027 | "name": "ml.r5.xlarge", 1028 | "vcpuNum": 4 1029 | }, 1030 | { 1031 | "_defaultOrder": 41, 1032 | "_isFastLaunch": false, 1033 | "category": "Memory Optimized", 1034 | "gpuNum": 0, 1035 | "hideHardwareSpecs": false, 1036 | "memoryGiB": 64, 1037 | "name": "ml.r5.2xlarge", 1038 | "vcpuNum": 8 1039 | }, 1040 | { 1041 | "_defaultOrder": 42, 1042 | "_isFastLaunch": false, 1043 | "category": "Memory Optimized", 1044 | "gpuNum": 0, 1045 | "hideHardwareSpecs": false, 1046 | "memoryGiB": 128, 1047 | "name": "ml.r5.4xlarge", 1048 | "vcpuNum": 16 1049 | }, 1050 | { 1051 | "_defaultOrder": 43, 1052 | "_isFastLaunch": false, 1053 | "category": "Memory Optimized", 1054 | "gpuNum": 0, 1055 | "hideHardwareSpecs": false, 1056 | "memoryGiB": 256, 1057 | "name": "ml.r5.8xlarge", 1058 | "vcpuNum": 32 1059 | }, 1060 | { 1061 | "_defaultOrder": 44, 1062 | "_isFastLaunch": false, 1063 | "category": "Memory Optimized", 1064 | "gpuNum": 0, 1065 | "hideHardwareSpecs": false, 1066 | "memoryGiB": 384, 1067 | "name": "ml.r5.12xlarge", 1068 | "vcpuNum": 48 1069 | }, 1070 | { 1071 | "_defaultOrder": 45, 1072 | "_isFastLaunch": false, 1073 | "category": "Memory Optimized", 1074 | "gpuNum": 0, 1075 | "hideHardwareSpecs": 
false, 1076 | "memoryGiB": 512, 1077 | "name": "ml.r5.16xlarge", 1078 | "vcpuNum": 64 1079 | }, 1080 | { 1081 | "_defaultOrder": 46, 1082 | "_isFastLaunch": false, 1083 | "category": "Memory Optimized", 1084 | "gpuNum": 0, 1085 | "hideHardwareSpecs": false, 1086 | "memoryGiB": 768, 1087 | "name": "ml.r5.24xlarge", 1088 | "vcpuNum": 96 1089 | }, 1090 | { 1091 | "_defaultOrder": 47, 1092 | "_isFastLaunch": false, 1093 | "category": "Accelerated computing", 1094 | "gpuNum": 1, 1095 | "hideHardwareSpecs": false, 1096 | "memoryGiB": 16, 1097 | "name": "ml.g5.xlarge", 1098 | "vcpuNum": 4 1099 | }, 1100 | { 1101 | "_defaultOrder": 48, 1102 | "_isFastLaunch": false, 1103 | "category": "Accelerated computing", 1104 | "gpuNum": 1, 1105 | "hideHardwareSpecs": false, 1106 | "memoryGiB": 32, 1107 | "name": "ml.g5.2xlarge", 1108 | "vcpuNum": 8 1109 | }, 1110 | { 1111 | "_defaultOrder": 49, 1112 | "_isFastLaunch": false, 1113 | "category": "Accelerated computing", 1114 | "gpuNum": 1, 1115 | "hideHardwareSpecs": false, 1116 | "memoryGiB": 64, 1117 | "name": "ml.g5.4xlarge", 1118 | "vcpuNum": 16 1119 | }, 1120 | { 1121 | "_defaultOrder": 50, 1122 | "_isFastLaunch": false, 1123 | "category": "Accelerated computing", 1124 | "gpuNum": 1, 1125 | "hideHardwareSpecs": false, 1126 | "memoryGiB": 128, 1127 | "name": "ml.g5.8xlarge", 1128 | "vcpuNum": 32 1129 | }, 1130 | { 1131 | "_defaultOrder": 51, 1132 | "_isFastLaunch": false, 1133 | "category": "Accelerated computing", 1134 | "gpuNum": 1, 1135 | "hideHardwareSpecs": false, 1136 | "memoryGiB": 256, 1137 | "name": "ml.g5.16xlarge", 1138 | "vcpuNum": 64 1139 | }, 1140 | { 1141 | "_defaultOrder": 52, 1142 | "_isFastLaunch": false, 1143 | "category": "Accelerated computing", 1144 | "gpuNum": 4, 1145 | "hideHardwareSpecs": false, 1146 | "memoryGiB": 192, 1147 | "name": "ml.g5.12xlarge", 1148 | "vcpuNum": 48 1149 | }, 1150 | { 1151 | "_defaultOrder": 53, 1152 | "_isFastLaunch": false, 1153 | "category": "Accelerated computing", 1154 | 
"gpuNum": 4, 1155 | "hideHardwareSpecs": false, 1156 | "memoryGiB": 384, 1157 | "name": "ml.g5.24xlarge", 1158 | "vcpuNum": 96 1159 | }, 1160 | { 1161 | "_defaultOrder": 54, 1162 | "_isFastLaunch": false, 1163 | "category": "Accelerated computing", 1164 | "gpuNum": 8, 1165 | "hideHardwareSpecs": false, 1166 | "memoryGiB": 768, 1167 | "name": "ml.g5.48xlarge", 1168 | "vcpuNum": 192 1169 | }, 1170 | { 1171 | "_defaultOrder": 55, 1172 | "_isFastLaunch": false, 1173 | "category": "Accelerated computing", 1174 | "gpuNum": 8, 1175 | "hideHardwareSpecs": false, 1176 | "memoryGiB": 1152, 1177 | "name": "ml.p4d.24xlarge", 1178 | "vcpuNum": 96 1179 | }, 1180 | { 1181 | "_defaultOrder": 56, 1182 | "_isFastLaunch": false, 1183 | "category": "Accelerated computing", 1184 | "gpuNum": 8, 1185 | "hideHardwareSpecs": false, 1186 | "memoryGiB": 1152, 1187 | "name": "ml.p4de.24xlarge", 1188 | "vcpuNum": 96 1189 | } 1190 | ], 1191 | "instance_type": "ml.t3.medium", 1192 | "kernelspec": { 1193 | "display_name": "Python 3 (Data Science 3.0)", 1194 | "language": "python", 1195 | "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/sagemaker-data-science-310-v1" 1196 | }, 1197 | "language_info": { 1198 | "codemirror_mode": { 1199 | "name": "ipython", 1200 | "version": 3 1201 | }, 1202 | "file_extension": ".py", 1203 | "mimetype": "text/x-python", 1204 | "name": "python", 1205 | "nbconvert_exporter": "python", 1206 | "pygments_lexer": "ipython3", 1207 | "version": "3.10.6" 1208 | } 1209 | }, 1210 | "nbformat": 4, 1211 | "nbformat_minor": 5 1212 | } 1213 | -------------------------------------------------------------------------------- /python/custom-queries/samples/checks-annotations.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/samples/checks-annotations.zip 
-------------------------------------------------------------------------------- /python/custom-queries/samples/checks-samples.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/samples/checks-samples.zip -------------------------------------------------------------------------------- /python/custom-queries/screenshots/checks-notebook-step1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/screenshots/checks-notebook-step1.png -------------------------------------------------------------------------------- /python/custom-queries/screenshots/checks-notebook-step2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/screenshots/checks-notebook-step2.png -------------------------------------------------------------------------------- /python/custom-queries/screenshots/checks-notebook-step5_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/screenshots/checks-notebook-step5_1.png -------------------------------------------------------------------------------- /python/custom-queries/screenshots/checks-notebook-step6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/screenshots/checks-notebook-step6.png 
-------------------------------------------------------------------------------- /python/custom-queries/screenshots/checks-notebook-step7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/screenshots/checks-notebook-step7.png -------------------------------------------------------------------------------- /python/custom-queries/screenshots/checks-notebook-step8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/custom-queries/screenshots/checks-notebook-step8.png -------------------------------------------------------------------------------- /python/employmentapp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/employmentapp.png -------------------------------------------------------------------------------- /python/expense.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/expense.png -------------------------------------------------------------------------------- /python/extraction-parsers/samples/CMS1500-sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/extraction-parsers/samples/CMS1500-sample.png -------------------------------------------------------------------------------- /python/extraction-parsers/samples/ub-04-Form-sample.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/extraction-parsers/samples/ub-04-Form-sample.png -------------------------------------------------------------------------------- /python/extraction-parsers/ub04-parser.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Parser for UB04 or CMS-1450 Form\n", 8 | "This notebook will walk you through sample code to parse the UB04 or CMS-1450 Form. \n", 9 | "The CMS-1450 form (aka UB-04) is used by institutional providers to bill a Medicare fiscal intermediary when a provider qualifies for a waiver from the Administrative Simplification Compliance Act requirement for electronic submission of claims.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 43, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "!python -m pip install amazon-textract-caller --upgrade\n", 19 | "!python -m pip install amazon-textract-response-parser --upgrade" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import boto3, json\n", 29 | "from textractcaller.t_call import call_textract, Textract_Features, Query, QueriesConfig, Adapter, AdaptersConfig\n", 30 | "\n", 31 | "import pandas as pd\n", 32 | "import trp\n", 33 | "from trp import Document\n", 34 | "import trp.trp2 as t2\n", 35 | "from trp.trp2 import TDocument, TDocumentSchema, TBlock, TGeometry, TBoundingBox, TPoint\n", 36 | "from trp.t_pipeline import order_blocks_by_geo_x_y\n", 37 | "from textractprettyprinter.t_pretty_print import Pretty_Print_Table_Format, Textract_Pretty_Print, get_forms_string, convert_table_to_kv_dict, convert_table_to_list\n", 38 | "\n", 39 | "\n", 40 | 
"\n", 41 | "session = boto3.Session(profile_name='kmascar+training-Admin')\n", 42 | "textract = boto3.client('textract')\n", 43 | "\n", 44 | "textract_json = call_textract(input_document=\"samples/ub-04-Form-sample.png\", features = [Textract_Features.FORMS, Textract_Features.TABLES], boto3_textract_client=textract)\n", 45 | "print(json.dumps(textract_json, indent=2))\n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Analyzing the UB04 Textract JSON Response: Order of elements\n", 53 | "On analyzing the structured JSON output, you will notice that the order of the response is not in the reading order. To order this correctly, we will use the `order_blocks_by_geo_x_y` function." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 56, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "|----------------------------------|---------------|\n", 66 | "| Key | Value |\n", 67 | "| 1 | |\n", 68 | "| 2 | |\n", 69 | "| 3a PAT. CNTL # | |\n", 70 | "| 4 TYPE OF BILL | |\n", 71 | "| b. MED. REC. # | |\n", 72 | "| 7 | |\n", 73 | "| 5 FED. TAX NO. 
| |\n", 74 | "| FROM | |\n", 75 | "| THROUGH | |\n", 76 | "| a | |\n", 77 | "| a | |\n", 78 | "| b | |\n", 79 | "| b | |\n", 80 | "| C | |\n", 81 | "| d | |\n", 82 | "| e | |\n", 83 | "| 29 ACDT STATE | |\n", 84 | "| 30 | |\n", 85 | "| 10 BIRTHDATE | |\n", 86 | "| 11 SEX | |\n", 87 | "| 16 DHR | |\n", 88 | "| 17 STAT | |\n", 89 | "| 12 DATE | |\n", 90 | "| 13 HR | |\n", 91 | "| 14 TYPE | |\n", 92 | "| 15 SRC | |\n", 93 | "| 18 | NOT_SELECTED |\n", 94 | "| 19 | |\n", 95 | "| 20 | |\n", 96 | "| 21 | |\n", 97 | "| 22 | |\n", 98 | "| 23 | |\n", 99 | "| 24 | |\n", 100 | "| 25 | |\n", 101 | "| 26 | |\n", 102 | "| 27 | |\n", 103 | "| 28 | |\n", 104 | "| 31 CODE | 04 05 |\n", 105 | "| OCCURRENCE DATE | 111111 222222 |\n", 106 | "| 32 CODE | 06 07 |\n", 107 | "| OCCURRENCE DATE | 333333 444444 |\n", 108 | "| 33 CODE | |\n", 109 | "| OCCURRENCE DATE | |\n", 110 | "| CODE | |\n", 111 | "| OCCURRENCE DATE | |\n", 112 | "| 35 CODE | |\n", 113 | "| OCCURRENCE FROM | |\n", 114 | "| SPAN THROUGH | |\n", 115 | "| 36 CODE | |\n", 116 | "| OCCURRENCE FROM | |\n", 117 | "| SPAN THROUGH | |\n", 118 | "| 37 | |\n", 119 | "| 38 | |\n", 120 | "| 39 CODE | |\n", 121 | "| VALUE CODES AMOUNT | |\n", 122 | "| CODE | |\n", 123 | "| VALUE CODES AMOUNT | |\n", 124 | "| 41 CODE | |\n", 125 | "| VALUE CODES AMOUNT | |\n", 126 | "| 42 REV. CD. | |\n", 127 | "| 43 DESCRIPTION | |\n", 128 | "| 44 HCPCS RATE HIPPS CODE | |\n", 129 | "| 45 SERV. DATE | |\n", 130 | "| 46 SERV. UNITS | |\n", 131 | "| 47 TOTAL CHARGES | |\n", 132 | "| 48 NON-COVERED CHARGES | |\n", 133 | "| 49 | |\n", 134 | "| PAGE | |\n", 135 | "| OF | |\n", 136 | "| CREATION DATE | |\n", 137 | "| TOTALS | |\n", 138 | "| 50 PAYER NAME | |\n", 139 | "| 51 HEALTH PLAN ID | |\n", 140 | "| 52 REL INFO | |\n", 141 | "| 54 PRIOR PAYMENTS | |\n", 142 | "| 55 EST. 
AMOUNT DUE | |\n", 143 | "| 56 NPI | |\n", 144 | "| 57 | |\n", 145 | "| OTHER | |\n", 146 | "| PRV ID | |\n", 147 | "| 58 INSURED'S NAME | |\n", 148 | "| 59 P.REL | |\n", 149 | "| 60 INSURED'S UNIQUE ID | |\n", 150 | "| 61 GROUP NAME | |\n", 151 | "| 62 INSURANCE GROUP NO. | |\n", 152 | "| 63 TREATMENT AUTHORIZATION CODES | |\n", 153 | "| 64 DOCUMENT CONTROL NUMBER | |\n", 154 | "| 65 EMPLOYER NAME | |\n", 155 | "| 66 DX | 67 A |\n", 156 | "| 68 | |\n", 157 | "| 69 ADMIT DX | |\n", 158 | "| 70 PATIENT REASON DX | NOT_SELECTED |\n", 159 | "| 71 PPS CODE | |\n", 160 | "| 72 ECI | NOT_SELECTED |\n", 161 | "| 73 | |\n", 162 | "| 75 | |\n", 163 | "| 76 ATTENDING NPI | |\n", 164 | "| QUAL | |\n", 165 | "| CODE | |\n", 166 | "| DATE | |\n", 167 | "| CODE | |\n", 168 | "| DATE | |\n", 169 | "| CODE | |\n", 170 | "| DATE | |\n", 171 | "| LAST | |\n", 172 | "| FIRST | |\n", 173 | "| 77 OPERATING NPI | |\n", 174 | "| QUAL | |\n", 175 | "| CODE | |\n", 176 | "| DATE | |\n", 177 | "| CODE | |\n", 178 | "| DATE | |\n", 179 | "| CODE | |\n", 180 | "| DATE | |\n", 181 | "| LAST | |\n", 182 | "| FIRST | |\n", 183 | "| 80 REMARKS | |\n", 184 | "| 81CC a | |\n", 185 | "| 78 OTHER | |\n", 186 | "| NPI | |\n", 187 | "| QUAL | |\n", 188 | "| b | |\n", 189 | "| LAST | |\n", 190 | "| FIRST | |\n", 191 | "| C | |\n", 192 | "| 79 OTHER | |\n", 193 | "| NPI | |\n", 194 | "| QUAL | |\n", 195 | "| d | |\n", 196 | "| LAST | |\n", 197 | "| FIRST | |\n", 198 | "\n", 199 | "\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "t_doc = TDocumentSchema().load(textract_json)\n", 205 | "ordered_doc = order_blocks_by_geo_x_y(t_doc)\n", 206 | "print(get_forms_string(TDocumentSchema().dump(ordered_doc)))\n", 207 | "\n", 208 | "trp_doc = trp.Document(TDocumentSchema().dump(ordered_doc))\n", 209 | "\n" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "### Analyzing the UB04 Textract JSON Response: Complexity\n", 217 | "UB04 form is a complex form with 
many identical key values, making it difficult to differentiate between them. Additionally, `8 PATIENT NAME` and `9 PATIENT ADDRESS` both contain fields `a`, `b`, `c` which we would like to map back to their respective sections.\n", 218 | "\n", 219 | "**Utility Functions:** \n", 220 | "We will now walk through utility functions that use our code repositories for Textract GeoFinder, Textract Pretty Printer and Textract Response Parser to parse hierarchical key values that are adjacent to each other or in an area." 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "!python -m pip install amazon-textract-geofinder --upgrade\n", 230 | "!python -m pip install amazon-textract-prettyprinter --upgrade" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 57, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "from textractgeofinder.tgeofinder import KeyValue, TGeoFinder, AreaSelection, SelectionElement\n", 240 | "from enum import Enum, auto\n", 241 | "from typing import List\n", 242 | "\n", 243 | "\n", 244 | "def set_hierarchy_kv(list_kv: list[KeyValue], t_document: TDocument, page_block: TBlock, prefix: str = \"DEFAULT\"):\n", 245 | " for x in list_kv:\n", 246 | " # print(f\"{x.key.original_text} updated to {prefix}_{x.key.original_text}\")\n", 247 | " t_document.add_virtual_key_for_existing_key(key_name=f\"{prefix}_{x.key.original_text}\",\n", 248 | " existing_key=t_document.get_block_by_id(x.key.id),\n", 249 | " page_block=page_block)\n", 250 | "\n", 251 | "def set_adjacent_hkv(geofinder_doc: TGeoFinder, t_document: TDocument, phrase: str, number_of_keys:int=1, direction: str = 'RIGHT', prefix = None):\n", 252 | " list_phrase_tword = geofinder_doc.find_phrase_on_page(phrase)\n", 253 | " for phrase_tword in list_phrase_tword:\n", 254 | " # print(phrase_tword)\n", 255 | " if direction == 'RIGHT':\n", 256 | " form_fields = 
geofinder_doc.get_form_fields_to_the_right(word = phrase_tword, xmax = 1000, number_of_keys = number_of_keys)\n", 257 | " elif direction == 'BELOW':\n", 258 | " form_fields = geofinder_doc.get_form_fields_below(word = phrase_tword, ymax = 1000, number_of_keys = number_of_keys)\n", 259 | " prefix = phrase if prefix is None else prefix\n", 260 | " # print(f\"set_adjacent_hkv, phrasess: {phrase_tword}, form_fields:{form_fields}\")\n", 261 | " set_hierarchy_kv(list_kv=form_fields, t_document=t_document, prefix=prefix, page_block=t_document.pages[0])\n", 262 | "\n", 263 | "class Area_Constraint(Enum):\n", 264 | " WIDTH_PAGE_MIN = auto()\n", 265 | " WIDTH_PAGE_MAX = auto()\n", 266 | " HEIGHT_PAGE_MIN = auto()\n", 267 | " HEIGHT_PAGE_MAX = auto()\n", 268 | " INCLUDE_TOP_LEFT_PHRASE = auto()\n", 269 | " INCLUDE_LOWER_RIGHT_PHRASE = auto()\n", 270 | " \n", 271 | "def set_area_hkv(geofinder_doc: TGeoFinder, t_document: TDocument, top_left_phrase: str, lower_right_phrase: str, area_constraint: List[Area_Constraint]=list(), prefix: str=None):\n", 272 | " top_left_phrase_tword = geofinder_doc.find_phrase_on_page(top_left_phrase)[0]\n", 273 | " lower_right_phrase_tword = geofinder_doc.find_phrase_on_page(lower_right_phrase)[0]\n", 274 | "\n", 275 | " top_left_coord = dict()\n", 276 | " lower_right_coord = dict()\n", 277 | " if area_constraint:\n", 278 | " if Area_Constraint.WIDTH_PAGE_MIN in area_constraint:\n", 279 | " top_left_coord[\"x\"] = 0\n", 280 | " if Area_Constraint.HEIGHT_PAGE_MIN in area_constraint:\n", 281 | " top_left_coord[\"y\"] = 0\n", 282 | " if Area_Constraint.WIDTH_PAGE_MAX in area_constraint:\n", 283 | " lower_right_coord[\"x\"] = geofinder_doc.doc_width\n", 284 | " if Area_Constraint.HEIGHT_PAGE_MAX in area_constraint:\n", 285 | " lower_right_coord[\"y\"] = geofinder_doc.doc_height\n", 286 | " if Area_Constraint.INCLUDE_TOP_LEFT_PHRASE in area_constraint:\n", 287 | " if \"x\" not in top_left_coord:\n", 288 | " top_left_coord[\"x\"] = 
top_left_phrase_tword.xmin\n", 289 | " if \"y\" not in top_left_coord:\n", 290 | " top_left_coord[\"y\"] = top_left_phrase_tword.ymin\n", 291 | " if Area_Constraint.INCLUDE_LOWER_RIGHT_PHRASE in area_constraint:\n", 292 | " if \"x\" not in lower_right_coord:\n", 293 | " lower_right_coord[\"x\"] = lower_right_phrase_tword.xmax\n", 294 | " if \"y\" not in lower_right_coord:\n", 295 | " lower_right_coord[\"y\"] = lower_right_phrase_tword.ymax\n", 296 | "\n", 297 | " top_left_coord.setdefault(\"x\", top_left_phrase_tword.xmax)\n", 298 | " top_left_coord.setdefault(\"y\", top_left_phrase_tword.ymax)\n", 299 | " lower_right_coord.setdefault(\"x\", lower_right_phrase_tword.xmin)\n", 300 | " lower_right_coord.setdefault(\"y\", lower_right_phrase_tword.ymin)\n", 301 | "\n", 302 | " top_left = TPoint(y=top_left_coord[\"y\"], x=top_left_coord[\"x\"])\n", 303 | " lower_right = TPoint(y=lower_right_coord[\"y\"], x=lower_right_coord[\"x\"])\n", 304 | "\n", 305 | " form_fields = geofinder_doc.get_form_fields_in_area(\n", 306 | " area_selection=AreaSelection(top_left=top_left, lower_right=lower_right, page_number = 1))\n", 307 | " prefix = top_left_phrase if prefix is None else prefix\n", 308 | " # print(f\"set_area_hkv, phrases: {top_left_phrase_tword}, {lower_right_phrase_tword}, form_fields:{form_fields}\")\n", 309 | " set_hierarchy_kv(list_kv=form_fields, t_document=t_document, prefix=prefix, page_block=t_document.pages[0])\n", 310 | "\n", 311 | "def get_cell_with_text(geofinder_doc: TGeoFinder, t_document: TDocument, phrase: str):\n", 312 | " list_phrase_tword = geofinder_doc.find_phrase_on_page(phrase)\n", 313 | " # print(list_phrase_tword)\n", 314 | " for phrase_tword in list_phrase_tword:\n", 315 | " # print(\"calling table cells\")\n", 316 | " table_cells = geofinder_doc.get_cells_with_text(word = phrase_tword, number_of_cells = 1)\n", 317 | " # print(\"column_index:\",t_document.get_block_by_id(table_cells[0].id).column_index)\n", 318 | " \n", 319 | " # 
geofinder_doc.get_exact_table(id = \"588328c2-0ed5-44d0-b35d-849b90dfb226\")\n", 320 | " return table_cells[0].id\n", 321 | "\n", 322 | "def convert_table_to_key_value(geofinder_doc: TGeoFinder, t_document: TDocument, phrase: str):\n", 323 | " cell_ids = get_cell_with_text(geofinder_doc=geofinder_doc, t_document=t_document, phrase=phrase)\n", 324 | " table_kv_dict = dict()\n", 325 | " trp_doc = trp.Document(TDocumentSchema().dump(t_doc))\n", 326 | " for page in trp_doc.pages:\n", 327 | " for table in page.tables:\n", 328 | " for r, row in enumerate(table.rows):\n", 329 | " for c, cell in enumerate(row.cells):\n", 330 | " if cell.id in cell_ids:\n", 331 | " table_kv_dict = convert_table_to_kv_dict(table, ignore_table_summary=True)\n", 332 | " print(json.dumps(table_kv_dict, indent=2))\n", 333 | " return table_kv_dict\n" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "### Writing an opinionated parser for UB04\n", 341 | "We will now write an opinionated function `parse_ub04` that uses the appropriate utility function defined above for each field and extracts the output." 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 59, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "def parse_ub04(textract_json):\n", 351 | " t_document = TDocumentSchema().load(textract_json)\n", 352 | " doc_height = 1000\n", 353 | " doc_width = 1000\n", 354 | " geofinder_doc = TGeoFinder(textract_json, doc_height=doc_height, doc_width=doc_width)\n", 355 | "\n", 356 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"3a PAT CNTL\", number_of_keys=1, direction=\"BELOW\", prefix=\"3\")\n", 357 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"6 STATEMENT COVERS PERIOD\", number_of_keys=2, direction=\"BELOW\")\n", 358 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"8 PATIENT NAME\")\n", 359 | " 
set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"8 PATIENT NAME\", direction=\"BELOW\")\n", 360 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"9 PATIENT ADDRESS\")\n", 361 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"9 PATIENT ADDRESS\", direction=\"BELOW\")\n", 362 | "\n", 363 | " area_constraint = [Area_Constraint.INCLUDE_TOP_LEFT_PHRASE, Area_Constraint.WIDTH_PAGE_MAX]\n", 364 | " set_area_hkv(geofinder_doc=geofinder_doc, t_document=t_document, top_left_phrase=\"9 PATIENT ADDRESS\", lower_right_phrase=\"29 ACDT\", area_constraint=area_constraint)\n", 365 | "\n", 366 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"31 CODE\", prefix=\"31\")\n", 367 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"32 CODE\", prefix=\"32\")\n", 368 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"33 CODE\", prefix=\"33\")\n", 369 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"34 CODE\", prefix=\"34\")\n", 370 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"35 CODE\", number_of_keys=2, prefix=\"35\")\n", 371 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"36 CODE\", number_of_keys=2, prefix=\"36\")\n", 372 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"39 CODE\", prefix=\"39\")\n", 373 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"40 CODE\", prefix=\"40\")\n", 374 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"41 CODE\", prefix=\"41\")\n", 375 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"56 NPI 57\", number_of_keys=2, direction=\"BELOW\", prefix=57)\n", 376 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"74 
PRINCIPAL PROCEDURE\", number_of_keys=2, direction=\"BELOW\", prefix=\"74 PRINCIPAL\")\n", 377 | "\n", 378 | " set_area_hkv(geofinder_doc=geofinder_doc, t_document=t_document, top_left_phrase=\"74 PRINCIPAL PROCEDURE\", lower_right_phrase=\"77 OPERATING\", area_constraint=None, prefix=\"74ab\")\n", 379 | "\n", 380 | " area_constraint = [Area_Constraint.WIDTH_PAGE_MIN]\n", 381 | " set_area_hkv(geofinder_doc=geofinder_doc, t_document=t_document, top_left_phrase=\"76 ATTENDING\", lower_right_phrase=\"78 OTHER\", area_constraint=area_constraint, prefix=\"74cde\")\n", 382 | "\n", 383 | " area_constraint = [Area_Constraint.INCLUDE_TOP_LEFT_PHRASE, Area_Constraint.WIDTH_PAGE_MAX]\n", 384 | " set_area_hkv(geofinder_doc=geofinder_doc, t_document=t_document, top_left_phrase=\"76 ATTENDING\", lower_right_phrase=\"77 OPERATING\", area_constraint=area_constraint, prefix=\"76\")\n", 385 | " set_area_hkv(geofinder_doc=geofinder_doc, t_document=t_document, top_left_phrase=\"77 OPERATING\", lower_right_phrase=\"78 OTHER\", area_constraint=area_constraint, prefix=\"77\")\n", 386 | " set_area_hkv(geofinder_doc=geofinder_doc, t_document=t_document, top_left_phrase=\"78 OTHER\", lower_right_phrase=\"79 OTHER\", area_constraint=area_constraint, prefix=\"78\")\n", 387 | "\n", 388 | " area_constraint = [Area_Constraint.INCLUDE_TOP_LEFT_PHRASE, Area_Constraint.WIDTH_PAGE_MAX, Area_Constraint.HEIGHT_PAGE_MAX]\n", 389 | " set_area_hkv(geofinder_doc=geofinder_doc, t_document=t_document, top_left_phrase=\"79 OTHER\", lower_right_phrase=\"LAST\", area_constraint=area_constraint, prefix=\"79\")\n", 390 | "\n", 391 | " set_adjacent_hkv(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"81 CC\", number_of_keys=3, direction=\"BELOW\")\n", 392 | "\n", 393 | "\n", 394 | " convert_table_to_key_value(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"REV CD\")\n", 395 | " convert_table_to_key_value(geofinder_doc=geofinder_doc, t_document=t_document, phrase=\"66 DX\")\n", 396 | "\n", 
397 | " return order_blocks_by_geo_x_y(t_document)\n" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "### Calling the post-processing parser\n", 405 | "Let's call the UB04 parser function and analyze the response." 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 60, 411 | "metadata": {}, 412 | "outputs": [ 413 | { 414 | "name": "stdout", 415 | "output_type": "stream", 416 | "text": [ 417 | "get_cells_with_text: found keys: [TWord(text='42 rev. cd. ', original_text='42 REV. CD. ', text_type='cell', confidence=92.431640625, id='98468515-9121-4f8f-9e28-eacc209de55f', xmin=17, ymin=259, xmax=72, ymax=274, page_number=1, doc_width=1000, doc_height=1000, child_relationships='', reference=None, resolver=None)]\n", 418 | "[\n", 419 | " {\n", 420 | " \"0\": \"\",\n", 421 | " \"42 REV. CD. \": \"\",\n", 422 | " \"43 DESCRIPTION \": \"\",\n", 423 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 424 | " \"45 SERV. DATE \": \"\",\n", 425 | " \"46 SERV. UNITS \": \"\",\n", 426 | " \"47 TOTAL CHARGES \": \"\",\n", 427 | " \"\": \"\",\n", 428 | " \"48 NON-COVERED CHARGES \": \"\",\n", 429 | " \"49 \": \"\"\n", 430 | " },\n", 431 | " {\n", 432 | " \"0\": \"\",\n", 433 | " \"42 REV. CD. \": \"\",\n", 434 | " \"43 DESCRIPTION \": \"\",\n", 435 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 436 | " \"45 SERV. DATE \": \"\",\n", 437 | " \"46 SERV. UNITS \": \"\",\n", 438 | " \"47 TOTAL CHARGES \": \"\",\n", 439 | " \"\": \"\",\n", 440 | " \"48 NON-COVERED CHARGES \": \"\",\n", 441 | " \"49 \": \"\"\n", 442 | " },\n", 443 | " {\n", 444 | " \"0\": \"\",\n", 445 | " \"42 REV. CD. \": \"\",\n", 446 | " \"43 DESCRIPTION \": \"\",\n", 447 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 448 | " \"45 SERV. DATE \": \"\",\n", 449 | " \"46 SERV. 
UNITS \": \"\",\n", 450 | " \"47 TOTAL CHARGES \": \"\",\n", 451 | " \"\": \"\",\n", 452 | " \"48 NON-COVERED CHARGES \": \"\",\n", 453 | " \"49 \": \"\"\n", 454 | " },\n", 455 | " {\n", 456 | " \"0\": \"\",\n", 457 | " \"42 REV. CD. \": \"\",\n", 458 | " \"43 DESCRIPTION \": \"\",\n", 459 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 460 | " \"45 SERV. DATE \": \"\",\n", 461 | " \"46 SERV. UNITS \": \"\",\n", 462 | " \"47 TOTAL CHARGES \": \"\",\n", 463 | " \"\": \"\",\n", 464 | " \"48 NON-COVERED CHARGES \": \"\",\n", 465 | " \"49 \": \"\"\n", 466 | " },\n", 467 | " {\n", 468 | " \"0\": \"\",\n", 469 | " \"42 REV. CD. \": \"\",\n", 470 | " \"43 DESCRIPTION \": \"\",\n", 471 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 472 | " \"45 SERV. DATE \": \"\",\n", 473 | " \"46 SERV. UNITS \": \"\",\n", 474 | " \"47 TOTAL CHARGES \": \"\",\n", 475 | " \"\": \"\",\n", 476 | " \"48 NON-COVERED CHARGES \": \"\",\n", 477 | " \"49 \": \"\"\n", 478 | " },\n", 479 | " {\n", 480 | " \"0\": \"\",\n", 481 | " \"42 REV. CD. \": \"\",\n", 482 | " \"43 DESCRIPTION \": \"\",\n", 483 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 484 | " \"45 SERV. DATE \": \"\",\n", 485 | " \"46 SERV. UNITS \": \"\",\n", 486 | " \"47 TOTAL CHARGES \": \"\",\n", 487 | " \"\": \"\",\n", 488 | " \"48 NON-COVERED CHARGES \": \"\",\n", 489 | " \"49 \": \"\"\n", 490 | " },\n", 491 | " {\n", 492 | " \"0\": \"\",\n", 493 | " \"42 REV. CD. \": \"\",\n", 494 | " \"43 DESCRIPTION \": \"\",\n", 495 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 496 | " \"45 SERV. DATE \": \"\",\n", 497 | " \"46 SERV. UNITS \": \"\",\n", 498 | " \"47 TOTAL CHARGES \": \"\",\n", 499 | " \"\": \"\",\n", 500 | " \"48 NON-COVERED CHARGES \": \"\",\n", 501 | " \"49 \": \"\"\n", 502 | " },\n", 503 | " {\n", 504 | " \"0\": \"\",\n", 505 | " \"42 REV. CD. \": \"\",\n", 506 | " \"43 DESCRIPTION \": \"\",\n", 507 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 508 | " \"45 SERV. DATE \": \"\",\n", 509 | " \"46 SERV. 
UNITS \": \"\",\n", 510 | " \"47 TOTAL CHARGES \": \"\",\n", 511 | " \"\": \"\",\n", 512 | " \"48 NON-COVERED CHARGES \": \"\",\n", 513 | " \"49 \": \"\"\n", 514 | " },\n", 515 | " {\n", 516 | " \"0\": \"\",\n", 517 | " \"42 REV. CD. \": \"\",\n", 518 | " \"43 DESCRIPTION \": \"\",\n", 519 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 520 | " \"45 SERV. DATE \": \"\",\n", 521 | " \"46 SERV. UNITS \": \"\",\n", 522 | " \"47 TOTAL CHARGES \": \"\",\n", 523 | " \"\": \"\",\n", 524 | " \"48 NON-COVERED CHARGES \": \"\",\n", 525 | " \"49 \": \"\"\n", 526 | " },\n", 527 | " {\n", 528 | " \"0\": \"\",\n", 529 | " \"42 REV. CD. \": \"\",\n", 530 | " \"43 DESCRIPTION \": \"\",\n", 531 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 532 | " \"45 SERV. DATE \": \"\",\n", 533 | " \"46 SERV. UNITS \": \"\",\n", 534 | " \"47 TOTAL CHARGES \": \"\",\n", 535 | " \"\": \"\",\n", 536 | " \"48 NON-COVERED CHARGES \": \"\",\n", 537 | " \"49 \": \"\"\n", 538 | " },\n", 539 | " {\n", 540 | " \"0\": \"\",\n", 541 | " \"42 REV. CD. \": \"\",\n", 542 | " \"43 DESCRIPTION \": \"\",\n", 543 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 544 | " \"45 SERV. DATE \": \"\",\n", 545 | " \"46 SERV. UNITS \": \"\",\n", 546 | " \"47 TOTAL CHARGES \": \"\",\n", 547 | " \"\": \"\",\n", 548 | " \"48 NON-COVERED CHARGES \": \"\",\n", 549 | " \"49 \": \"\"\n", 550 | " },\n", 551 | " {\n", 552 | " \"0\": \"\",\n", 553 | " \"42 REV. CD. \": \"\",\n", 554 | " \"43 DESCRIPTION \": \"\",\n", 555 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 556 | " \"45 SERV. DATE \": \"\",\n", 557 | " \"46 SERV. UNITS \": \"\",\n", 558 | " \"47 TOTAL CHARGES \": \"\",\n", 559 | " \"\": \"\",\n", 560 | " \"48 NON-COVERED CHARGES \": \"\",\n", 561 | " \"49 \": \"\"\n", 562 | " },\n", 563 | " {\n", 564 | " \"0\": \"\",\n", 565 | " \"42 REV. CD. \": \"\",\n", 566 | " \"43 DESCRIPTION \": \"\",\n", 567 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 568 | " \"45 SERV. DATE \": \"\",\n", 569 | " \"46 SERV. 
UNITS \": \"\",\n", 570 | " \"47 TOTAL CHARGES \": \"\",\n", 571 | " \"\": \"\",\n", 572 | " \"48 NON-COVERED CHARGES \": \"\",\n", 573 | " \"49 \": \"\"\n", 574 | " },\n", 575 | " {\n", 576 | " \"0\": \"\",\n", 577 | " \"42 REV. CD. \": \"\",\n", 578 | " \"43 DESCRIPTION \": \"\",\n", 579 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 580 | " \"45 SERV. DATE \": \"\",\n", 581 | " \"46 SERV. UNITS \": \"\",\n", 582 | " \"47 TOTAL CHARGES \": \"\",\n", 583 | " \"\": \"\",\n", 584 | " \"48 NON-COVERED CHARGES \": \"\",\n", 585 | " \"49 \": \"\"\n", 586 | " },\n", 587 | " {\n", 588 | " \"0\": \"\",\n", 589 | " \"42 REV. CD. \": \"\",\n", 590 | " \"43 DESCRIPTION \": \"\",\n", 591 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 592 | " \"45 SERV. DATE \": \"\",\n", 593 | " \"46 SERV. UNITS \": \"\",\n", 594 | " \"47 TOTAL CHARGES \": \"\",\n", 595 | " \"\": \"\",\n", 596 | " \"48 NON-COVERED CHARGES \": \"\",\n", 597 | " \"49 \": \"\"\n", 598 | " },\n", 599 | " {\n", 600 | " \"0\": \"\",\n", 601 | " \"42 REV. CD. \": \"\",\n", 602 | " \"43 DESCRIPTION \": \"\",\n", 603 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 604 | " \"45 SERV. DATE \": \"\",\n", 605 | " \"46 SERV. UNITS \": \"\",\n", 606 | " \"47 TOTAL CHARGES \": \"\",\n", 607 | " \"\": \"\",\n", 608 | " \"48 NON-COVERED CHARGES \": \"\",\n", 609 | " \"49 \": \"\"\n", 610 | " },\n", 611 | " {\n", 612 | " \"0\": \"\",\n", 613 | " \"42 REV. CD. \": \"\",\n", 614 | " \"43 DESCRIPTION \": \"\",\n", 615 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 616 | " \"45 SERV. DATE \": \"\",\n", 617 | " \"46 SERV. UNITS \": \"\",\n", 618 | " \"47 TOTAL CHARGES \": \"\",\n", 619 | " \"\": \"\",\n", 620 | " \"48 NON-COVERED CHARGES \": \"\",\n", 621 | " \"49 \": \"\"\n", 622 | " },\n", 623 | " {\n", 624 | " \"0\": \"\",\n", 625 | " \"42 REV. CD. \": \"\",\n", 626 | " \"43 DESCRIPTION \": \"\",\n", 627 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 628 | " \"45 SERV. DATE \": \"\",\n", 629 | " \"46 SERV. 
UNITS \": \"\",\n", 630 | " \"47 TOTAL CHARGES \": \"\",\n", 631 | " \"\": \"\",\n", 632 | " \"48 NON-COVERED CHARGES \": \"\",\n", 633 | " \"49 \": \"\"\n", 634 | " },\n", 635 | " {\n", 636 | " \"0\": \"\",\n", 637 | " \"42 REV. CD. \": \"\",\n", 638 | " \"43 DESCRIPTION \": \"\",\n", 639 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 640 | " \"45 SERV. DATE \": \"\",\n", 641 | " \"46 SERV. UNITS \": \"\",\n", 642 | " \"47 TOTAL CHARGES \": \"\",\n", 643 | " \"\": \"\",\n", 644 | " \"48 NON-COVERED CHARGES \": \"\",\n", 645 | " \"49 \": \"\"\n", 646 | " },\n", 647 | " {\n", 648 | " \"0\": \"\",\n", 649 | " \"42 REV. CD. \": \"\",\n", 650 | " \"43 DESCRIPTION \": \"\",\n", 651 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 652 | " \"45 SERV. DATE \": \"\",\n", 653 | " \"46 SERV. UNITS \": \"\",\n", 654 | " \"47 TOTAL CHARGES \": \"\",\n", 655 | " \"\": \"\",\n", 656 | " \"48 NON-COVERED CHARGES \": \"\",\n", 657 | " \"49 \": \"\"\n", 658 | " },\n", 659 | " {\n", 660 | " \"0\": \"\",\n", 661 | " \"42 REV. CD. \": \"\",\n", 662 | " \"43 DESCRIPTION \": \"\",\n", 663 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 664 | " \"45 SERV. DATE \": \"\",\n", 665 | " \"46 SERV. UNITS \": \"\",\n", 666 | " \"47 TOTAL CHARGES \": \"\",\n", 667 | " \"\": \"\",\n", 668 | " \"48 NON-COVERED CHARGES \": \"\",\n", 669 | " \"49 \": \"\"\n", 670 | " },\n", 671 | " {\n", 672 | " \"0\": \"\",\n", 673 | " \"42 REV. CD. \": \"\",\n", 674 | " \"43 DESCRIPTION \": \"\",\n", 675 | " \"44 HCPCS RATE HIPPS CODE \": \"\",\n", 676 | " \"45 SERV. DATE \": \"\",\n", 677 | " \"46 SERV. 
UNITS \": \"\",\n", 678 | " \"47 TOTAL CHARGES \": \"\",\n", 679 | " \"\": \"\",\n", 680 | " \"48 NON-COVERED CHARGES \": \"\",\n", 681 | " \"49 \": \"\"\n", 682 | " }\n", 683 | "]\n", 684 | "get_cells_with_text: found keys: [TWord(text='66 dx ', original_text='66 DX ', text_type='cell', confidence=77.44140625, id='79ca5a64-1891-4a04-80f4-4ba1f48cd1e2', xmin=17, ymin=804, xmax=30, ymax=819, page_number=1, doc_width=1000, doc_height=1000, child_relationships='', reference=None, resolver=None)]\n", 685 | "[\n", 686 | " {\n", 687 | " \"0\": \"66 DX \",\n", 688 | " \"1\": \"67 \",\n", 689 | " \"2\": \"\",\n", 690 | " \"3\": \"A \",\n", 691 | " \"4\": \"\",\n", 692 | " \"5\": \"\",\n", 693 | " \"6\": \"\",\n", 694 | " \"7\": \"NOT_SELECTED, \",\n", 695 | " \"8\": \"\",\n", 696 | " \"9\": \"\",\n", 697 | " \"10\": \"\",\n", 698 | " \"11\": \"\",\n", 699 | " \"12\": \"\",\n", 700 | " \"13\": \"\",\n", 701 | " \"14\": \"\",\n", 702 | " \"15\": \"NOT_SELECTED, \",\n", 703 | " \"16\": \"\",\n", 704 | " \"17\": \"NOT_SELECTED, \",\n", 705 | " \"18\": \"\",\n", 706 | " \"19\": \"68 \"\n", 707 | " },\n", 708 | " {\n", 709 | " \"0\": \"\",\n", 710 | " \"1\": \"\",\n", 711 | " \"2\": \"\",\n", 712 | " \"3\": \"\",\n", 713 | " \"4\": \"\",\n", 714 | " \"5\": \"\",\n", 715 | " \"6\": \"\",\n", 716 | " \"7\": \"\",\n", 717 | " \"8\": \"\",\n", 718 | " \"9\": \"\",\n", 719 | " \"10\": \"\",\n", 720 | " \"11\": \"NOT_SELECTED, \",\n", 721 | " \"12\": \"\",\n", 722 | " \"13\": \"NOT_SELECTED, \",\n", 723 | " \"14\": \"\",\n", 724 | " \"15\": \"\",\n", 725 | " \"16\": \"\",\n", 726 | " \"17\": \"NOT_SELECTED, \",\n", 727 | " \"18\": \"\",\n", 728 | " \"19\": \"\"\n", 729 | " }\n", 730 | "]\n", 731 | "|-----------------------------------|---------------|\n", 732 | "| Key | Value |\n", 733 | "| 9 PATIENT ADDRESS_a | |\n", 734 | "| 76_FIRST | |\n", 735 | "| 81 CC_C | |\n", 736 | "| 77_FIRST | |\n", 737 | "| 36_OCCURRENCE FROM | |\n", 738 | "| 78_78 OTHER | |\n", 739 | "| 74cde_DATE | |\n", 
740 | "| 74 PRINCIPAL_DATE | |\n", 741 | "| 76_LAST | |\n", 742 | "| 9 PATIENT ADDRESS_a | |\n", 743 | "| 78_QUAL | |\n", 744 | "| 57_PRV ID | |\n", 745 | "| 35_SPAN THROUGH | |\n", 746 | "| 74ab_DATE | |\n", 747 | "| 9 PATIENT ADDRESS_b | |\n", 748 | "| 31_32 CODE | 06 07 |\n", 749 | "| 79_FIRST | |\n", 750 | "| 77_77 OPERATING NPI | |\n", 751 | "| 77_QUAL | |\n", 752 | "| 78_FIRST | |\n", 753 | "| 57_OTHER | |\n", 754 | "| 74 PRINCIPAL_CODE | |\n", 755 | "| 8 PATIENT NAME_b | |\n", 756 | "| 39_VALUE CODES AMOUNT | |\n", 757 | "| 81 CC_b | |\n", 758 | "| 74cde_DATE | |\n", 759 | "| 9 PATIENT ADDRESS_b | |\n", 760 | "| 41_VALUE CODES AMOUNT | |\n", 761 | "| 79_NPI | |\n", 762 | "| 79_LAST | |\n", 763 | "| 77_LAST | |\n", 764 | "| 78_LAST | |\n", 765 | "| 9 PATIENT ADDRESS_e | |\n", 766 | "| 74ab_CODE | |\n", 767 | "| 74cde_CODE | |\n", 768 | "| 36_SPAN THROUGH | |\n", 769 | "| 33_OCCURRENCE DATE | |\n", 770 | "| 34_OCCURRENCE DATE | |\n", 771 | "| 3_b. MED. REC. # | |\n", 772 | "| 76_QUAL | |\n", 773 | "| 76_76 ATTENDING NPI | |\n", 774 | "| 79_QUAL | |\n", 775 | "| 8 PATIENT NAME_a | |\n", 776 | "| 74ab_CODE | |\n", 777 | "| 40_VALUE CODES AMOUNT | |\n", 778 | "| 79_79 OTHER | |\n", 779 | "| 74cde_DATE | |\n", 780 | "| 35_OCCURRENCE FROM | |\n", 781 | "| 74cde_CODE | |\n", 782 | "| 6 STATEMENT COVERS PERIOD_FROM | |\n", 783 | "| 9 PATIENT ADDRESS_d | |\n", 784 | "| 81 CC_d | |\n", 785 | "| 6 STATEMENT COVERS PERIOD_THROUGH | |\n", 786 | "| 74cde_CODE | |\n", 787 | "| 78_NPI | |\n", 788 | "| 9 PATIENT ADDRESS_C | |\n", 789 | "| 74ab_DATE | |\n", 790 | "| 32_OCCURRENCE DATE | 333333 444444 |\n", 791 | "| 1 | |\n", 792 | "| 2 | |\n", 793 | "| 3a PAT. CNTL # | |\n", 794 | "| 4 TYPE OF BILL | |\n", 795 | "| b. MED. REC. # | |\n", 796 | "| 7 | |\n", 797 | "| 5 FED. TAX NO. 
| |\n", 798 | "| FROM | |\n", 799 | "| THROUGH | |\n", 800 | "| a | |\n", 801 | "| a | |\n", 802 | "| b | |\n", 803 | "| b | |\n", 804 | "| C | |\n", 805 | "| d | |\n", 806 | "| e | |\n", 807 | "| 29 ACDT STATE | |\n", 808 | "| 30 | |\n", 809 | "| 10 BIRTHDATE | |\n", 810 | "| 11 SEX | |\n", 811 | "| 16 DHR | |\n", 812 | "| 17 STAT | |\n", 813 | "| 12 DATE | |\n", 814 | "| 13 HR | |\n", 815 | "| 14 TYPE | |\n", 816 | "| 15 SRC | |\n", 817 | "| 18 | NOT_SELECTED |\n", 818 | "| 19 | |\n", 819 | "| 20 | |\n", 820 | "| 21 | |\n", 821 | "| 22 | |\n", 822 | "| 23 | |\n", 823 | "| 24 | |\n", 824 | "| 25 | |\n", 825 | "| 26 | |\n", 826 | "| 27 | |\n", 827 | "| 28 | |\n", 828 | "| 31 CODE | 04 05 |\n", 829 | "| OCCURRENCE DATE | 111111 222222 |\n", 830 | "| 32 CODE | 06 07 |\n", 831 | "| OCCURRENCE DATE | 333333 444444 |\n", 832 | "| 33 CODE | |\n", 833 | "| OCCURRENCE DATE | |\n", 834 | "| CODE | |\n", 835 | "| OCCURRENCE DATE | |\n", 836 | "| 35 CODE | |\n", 837 | "| OCCURRENCE FROM | |\n", 838 | "| SPAN THROUGH | |\n", 839 | "| 36 CODE | |\n", 840 | "| OCCURRENCE FROM | |\n", 841 | "| SPAN THROUGH | |\n", 842 | "| 37 | |\n", 843 | "| 38 | |\n", 844 | "| 39 CODE | |\n", 845 | "| VALUE CODES AMOUNT | |\n", 846 | "| CODE | |\n", 847 | "| VALUE CODES AMOUNT | |\n", 848 | "| 41 CODE | |\n", 849 | "| VALUE CODES AMOUNT | |\n", 850 | "| 42 REV. CD. | |\n", 851 | "| 43 DESCRIPTION | |\n", 852 | "| 44 HCPCS RATE HIPPS CODE | |\n", 853 | "| 45 SERV. DATE | |\n", 854 | "| 46 SERV. UNITS | |\n", 855 | "| 47 TOTAL CHARGES | |\n", 856 | "| 48 NON-COVERED CHARGES | |\n", 857 | "| 49 | |\n", 858 | "| PAGE | |\n", 859 | "| OF | |\n", 860 | "| CREATION DATE | |\n", 861 | "| TOTALS | |\n", 862 | "| 50 PAYER NAME | |\n", 863 | "| 51 HEALTH PLAN ID | |\n", 864 | "| 52 REL INFO | |\n", 865 | "| 54 PRIOR PAYMENTS | |\n", 866 | "| 55 EST. 
AMOUNT DUE | |\n", 867 | "| 56 NPI | |\n", 868 | "| 57 | |\n", 869 | "| OTHER | |\n", 870 | "| PRV ID | |\n", 871 | "| 58 INSURED'S NAME | |\n", 872 | "| 59 P.REL | |\n", 873 | "| 60 INSURED'S UNIQUE ID | |\n", 874 | "| 61 GROUP NAME | |\n", 875 | "| 62 INSURANCE GROUP NO. | |\n", 876 | "| 63 TREATMENT AUTHORIZATION CODES | |\n", 877 | "| 64 DOCUMENT CONTROL NUMBER | |\n", 878 | "| 65 EMPLOYER NAME | |\n", 879 | "| 66 DX | 67 A |\n", 880 | "| 68 | |\n", 881 | "| 69 ADMIT DX | |\n", 882 | "| 70 PATIENT REASON DX | NOT_SELECTED |\n", 883 | "| 71 PPS CODE | |\n", 884 | "| 72 ECI | NOT_SELECTED |\n", 885 | "| 73 | |\n", 886 | "| 75 | |\n", 887 | "| 76 ATTENDING NPI | |\n", 888 | "| QUAL | |\n", 889 | "| CODE | |\n", 890 | "| DATE | |\n", 891 | "| CODE | |\n", 892 | "| DATE | |\n", 893 | "| CODE | |\n", 894 | "| DATE | |\n", 895 | "| LAST | |\n", 896 | "| FIRST | |\n", 897 | "| 77 OPERATING NPI | |\n", 898 | "| QUAL | |\n", 899 | "| CODE | |\n", 900 | "| DATE | |\n", 901 | "| CODE | |\n", 902 | "| DATE | |\n", 903 | "| CODE | |\n", 904 | "| DATE | |\n", 905 | "| LAST | |\n", 906 | "| FIRST | |\n", 907 | "| 80 REMARKS | |\n", 908 | "| 81CC a | |\n", 909 | "| 78 OTHER | |\n", 910 | "| NPI | |\n", 911 | "| QUAL | |\n", 912 | "| b | |\n", 913 | "| LAST | |\n", 914 | "| FIRST | |\n", 915 | "| C | |\n", 916 | "| 79 OTHER | |\n", 917 | "| NPI | |\n", 918 | "| QUAL | |\n", 919 | "| d | |\n", 920 | "| LAST | |\n", 921 | "| FIRST | |\n", 922 | "\n", 923 | "\n" 924 | ] 925 | } 926 | ], 927 | "source": [ 928 | "t_doc = TDocumentSchema().load(textract_json)\n", 929 | "ordered_doc = order_blocks_by_geo_x_y(t_doc)\n", 930 | "trp_doc = trp.Document(TDocumentSchema().dump(ordered_doc))\n", 931 | "\n", 932 | "final_t_document = parse_ub04(textract_json)\n", 933 | "\n", 934 | "print(get_forms_string(TDocumentSchema().dump(final_t_document)))\n" 935 | ] 936 | } 937 | ], 938 | "metadata": { 939 | "kernelspec": { 940 | "display_name": "trp-test", 941 | "language": "python", 942 | "name": 
"python3" 943 | }, 944 | "language_info": { 945 | "codemirror_mode": { 946 | "name": "ipython", 947 | "version": 3 948 | }, 949 | "file_extension": ".py", 950 | "mimetype": "text/x-python", 951 | "name": "python", 952 | "nbconvert_exporter": "python", 953 | "pygments_lexer": "ipython3", 954 | "version": "3.9.6" 955 | } 956 | }, 957 | "nbformat": 4, 958 | "nbformat_minor": 4 959 | } 960 | -------------------------------------------------------------------------------- /python/medical-notes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/medical-notes.png -------------------------------------------------------------------------------- /python/patient_intake_form_sample.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/patient_intake_form_sample.jpg -------------------------------------------------------------------------------- /python/queries/insurance-card.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/queries/insurance-card.png -------------------------------------------------------------------------------- /python/queries/mortgage-note.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/queries/mortgage-note.jpg -------------------------------------------------------------------------------- /python/queries/paystub-questions_full.csv: -------------------------------------------------------------------------------- 1 | What is the Pay 
Period Start Date?,PAYSTUB_START_DATE 2 | What is the Pay Period End Date?,PAYSTUB_END_DATE 3 | What is the Pay Date?,PAYSTUB_PAY_DATE 4 | What is the Employee Name?,PAYSTUB_EMPLOYEE_NAME 5 | What is the Employee Address?,PAYSTUB_EMPLOYEE_ADDRESS 6 | What is the Company Name?,PAYSTUB_EMPLOYER_NAME 7 | What is the Company Address?,PAYSTUB_EMPLOYER_ADDRESS 8 | What is the Federal Filing Status?,PAYSTUB_FEDERAL_FILING 9 | What is the State Filing Status?,PAYSTUB_STATE_FILING 10 | What is the Current Gross Pay?,PAYSTUB_CURRENT_GROSS 11 | What is the YTD Gross Pay?,PAYSTUB_YTD_GROSS 12 | What is the Current Net Pay?,PAYSTUB_CURRENT_NET 13 | What is the YTD Net Pay?,PAYSTUB_YTD_NET 14 | What are the warnings?,PAYSTUB_WARNINGS 15 | What are the Messages?,PAYSTUB_MESSAGES 16 | What are the Notes?,PAYSTUB_NOTES 17 | What are the Contact?,PAYSTUB_CONTACT 18 | what is the regular hourly rate?,PAYSTUB_REGULAR_HOURS_RATE 19 | what is the holiday hourly rate?,PAYSTUB_HOLIDAY_HOURS_RATE 20 | What is the YTD - Child Support?,PAYSTUB_YTD_CHILD_SUPPORT 21 | What is the YTD - Garnishments?,PAYSTUB_YTD_GARNISHMENT 22 | What is the current - Child Support?,PAYSTUB_CURRENT_CHILD_SUPPORT 23 | What is the current - Garnishments?,PAYSTUB_CURRENT_GARNISHMENT 24 | What is the current regular pay?,PAYSTUB_REGULAR_PAY 25 | What is the YTD regular pay?,PAYSTUB_YTD_PAY 26 | -------------------------------------------------------------------------------- /python/queries/paystub-questions_subset.csv: -------------------------------------------------------------------------------- 1 | What is the Pay Period Start Date?,PAYSTUB_START_DATE 2 | What is the Pay Period End Date?,PAYSTUB_END_DATE 3 | What is the Pay Date?,PAYSTUB_PAY_DATE 4 | What is the Employee Name?,PAYSTUB_EMPLOYEE_NAME 5 | What is the Employee Address?,PAYSTUB_EMPLOYEE_ADDRESS 6 | What is the Company Name?,PAYSTUB_EMPLOYER_NAME 7 | What is the Company Address?,PAYSTUB_EMPLOYER_ADDRESS 8 | What is the Federal Filing 
Status?,PAYSTUB_FEDERAL_FILING 9 | What is the State Filing Status?,PAYSTUB_STATE_FILING 10 | What is the Current Gross Pay?,PAYSTUB_CURRENT_GROSS 11 | What is the YTD Gross Pay?,PAYSTUB_YTD_GROSS 12 | What is the Current Net Pay?,PAYSTUB_CURRENT_NET 13 | What is the YTD Net Pay?,PAYSTUB_YTD_NET 14 | What are the warnings?,PAYSTUB_WARNINGS 15 | What are the Messages?,PAYSTUB_MESSAGES 16 | -------------------------------------------------------------------------------- /python/queries/paystub.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/queries/paystub.jpg -------------------------------------------------------------------------------- /python/queries/vaccination.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/queries/vaccination.png -------------------------------------------------------------------------------- /python/simple-document-image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/simple-document-image.jpg -------------------------------------------------------------------------------- /python/two-column-image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/two-column-image.jpg -------------------------------------------------------------------------------- /python/verification-of-employment.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/python/verification-of-employment.png -------------------------------------------------------------------------------- /src-csharp/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | bin 3 | obj 4 | 5 | -------------------------------------------------------------------------------- /src-csharp/ArgHandlers/DetectTextHandler.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Dotnet_Core.Services; 3 | 4 | namespace Dotnet_Core.ArgHandlers { 5 | internal class DetectTextHandler { 6 | private readonly TextractTextDetectionService textractTextService; 7 | 8 | public DetectTextHandler(TextractTextDetectionService textractTextService) { 9 | this.textractTextService = textractTextService; 10 | } 11 | 12 | internal void Handle(string localFile) { 13 | var localTask = textractTextService.DetectTextLocal(localFile); 14 | localTask.Wait(); 15 | textractTextService.Print(localTask.Result); 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /src-csharp/ArgHandlers/DetectTextS3Handler.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Dotnet_Core.Services; 3 | 4 | namespace Dotnet_Core.ArgHandlers { 5 | internal class DetectTextS3Handler { 6 | private readonly TextractTextDetectionService textractTextService; 7 | 8 | public DetectTextS3Handler(TextractTextDetectionService textractTextService) { 9 | this.textractTextService = textractTextService; 10 | } 11 | 12 | internal void Handle(string bucketName, string s3File) { 13 | var s3Task = textractTextService.DetectTextS3(bucketName, s3File); 14 | s3Task.Wait(); 15 | textractTextService.Print(s3Task.Result); 16 | } 17 | } 18 | } 
-------------------------------------------------------------------------------- /src-csharp/ArgHandlers/FormsHandler.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Amazon.Textract.Model; 3 | using Dotnet_Core.Services; 4 | 5 | namespace Dotnet_Core.ArgHandlers { 6 | internal class FormsHandler { 7 | private readonly TextractTextAnalysisService textractAnalysisService; 8 | public FormsHandler(TextractTextAnalysisService textractAnalysisService) { 9 | this.textractAnalysisService = textractAnalysisService; 10 | } 11 | 12 | internal void Handle(string bucketName, string formFile) { 13 | var task = textractAnalysisService.StartDocumentAnalysis(bucketName, formFile, "FORMS"); 14 | var jobId = task.Result; 15 | textractAnalysisService.WaitForJobCompletion(jobId); 16 | var results = textractAnalysisService.GetJobResults(jobId); 17 | var document = new TextractDocument(results); 18 | document.Pages.ForEach(page => { 19 | page.Form.Fields.ForEach(f => { 20 | Console.WriteLine("Key: {0}, Value {1}", f.Key, f.Value); 21 | }); 22 | Console.WriteLine("Get Field by Key:"); 23 | var key = "Phone Number:"; 24 | var field = page.Form.GetFieldByKey(key); 25 | if(field != null) { 26 | Console.WriteLine("Key: {0}, Value: {1}", field.Key, field.Value); 27 | } 28 | }); 29 | } 30 | } 31 | } -------------------------------------------------------------------------------- /src-csharp/ArgHandlers/FormsRedactionHandler.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Drawing; 3 | using System.IO; 4 | using Amazon.Textract.Model; 5 | using Dotnet_Core.Services; 6 | 7 | namespace Dotnet_Core.ArgHandlers { 8 | internal class FormsRedactionHandler { 9 | private readonly TextractTextAnalysisService textractAnalysisService; 10 | 11 | public FormsRedactionHandler(TextractTextAnalysisService textractAnalysisService) { 12 | this.textractAnalysisService = 
textractAnalysisService; 13 | } 14 | 15 | internal void Handle(string bucketName, string formFile, string localFolder, string localFile) { 16 | var task = textractAnalysisService.StartDocumentAnalysis(bucketName, formFile, "FORMS"); 17 | var jobId = task.Result; 18 | textractAnalysisService.WaitForJobCompletion(jobId); 19 | var results = textractAnalysisService.GetJobResults(jobId); 20 | 21 | var redactableImage = Path.Join(localFolder, "redacted-" + formFile); 22 | if(File.Exists(redactableImage)) 23 | File.Delete(redactableImage); 24 | File.Copy(localFile, redactableImage); 25 | var image = Image.FromFile(redactableImage); 26 | var graphics = Graphics.FromImage(image); 27 | var height = image.Height; 28 | var width = image.Width; 29 | Console.WriteLine("image dimensions: {0}x{1}", width, height); 30 | 31 | var document = new TextractDocument(results); 32 | document.Pages.ForEach(page => { 33 | page.Form.Fields.ForEach(field => { 34 | if(field.Key.Text.ToLower().Contains("address")) { 35 | Console.WriteLine("Redacting Key: {0}, Value: {1}", field.Key.Text, field.Value.Text); 36 | var bb = field.Value.Geometry.BoundingBox; 37 | Console.WriteLine(bb); 38 | var x1 = bb.Left * width; 39 | var y1 = bb.Top * height - 2; 40 | var x2 = bb.Width * width + 2; 41 | var y2 = bb.Height * height + 2; 42 | 43 | Console.WriteLine("x1: {0}, x2: {1}, y1: {2}, y2: {3}", x1, x2, y1, y2); 44 | graphics.FillRectangle(new SolidBrush(Color.Black), x1, y1, x2, y2); 45 | graphics.Save(); 46 | image.Save(redactableImage); 47 | Console.WriteLine("redacted image saved at: {0}", redactableImage); 48 | } 49 | }); 50 | }); 51 | } 52 | } 53 | } -------------------------------------------------------------------------------- /src-csharp/ArgHandlers/NlpComprehendHandler.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Dotnet_Core.Services; 3 | 4 | namespace Dotnet_Core.ArgHandlers { 5 | internal class NlpComprehendHandler { 6 | private 
readonly TextractTextDetectionService textractTextService; 7 | private readonly ComprehendService comprehendService; 8 | 9 | public NlpComprehendHandler(TextractTextDetectionService textractTextService, ComprehendService comprehendService) { 10 | this.textractTextService = textractTextService; 11 | this.comprehendService = comprehendService; 12 | } 13 | /* Runs text detection on a local file, then prints the detected sentiment and entities for the extracted text. */ 14 | internal void Handle(string localFile) { 15 | var localTask = textractTextService.DetectTextLocal(localFile); 16 | localTask.Wait(); 17 | var result = localTask.Result; 18 | var lineItems = textractTextService.GetLines(result); /* items are joined with a space below so words from adjacent items do not fuse together */ 19 | var detectSentimentTask = comprehendService.DetectSentiment("en", string.Join(" ", lineItems)); 20 | detectSentimentTask.Wait(); 21 | Console.WriteLine(detectSentimentTask.Result); 22 | var detectEntitiesTask = comprehendService.DetectEntities("en", string.Join(" ", lineItems)); 23 | detectEntitiesTask.Wait(); 24 | detectEntitiesTask.Result.ForEach(entity => { 25 | Console.WriteLine("{0}:{1}:{2}", entity.Text, entity.Score, entity.Type); 26 | }); 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /src-csharp/ArgHandlers/NlpComprehendMedicalHandler.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Dotnet_Core.Services; 3 | 4 | namespace Dotnet_Core.ArgHandlers { 5 | internal class NlpComprehendMedicalHandler { 6 | private readonly TextractTextDetectionService textractTextService; 7 | private readonly ComprehendService comprehendMedicalService; 8 | 9 | public NlpComprehendMedicalHandler(TextractTextDetectionService textractTextService, ComprehendService comprehendMedicalService) { 10 | this.textractTextService = textractTextService; 11 | this.comprehendMedicalService = comprehendMedicalService; 12 | } 13 | 14 | internal void Handle(string medicalFile) { 15 | var localTask = textractTextService.DetectTextLocal(medicalFile); 16 | localTask.Wait(); 17 | var result =
localTask.Result; 18 | var lineItems = textractTextService.GetLines(result); /* items are joined with a space below so words from adjacent items do not fuse together */ 19 | var medicalTask = comprehendMedicalService.DetectEntities(string.Join(" ", lineItems)); 20 | medicalTask.Wait(); 21 | medicalTask.Result.ForEach(entity => { 22 | Console.WriteLine("Text: [{0}], Type: [{1}], Category: [{2}]", entity.Text, entity.Type, entity.Category); 23 | entity.Traits.ForEach(trait => { 24 | Console.WriteLine(" Trait: [{0}], Score: [{1}]", trait.Name, trait.Score); 25 | }); 26 | }); 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /src-csharp/ArgHandlers/PdfTextHandler.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Dotnet_Core.Services; 3 | 4 | namespace Dotnet_Core.ArgHandlers { 5 | internal class PdfTextHandler { 6 | private readonly TextractTextDetectionService textractTextService; 7 | 8 | public PdfTextHandler(TextractTextDetectionService textractTextService) { 9 | this.textractTextService = textractTextService; 10 | } 11 | 12 | internal void Handle(string bucketName, string pdfFile) { 13 | var task = textractTextService.StartDocumentTextDetection(bucketName, pdfFile); 14 | var jobId = task.Result; 15 | textractTextService.WaitForJobCompletion(jobId); 16 | textractTextService.Print(textractTextService.GetJobResults(jobId)); 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /src-csharp/ArgHandlers/ReadingOrderHandler.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using Dotnet_Core.Services; 4 | 5 | namespace Dotnet_Core.ArgHandlers { 6 | internal class ReadingOrderHandler { 7 | private readonly TextractTextDetectionService textractTextService; 8 | 9 | public ReadingOrderHandler(TextractTextDetectionService textractTextService) { 10 | this.textractTextService = textractTextService; 11 | } 12 | 13 |
/* Starts text detection for an S3 document, assigns each LINE block to a column by horizontal overlap, and prints the lines column by column. */ internal void Handle(string bucketName, string twoColumnImage) { 14 | var task = textractTextService.StartDocumentTextDetection(bucketName, twoColumnImage); 15 | var jobId = task.Result; 16 | textractTextService.WaitForJobCompletion(jobId); 17 | var jobResults = textractTextService.GetJobResults(jobId); 18 | var lines = new List<IndexedText>(); 19 | var columns = new List<Column>(); 20 | jobResults.ForEach(job => { 21 | job.Blocks.ForEach(block => { 22 | if(block.BlockType == "LINE") { 23 | var columnFound = false; 24 | for(var index = 0; index < columns.Count; index++) { 25 | var column = columns[index]; 26 | var bb = block.Geometry.BoundingBox; 27 | var bbLeft = bb.Left; 28 | var bbRight = bb.Left + bb.Width; 29 | var bbCentre = bb.Left + (bb.Width / 2); 30 | var columnCentre = (column.Left + column.Right) / 2; /* Right is stored as an absolute coordinate (Left + Width), so the centre is the midpoint of the two edges */ 31 | 32 | if((bbCentre > column.Left && bbCentre < column.Right) || (columnCentre > bbLeft && columnCentre < bbRight)) { 33 | lines.Add(new IndexedText { ColumnIndex = index, Text = block.Text }); 34 | columnFound = true; 35 | break; 36 | } 37 | } 38 | if(!columnFound) { 39 | var bb = block.Geometry.BoundingBox; 40 | columns.Add(new Column { Left = bb.Left, Right = bb.Left + bb.Width }); 41 | lines.Add(new IndexedText { ColumnIndex = columns.Count - 1, Text = block.Text }); 42 | } 43 | } 44 | }); 45 | }); 46 | /* print every column once, in the order the columns were first seen, after all results are processed */ for(var columnIndex = 0; columnIndex < columns.Count; columnIndex++) { lines.FindAll(line => line.ColumnIndex == columnIndex).ForEach(line => Console.WriteLine(line)); } 47 | } 48 | } 49 | } -------------------------------------------------------------------------------- /src-csharp/ArgHandlers/SearchHandler.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Amazon.Textract.Model; 3 | using Dotnet_Core.Services; 4 | 5 | namespace Dotnet_Core.ArgHandlers { 6 | internal class SearchHandler { 7 | private readonly TextractTextDetectionService textractTextService; 8 | private readonly ElasticSearchService elasticSearchService; 9 | 10 | public SearchHandler(TextractTextDetectionService
textractTextService, ElasticSearchService elasticSearchService) { 11 | this.textractTextService = textractTextService; 12 | this.elasticSearchService = elasticSearchService; 13 | } 14 | 15 | internal void Handle(string bucketName, string s3File) { 16 | var detectTextTask = textractTextService.DetectTextS3(bucketName, s3File); 17 | detectTextTask.Wait(); 18 | var result = detectTextTask.Result; 19 | textractTextService.Print(result); 20 | elasticSearchService.Index(result, "sample-index"); 21 | Console.WriteLine("Index complete"); 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /src-csharp/ArgHandlers/TablesExpenseHandler.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | using Amazon.Textract.Model; 5 | using Dotnet_Core.Services; 6 | 7 | namespace Dotnet_Core.ArgHandlers { 8 | internal class TablesExpenseHandler { 9 | private readonly TextractTextAnalysisService textractAnalysisService; 10 | 11 | public TablesExpenseHandler(TextractTextAnalysisService textractAnalysisService) { 12 | this.textractAnalysisService = textractAnalysisService; 13 | } 14 | 15 | internal void Handle(string bucketName, string expenseFile) { 16 | var task = textractAnalysisService.StartDocumentAnalysis(bucketName, expenseFile, "TABLES"); 17 | var jobId = task.Result; 18 | textractAnalysisService.WaitForJobCompletion(jobId); 19 | var results = textractAnalysisService.GetJobResults(jobId); 20 | var warnings = new StringBuilder(); 21 | float expense; 22 | var lineItem = new List(); 23 | var document = new TextractDocument(results); 24 | document.Pages.ForEach(page => { 25 | page.Tables.ForEach(table => { 26 | var r = 0; 27 | table.Rows.ForEach(row => { 28 | r++; 29 | var itemName = string.Empty; 30 | var c = 0; 31 | row.Cells.ForEach(cell => { 32 | c++; 33 | Console.WriteLine("Table [{0}][{1}] = {2}", r, c, cell.Text); 34 | 
if(c == 1) { 35 | itemName = cell.Text; 36 | } else if(c == 5 && float.TryParse(cell.Text, out expense)) { 37 | if(expense > 100) { 38 | warnings.AppendFormat("{0} is greater than $100{1}", itemName, Environment.NewLine); 39 | } 40 | } 41 | }); 42 | }); 43 | }); 44 | }); 45 | Console.WriteLine(string.Format("{0}===Warnings==={0}{1}===", Environment.NewLine, warnings)); 46 | } 47 | } 48 | } -------------------------------------------------------------------------------- /src-csharp/ArgHandlers/TablesHandler.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Amazon.Textract.Model; 3 | using Dotnet_Core.Services; 4 | 5 | namespace Dotnet_Core.ArgHandlers { 6 | internal class TablesHandler { 7 | private readonly TextractTextAnalysisService textractAnalysisService; 8 | 9 | public TablesHandler(TextractTextAnalysisService textractAnalysisService) { 10 | this.textractAnalysisService = textractAnalysisService; 11 | } 12 | 13 | internal void Handle(string bucketName, string formFile) { 14 | var task = textractAnalysisService.StartDocumentAnalysis(bucketName, formFile, "TABLES"); 15 | var jobId = task.Result; 16 | textractAnalysisService.WaitForJobCompletion(jobId); 17 | var results = textractAnalysisService.GetJobResults(jobId); 18 | var document = new TextractDocument(results); 19 | document.Pages.ForEach(page => { 20 | page.Tables.ForEach(table => { 21 | var r = 0; 22 | table.Rows.ForEach(row => { 23 | r++; 24 | var c = 0; 25 | row.Cells.ForEach(cell => { 26 | c++; 27 | Console.WriteLine("Table [{0}][{1}] = {2}", r, c, cell.Text); 28 | }); 29 | }); 30 | }); 31 | }); 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /src-csharp/ArgHandlers/TranslateHandler.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Text; 3 | using Dotnet_Core.Services; 4 | 5 | namespace Dotnet_Core.ArgHandlers { 6 | 
internal class TranslateHandler { 7 | private readonly TextractTextDetectionService textractTextService; 8 | private readonly TranslateService translateService; 9 | 10 | public TranslateHandler(TextractTextDetectionService textractTextService, TranslateService translateService) { 11 | this.textractTextService = textractTextService; 12 | this.translateService = translateService; 13 | } 14 | 15 | internal void Handle(string bucketName, string s3File) { 16 | var detectTextTask = textractTextService.DetectTextS3(bucketName, s3File); 17 | detectTextTask.Wait(); 18 | var blocks = detectTextTask.Result.Blocks; 19 | var sourceText = new StringBuilder(); 20 | blocks.ForEach(x => { 21 | if(x.BlockType == "LINE") { 22 | sourceText.AppendLine(x.Text); 23 | } 24 | }); 25 | Console.WriteLine(sourceText.ToString()); 26 | var translateTask = translateService.TranslateText(sourceText.ToString(), "en", "de"); 27 | translateTask.Wait(); 28 | Console.WriteLine(translateTask.Result.TranslatedText); 29 | } 30 | } 31 | } -------------------------------------------------------------------------------- /src-csharp/Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using Amazon.Textract; 3 | using Amazon.Translate; 4 | using Microsoft.Extensions.Configuration; 5 | using Amazon.Comprehend; 6 | using Amazon.ComprehendMedical; 7 | using Dotnet_Core.Services; 8 | using Dotnet_Core.ArgHandlers; 9 | 10 | namespace Dotnet_Core { 11 | partial class Program { 12 | 13 | const string BucketName = "textract-console-us-west-2-d92b0df4-a50a-4203-b070-044c3ee7fe83"; 14 | const string LocalEmploymentFile = "test-files/employmentapp.png"; 15 | const string LocalSimpleFile = "test-files/simple-document-image.jpg"; 16 | const string LocalMedicalFile = "test-files/medical-notes.png"; 17 | const string LocalFolder = "test-files"; 18 | const string S3File = "simple-document-image.jpg"; 19 | const string TwoColumnImage = "two-column-image.jpg"; 20 | 
const string PdfFile = "Amazon-Textract-Pdf.pdf"; 21 | const string FormFile = "employmentapp.png"; 22 | const string ExpenseFile = "expense.png"; 23 | const string ElasticSearchEndpoint = "https://search-textract-sample-hvthzep6bedgfdj6oxeng5jtmi.us-west-2.es.amazonaws.com"; 24 | const string ElasticSearchDomainName = "textract-sample"; 25 | /* Entry point: builds the AWS service wrappers from appsettings.json / environment variables and dispatches on the first command-line switch. */ 26 | static void Main(string[] args) { 27 | if(args.Length == 0) { 28 | Console.WriteLine(HelpText); 29 | return; 30 | } 31 | 32 | var firstArg = args[0]; 33 | 34 | var builder = new ConfigurationBuilder() 35 | .SetBasePath(Environment.CurrentDirectory) 36 | .AddJsonFile("appsettings.json", optional: false, reloadOnChange: true) 37 | .AddEnvironmentVariables() 38 | .Build(); 39 | var awsOptions = builder.GetAWSOptions(); 40 | Console.WriteLine(awsOptions.Profile + ":" + awsOptions.ProfilesLocation + ": " + awsOptions.Region.DisplayName); /* each CreateServiceClient call below names the client interface explicitly; the type argument also selects the matching ComprehendService constructor overload */ 41 | var textractTextService = new TextractTextDetectionService(awsOptions.CreateServiceClient<IAmazonTextract>()); 42 | var textractAnalysisService = new TextractTextAnalysisService(awsOptions.CreateServiceClient<IAmazonTextract>()); 43 | var translateService = new TranslateService(awsOptions.CreateServiceClient<IAmazonTranslate>()); 44 | var comprehendService = new ComprehendService(awsOptions.CreateServiceClient<IAmazonComprehend>()); 45 | var comprehendMedicalService = new ComprehendService(awsOptions.CreateServiceClient<IAmazonComprehendMedical>()); 46 | var elasticSearchService = new ElasticSearchService(ElasticSearchEndpoint, ElasticSearchDomainName); 47 | 48 | switch(firstArg) { 49 | case "--detect-text-local": 50 | new DetectTextHandler(textractTextService).Handle(LocalEmploymentFile); 51 | break; 52 | case "--detect-text-s3": 53 | new DetectTextS3Handler(textractTextService).Handle(BucketName, S3File); 54 | break; 55 | case "--pdf-text": 56 | new PdfTextHandler(textractTextService).Handle(BucketName, PdfFile); 57 | break; 58 | case "--reading-order": 59 | new ReadingOrderHandler(textractTextService).Handle(BucketName, TwoColumnImage); 60 | break; 61 | case "--translate": 62 |
new TranslateHandler(textractTextService, translateService).Handle(BucketName, S3File); 63 | break; 64 | case "--search": 65 | new SearchHandler(textractTextService, elasticSearchService).Handle(BucketName, S3File); 66 | break; 67 | case "--forms": 68 | new FormsHandler(textractAnalysisService).Handle(BucketName, FormFile); 69 | break; 70 | case "--forms-redaction": 71 | new FormsRedactionHandler(textractAnalysisService).Handle(BucketName, FormFile, LocalFolder, LocalEmploymentFile); 72 | break; 73 | case "--tables": 74 | new TablesHandler(textractAnalysisService).Handle(BucketName, FormFile); 75 | break; 76 | case "--tables-expense": 77 | new TablesExpenseHandler(textractAnalysisService).Handle(BucketName, ExpenseFile); 78 | break; 79 | case "--nlp-comprehend": 80 | new NlpComprehendHandler(textractTextService, comprehendService).Handle(LocalSimpleFile); 81 | break; 82 | case "--nlp-medical": 83 | new NlpComprehendMedicalHandler(textractTextService, comprehendMedicalService).Handle(LocalMedicalFile); 84 | break; 85 | default: 86 | Console.WriteLine(HelpText); 87 | break; 88 | } 89 | } 90 | 91 | const string HelpText = @" 92 | Usage: dotnet run [--switch] 93 | To run this console app, use the following valid switches one at a time: 94 | --detect-text-local 95 | --detect-text-s3 96 | --pdf-text 97 | --forms 98 | --forms-redaction 99 | --tables 100 | --tables-expense 101 | --reading-order 102 | --nlp-comprehend 103 | --nlp-medical 104 | --translate 105 | --search 106 | e.g. 
dotnet run --detect-text-s3 107 | "; 108 | } 109 | } -------------------------------------------------------------------------------- /src-csharp/Readme.md: -------------------------------------------------------------------------------- 1 | # C# .NET Core implementation 2 | 3 | Amazon Textract samples for .NET Core with C# 4 | 5 | ## Prerequisites 6 | 7 | - [Dotnet Core 2.2](https://dotnet.microsoft.com/download/dotnet-core/2.2) 8 | - [AWS CLI](https://docs.aws.amazon.com/polly/latest/dg/setup-aws-cli.html) for 9 | running AWS CLI commands after configuring a 10 | [default or named profile](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) 11 | - Upload files from `test-files` to a target Amazon S3 bucket in your account 12 | 13 | ## Usage 14 | 15 | ``` 16 | Usage: dotnet run [--switch] 17 | To run this console app, use the following valid switches one at a time: 18 | --detect-text-local 19 | --detect-text-s3 20 | --pdf-text 21 | --forms 22 | --forms-redaction 23 | --tables 24 | --tables-expense 25 | --reading-order 26 | --nlp-comprehend 27 | --nlp-medical 28 | --translate 29 | --search 30 | e.g. dotnet run --detect-text-s3 31 | ``` 32 | 33 | ## Samples 34 | 35 | | Argument | Description | 36 | | ------------------- | ---------------------------------------------------------- | 37 | | --detect-text-local | Example showing processing a document on local machine. | 38 | | --detect-text-s3 | Example showing processing a document in Amazon S3 bucket. | 39 | | --pdf-text | Example showing PDF document processing. | 40 | | --forms | Example showing form (key/value) processing. | 41 | | --forms-redaction | Example showing redacting information in document. | 42 | | --tables | Example showing table processing. | 43 | | --tables-expense | Example showing validation of table data. | 44 | | --reading-order | Example showing printing document in reading order. | 45 | | --nlp-comprehend | Example showing detecting entities and sentiment. 
| 46 | | --nlp-medical | Example showing detecting medical entities. | 47 | | --translate | Example showing translation of documents. | 48 | | --search | Example showing document indexing in Elasticsearch. | 49 | 50 | Example usage and result 51 | 52 | ``` 53 | dotnet-core sanjeet$ dotnet run --forms 54 | default:: US West (Oregon) 55 | ........Key: Phone Number:, Value 555-0100 56 | Key: Full Name:, Value Jane Doe 57 | Key: Home Address:, Value 123 Any Street. Any Town, USA 58 | Key: Mailing Address:, Value same as home address 59 | Get Field by Key: 60 | Key: Phone Number:, Value: 555-0100 61 | ``` 62 | 63 | The following source document was used by the example above to analyze Form 64 | data. This document has a Form and a Table on it: 65 | 66 | ![source document](test-files/employmentapp.png) 67 | 68 | The following AWS services are used: 69 | 70 | - Amazon Textract (for text extraction and analysis) 71 | - Amazon Comprehend (for natural language processing) 72 | - Amazon Comprehend Medical (for natural language processing of medical 73 | prescriptions/documents) 74 | - Amazon Elasticsearch (for full text indexing and search) 75 | - Amazon S3 (for storing scanned documents/images used by Amazon Textract) 76 | - Amazon Translate (for translating text from English to other supported 77 | languages) 78 | 79 | ## Dependencies 80 | 81 | appsettings.json file uses your default AWS profile so that you don't have to 82 | set AWS credentials in clear text 83 | 84 | ``` 85 | { 86 | "AWS": { 87 | "Profile": "default", 88 | "Region": "us-west-2" 89 | } 90 | } 91 | ``` 92 | 93 | ### A quick walkthrough of the .csproj file 94 | 95 | dotnet-core.csproj file: required .NET libraries - these libraries will be 96 | auto-installed as part of the build process 97 | 98 | ``` 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | ``` 113 | 114 | The code sample that performs redaction on a form uses the Dotnet Core 115 | System.Drawing.Common package.
You can add the System.Drawing.Common package to the 116 | project by using the following dotnet CLI command 117 | 118 | ``` 119 | dotnet add package System.Drawing.Common --version 4.5.1 120 | ``` 121 | 122 | [NEST](https://github.com/elastic/elasticsearch-net) is the official 123 | Elasticsearch client that's used by this sample to send text for indexing in an 124 | Amazon Elasticsearch domain provisioned in AWS. Use the following command to 125 | install NEST 126 | 127 | ``` 128 | dotnet add package NEST --version 6.8.0 129 | ``` 130 | 131 | AWSSDK.\* packages are NuGet client libraries, and can be installed using a 132 | command similar to the following 133 | 134 | ``` 135 | dotnet add package AWSSDK.<ServiceName> 136 | ``` 137 | 138 | dotnet-core.csproj file: "test-files" folder has all the required files e.g. 139 | pdf, jpg, and png used for testing and they are all copied to the output 140 | directory 141 | 142 | ``` 143 | 144 | 145 | Always 146 | 147 | 148 | 149 | 150 | 151 | Always 152 | 153 | 154 | ``` 155 | 156 | To recursively copy files from local disk to S3 use the following command 157 | 158 | ``` 159 | aws s3 cp dotnet-core/test-files s3://<your-bucket-name>/ --include "*" --recursive 160 | ``` 161 | 162 | ## Pro tips 163 | 164 | If you ever encounter the following error while running this .NET Core 165 | application on macOS 166 | 167 | ``` 168 | Unhandled Exception: System.TypeInitializationException: The type initializer for 'Gdip' threw an exception. ---> System.DllNotFoundException: Unable to load DLL 'libgdiplus': The specified module could not be found.
169 | at System.Runtime.InteropServices.FunctionWrapper`1.get_Delegate() 170 | at System.Drawing.SafeNativeMethods.Gdip.GdiplusStartup(IntPtr& token, StartupInput& input, StartupOutput& output) 171 | at System.Drawing.SafeNativeMethods.Gdip..cctor() 172 | --- End of inner exception stack trace --- 173 | ``` 174 | 175 | Install this package 176 | 177 | ``` 178 | brew install mono-libgdiplus 179 | ``` 180 | 181 | and if you see the following error 182 | 183 | ``` 184 | Unhandled Exception: System.ArgumentException: Parameter is not valid. 185 | at System.Drawing.Graphics.DrawRectangle(Pen pen, Int32 x, Int32 y, Int32 width, Int32 height) 186 | ``` 187 | 188 | ensure that the following is commented out (graphics routine is disposed even 189 | before it gets an opportunity to DrawRectangle, which is why you get the error) 190 | 191 | ``` 192 | // graphics.Dispose(); 193 | // image.Dispose(); 194 | ``` 195 | -------------------------------------------------------------------------------- /src-csharp/Services/Column.cs: -------------------------------------------------------------------------------- 1 | namespace Dotnet_Core.Services { 2 | internal class Column { 3 | public float Left { get; set; } 4 | public float Right { get; set; } 5 | 6 | public override string ToString() { 7 | return string.Format("Left: {0}, Right :{1}", this.Left, this.Right); 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /src-csharp/Services/ComprehendService.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Threading.Tasks; 4 | using Amazon.Comprehend; 5 | using Amazon.Comprehend.Model; 6 | using Amazon.ComprehendMedical; 7 | using MedicalModel = Amazon.ComprehendMedical.Model; 8 | 9 | namespace Dotnet_Core.Services { 10 | public class ComprehendService { 11 | private IAmazonComprehend comprehend { get; } 12 | private 
IAmazonComprehendMedical comprehendMedical { get; } 13 | public ComprehendService(IAmazonComprehend comprehend) { 14 | this.comprehend = comprehend; 15 | } 16 | 17 | public ComprehendService(IAmazonComprehendMedical comprehend) { 18 | this.comprehendMedical = comprehend; 19 | } 20 | 21 | public async Task DetectSentiment(string languageCode, string text) { 22 | var task = await this.comprehend.DetectSentimentAsync(new DetectSentimentRequest { 23 | LanguageCode = languageCode, 24 | Text = text 25 | }); 26 | return new DetectedSentiment { 27 | Sentiment = task.Sentiment, 28 | Mixed = task.SentimentScore.Mixed, 29 | Neutral = task.SentimentScore.Neutral, 30 | Negative = task.SentimentScore.Negative, 31 | Positive = task.SentimentScore.Positive 32 | }; 33 | } 34 | 35 | public async Task> DetectEntities(string languageCode, string text) { 36 | var task = await this.comprehend.DetectEntitiesAsync(new DetectEntitiesRequest { 37 | LanguageCode = languageCode, 38 | Text = text 39 | }); 40 | return task.Entities; 41 | } 42 | 43 | public async Task> DetectEntities(string text) { 44 | var task = await this.comprehendMedical.DetectEntitiesAsync(new MedicalModel.DetectEntitiesRequest { 45 | Text = text 46 | }); 47 | return task.Entities; 48 | } 49 | 50 | public class DetectedSentiment { 51 | public string Sentiment { get; set; } 52 | public float Mixed { get; set; } 53 | public float Positive { get; set; } 54 | public float Neutral { get; set; } 55 | public float Negative { get; set; } 56 | 57 | public override string ToString() { 58 | return string.Format("Sentiment: {0}, Score: [Mixed: {1}, Positive: {2}, Negative: {3}, Neutral: {4}]", this.Sentiment, this.Mixed, this.Positive, this.Negative, this.Neutral); 59 | } 60 | 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src-csharp/Services/ElasticSearchService.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using 
Nest; 3 | 4 | namespace Dotnet_Core.Services { 5 | public class ElasticSearchService { 6 | // private string domainUri, defaultIndex; 7 | private ElasticClient elasticClient; 8 | public ElasticSearchService(string endpoint, string domainName) { 9 | var connectionSettings = new ConnectionSettings(new Uri(endpoint)); 10 | connectionSettings.DefaultIndex(domainName); 11 | this.elasticClient = new ElasticClient(connectionSettings); 12 | } 13 | 14 | public void Index(T item, string indexName) where T : class { 15 | this.elasticClient.Index(item, x => x.Index(indexName)); 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /src-csharp/Services/IndexedText.cs: -------------------------------------------------------------------------------- 1 | namespace Dotnet_Core.Services { 2 | internal class IndexedText { 3 | public int ColumnIndex { get; set; } 4 | public string Text { get; set; } 5 | 6 | public override string ToString() { 7 | return string.Format("[{0}] {1}", this.ColumnIndex, this.Text); 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /src-csharp/Services/TextractTextAnalysisService.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Threading.Tasks; 4 | using Amazon.Textract; 5 | using Amazon.Textract.Model; 6 | 7 | namespace Dotnet_Core.Services { 8 | public class TextractTextAnalysisService { 9 | private IAmazonTextract textract; 10 | public TextractTextAnalysisService(IAmazonTextract textract) { 11 | this.textract = textract; 12 | } 13 | public GetDocumentAnalysisResponse GetJobResults(string jobId) { 14 | var response = this.textract.GetDocumentAnalysisAsync(new GetDocumentAnalysisRequest { 15 | JobId = jobId 16 | }); 17 | response.Wait(); 18 | return response.Result; 19 | } 20 | 21 | public bool IsJobComplete(string jobId) { 22 | var response = 
this.textract.GetDocumentAnalysisAsync(new GetDocumentAnalysisRequest { 23 | JobId = jobId 24 | }); 25 | response.Wait(); 26 | return !response.Result.JobStatus.Equals("IN_PROGRESS"); 27 | } 28 | 29 | public async Task StartDocumentAnalysis(string bucketName, string key, string featureType) { 30 | var request = new StartDocumentAnalysisRequest(); 31 | var s3Object = new S3Object { 32 | Bucket = bucketName, 33 | Name = key 34 | }; 35 | request.DocumentLocation = new DocumentLocation { 36 | S3Object = s3Object 37 | }; 38 | request.FeatureTypes = new List { featureType }; 39 | var response = await this.textract.StartDocumentAnalysisAsync(request); 40 | return response.JobId; 41 | } 42 | 43 | public void WaitForJobCompletion(string jobId, int delay = 5000) { 44 | while(!IsJobComplete(jobId)) { 45 | this.Wait(delay); 46 | } 47 | } 48 | 49 | private void Wait(int delay = 5000) { 50 | Task.Delay(delay).Wait(); 51 | Console.Write("."); 52 | } 53 | 54 | public void PrintDebug(GetDocumentAnalysisResponse response) { 55 | response.Blocks.ForEach(y => { 56 | Console.WriteLine(""); 57 | Console.WriteLine(y.Id + ":" + y.BlockType + ":" + y.Text); 58 | if(y.BlockType == "KEY_VALUE_SET") { 59 | Console.WriteLine(" "); 60 | PrintBlock(y); 61 | Console.WriteLine(" "); 62 | } else if(y.BlockType == "TABLE") { 63 | Console.WriteLine(" "); 64 | PrintBlock(y); 65 | Console.WriteLine("
"); 66 | } else if(y.BlockType == "CELL") { 67 | Console.WriteLine(" "); 68 | PrintBlock(y); 69 | Console.WriteLine(" "); 70 | } 71 | Console.WriteLine("
"); 72 | }); 73 | } 74 | private void PrintBlock(Block block) { 75 | Console.WriteLine(" "); 76 | block.EntityTypes.ForEach(z => Console.WriteLine(" " + z)); 77 | Console.WriteLine(" "); 78 | block.Relationships.ForEach(z => { 79 | Console.WriteLine(" "); 80 | Console.WriteLine(" " + z.Type); 81 | Console.WriteLine(" "); 82 | z.Ids.ForEach(a => Console.WriteLine(" " + a)); 83 | Console.WriteLine(" "); 84 | Console.WriteLine(" "); 85 | }); 86 | } 87 | } 88 | } -------------------------------------------------------------------------------- /src-csharp/Services/TextractTextDetectionService.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.IO; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | using Amazon.Textract; 7 | using Amazon.Textract.Model; 8 | 9 | namespace Dotnet_Core.Services { 10 | public class TextractTextDetectionService { 11 | private IAmazonTextract textract; 12 | public TextractTextDetectionService(IAmazonTextract textract) { 13 | this.textract = textract; 14 | } 15 | 16 | public async Task StartDocumentTextDetection(string bucketName, string key) { 17 | var request = new StartDocumentTextDetectionRequest(); 18 | request.DocumentLocation = new DocumentLocation { 19 | S3Object = new S3Object { 20 | Bucket = bucketName, 21 | Name = key 22 | } 23 | }; 24 | var response = await this.textract.StartDocumentTextDetectionAsync(request); 25 | return response.JobId; 26 | } 27 | 28 | public async Task DetectTextLocal(string localPath) { 29 | var result = new DetectDocumentTextResponse(); 30 | 31 | if(File.Exists(localPath)) { 32 | var request = new DetectDocumentTextRequest(); 33 | request.Document = new Document { 34 | Bytes = new MemoryStream(File.ReadAllBytes(localPath)) 35 | }; 36 | return await this.textract.DetectDocumentTextAsync(request); 37 | } 38 | Console.WriteLine("File: " + localPath + " doesn't exist"); 39 | return result; 40 | } 41 
| 42 | public void WaitForJobCompletion(string jobId, int delay = 5000) { 43 | while(!IsJobComplete(jobId)) { 44 | this.Wait(delay); 45 | } 46 | } 47 | 48 | public bool IsJobComplete(string jobId) { 49 | var response = this.textract.GetDocumentTextDetectionAsync(new GetDocumentTextDetectionRequest { 50 | JobId = jobId 51 | }); 52 | response.Wait(); 53 | return !response.Result.JobStatus.Equals("IN_PROGRESS"); 54 | } 55 | 56 | public List GetJobResults(string jobId) { 57 | var result = new List(); 58 | var response = this.textract.GetDocumentTextDetectionAsync(new GetDocumentTextDetectionRequest { 59 | JobId = jobId 60 | }); 61 | response.Wait(); 62 | result.Add(response.Result); 63 | var nextToken = response.Result.NextToken; 64 | while(nextToken != null) { 65 | this.Wait(); 66 | response = this.textract.GetDocumentTextDetectionAsync(new GetDocumentTextDetectionRequest { 67 | JobId = jobId, 68 | NextToken = response.Result.NextToken 69 | }); 70 | response.Wait(); 71 | result.Add(response.Result); 72 | nextToken = response.Result.NextToken; 73 | } 74 | return result; 75 | } 76 | 77 | private void Wait(int delay = 5000) { 78 | Task.Delay(delay).Wait(); 79 | Console.Write("."); 80 | } 81 | 82 | public async Task DetectTextS3(string bucketName, string key) { 83 | var result = new DetectDocumentTextResponse(); 84 | var s3Object = new S3Object { 85 | Bucket = bucketName, 86 | Name = key 87 | }; 88 | var request = new DetectDocumentTextRequest(); 89 | request.Document = new Document { 90 | S3Object = s3Object 91 | }; 92 | return await this.textract.DetectDocumentTextAsync(request); 93 | } 94 | 95 | private void Print(List blocks) { 96 | blocks.ForEach(x => { 97 | if(x.BlockType.Equals("LINE")) { 98 | Console.WriteLine(x.Text); 99 | } 100 | }); 101 | } 102 | 103 | public void Print(DetectDocumentTextResponse response) { 104 | if(response != null) { 105 | this.Print(response.Blocks); 106 | } 107 | } 108 | 109 | public void Print(List response) { 110 | if(response != null && 
response.Count > 0) { 111 | response.ForEach(r => this.Print(r.Blocks)); 112 | } 113 | } 114 | 115 | public List GetLines(DetectDocumentTextResponse result) { 116 | var lines = new List(); 117 | result.Blocks.FindAll(block => block.BlockType == "LINE").ForEach(block => lines.Add(block.Text)); 118 | return lines; 119 | } 120 | } 121 | } -------------------------------------------------------------------------------- /src-csharp/Services/TranslateService.cs: -------------------------------------------------------------------------------- 1 | using System.Threading.Tasks; 2 | using Amazon.Translate; 3 | using Amazon.Translate.Model; 4 | 5 | namespace Dotnet_Core.Services { 6 | public class TranslateService { 7 | private IAmazonTranslate translate; 8 | public TranslateService(IAmazonTranslate translate) { 9 | this.translate = translate; 10 | } 11 | 12 | public async Task TranslateText(string text, string sourceLanguage, string targetLanguage) { 13 | var request = new TranslateTextRequest { 14 | SourceLanguageCode = sourceLanguage, 15 | TargetLanguageCode = targetLanguage, 16 | Text = text 17 | }; 18 | 19 | return await this.translate.TranslateTextAsync(request); 20 | } 21 | } 22 | } -------------------------------------------------------------------------------- /src-csharp/TextractExtensions/Cell.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using Amazon.Textract.Model; 3 | 4 | namespace Amazon.Textract.Model { 5 | public class Cell { 6 | public Cell(Block block, Dictionary blocks) { 7 | if(block == null) 8 | return; 9 | this.Block = block; 10 | this.ColumnIndex = block.ColumnIndex; 11 | this.ColumnSpan = block.ColumnSpan; 12 | this.Confidence = block.Confidence; 13 | this.Content = new List(); 14 | this.Geometry = block.Geometry; 15 | this.Id = block.Id; 16 | this.RowIndex = block.RowIndex; 17 | this.RowSpan = block.RowSpan; 18 | this.Text = string.Empty; 19 | 20 | var relationships = 
block.Relationships; 21 | if(relationships != null && relationships.Count > 0) { 22 | relationships.ForEach(r => { 23 | if(r.Type == "CHILD") { 24 | r.Ids.ForEach(id => { 25 | var rb = blocks[id]; 26 | if(rb != null && rb.BlockType == "WORD") { 27 | var w = new Word(rb, blocks); 28 | this.Content.Add(w); 29 | this.Text = this.Text + w.Text + " "; 30 | } else if(rb != null && rb.BlockType == "SELECTION_ELEMENT") { 31 | var se = new SelectionElement(rb, blocks); 32 | this.Content.Add(se); 33 | this.Text = this.Text + se.SelectionStatus + ", "; 34 | } 35 | }); 36 | } 37 | 38 | }); 39 | } 40 | } 41 | public int RowIndex { get; set; } 42 | public int RowSpan { get; set; } 43 | public int ColumnIndex { get; set; } 44 | public int ColumnSpan { get; set; } 45 | public List Content { get; set; } 46 | public Block Block { get; set; } 47 | public float Confidence { get; set; } 48 | public Geometry Geometry { get; set; } 49 | public string Id { get; set; } 50 | public string Text { get; set; } 51 | 52 | public override string ToString() { 53 | return this.Text; 54 | } 55 | } 56 | } -------------------------------------------------------------------------------- /src-csharp/TextractExtensions/Field.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using Amazon.Textract.Model; 4 | 5 | namespace Amazon.Textract.Model { 6 | 7 | public class Field { 8 | public Field(Block block, Dictionary blocks) { 9 | var relationships = block.Relationships; 10 | if(relationships != null && relationships.Count > 0) { 11 | relationships.ForEach(r => { 12 | if(r.Type == "CHILD") { 13 | this.Key = new FieldKey(block, r.Ids, blocks); 14 | } else if(r.Type == "VALUE") { 15 | r.Ids.ForEach(id => { 16 | var v = blocks[id]; 17 | if(v.EntityTypes.Contains("VALUE")) { 18 | var vr = v.Relationships; 19 | if(vr != null && vr.Count > 0) { 20 | vr.ForEach(vc => { 21 | if(vc.Type == "CHILD") { 22 | this.Value = new 
FieldValue(v, vc.Ids, blocks); 23 | } 24 | }); 25 | } 26 | } 27 | }); 28 | } 29 | }); 30 | } 31 | } 32 | public FieldKey Key { get; set; } 33 | public FieldValue Value { get; set; } 34 | 35 | public override string ToString() { 36 | var k = this.Key == null ? string.Empty : this.Key.ToString(); 37 | var v = this.Value == null ? string.Empty : this.Value.ToString(); 38 | return string.Format(@" 39 | {0}Field{0}===={0} 40 | Key: {1}, Value: {2} 41 | ", Environment.NewLine, k, v); 42 | } 43 | } 44 | } -------------------------------------------------------------------------------- /src-csharp/TextractExtensions/FieldKey.cs: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | class FieldKey: 4 | def __init__(self, block, children, blockMap): 5 | self._block = block 6 | self._confidence = block['Confidence'] 7 | self._geometry = Geometry(block['Geometry']) 8 | self._id = block['Id'] 9 | self._text = "" 10 | self._content = [] 11 | 12 | t = [] 13 | 14 | for eid in children: 15 | wb = blockMap[eid] 16 | if(wb['BlockType'] == "WORD"): 17 | w = Word(wb, blockMap) 18 | self._content.append(w) 19 | t.append(w.text) 20 | 21 | if(t): 22 | self._text = ' '.join(t) 23 | 24 | def __str__(self): 25 | return self._text 26 | 27 | @property 28 | def confidence(self): 29 | return self._confidence 30 | 31 | @property 32 | def geometry(self): 33 | return self._geometry 34 | 35 | @property 36 | def id(self): 37 | return self._id 38 | 39 | @property 40 | def content(self): 41 | return self._content 42 | 43 | @property 44 | def text(self): 45 | return self._text 46 | 47 | @property 48 | def block(self): 49 | return self._block 50 | 51 | */ 52 | 53 | using System.Collections.Generic; 54 | using Amazon.Textract.Model; 55 | 56 | namespace Amazon.Textract.Model { 57 | public class FieldKey { 58 | public FieldKey(Block block, List children, Dictionary blocks) { 59 | this.Block = block; 60 | this.Confidence = block.Confidence; 61 | this.Geometry = 
block.Geometry; 62 | this.Id = block.Id; 63 | this.Text = string.Empty; 64 | this.Content = new List(); 65 | 66 | var words = new List(); 67 | 68 | if(children != null && children.Count > 0) { 69 | children.ForEach(c => { 70 | var wordBlock = blocks[c]; 71 | if(wordBlock.BlockType == "WORD") { 72 | var w = new Word(wordBlock, blocks); 73 | this.Content.Add(w); 74 | words.Add(w.Text); 75 | } 76 | }); 77 | } 78 | 79 | if(words.Count > 0) { 80 | this.Text = string.Join(" ", words); 81 | } 82 | 83 | } 84 | public List Content { get; set; } 85 | public Block Block { get; set; } 86 | public float Confidence { get; set; } 87 | public Geometry Geometry { get; set; } 88 | public string Id { get; set; } 89 | public string Text { get; set; } 90 | 91 | public override string ToString() { 92 | return Text; 93 | } 94 | } 95 | } -------------------------------------------------------------------------------- /src-csharp/TextractExtensions/FieldValue.cs: -------------------------------------------------------------------------------- 1 | 2 | 3 | /* 4 | 5 | class FieldValue: 6 | def __init__(self, block, children, blockMap): 7 | self._block = block 8 | self._confidence = block['Confidence'] 9 | self._geometry = Geometry(block['Geometry']) 10 | self._id = block['Id'] 11 | self._text = "" 12 | self._content = [] 13 | 14 | t = [] 15 | 16 | for eid in children: 17 | wb = blockMap[eid] 18 | if(wb['BlockType'] == "WORD"): 19 | w = Word(wb, blockMap) 20 | self._content.append(w) 21 | t.append(w.text) 22 | elif(wb['BlockType'] == "SELECTION_ELEMENT"): 23 | se = SelectionElement(wb, blockMap) 24 | self._content.append(se) 25 | self._text = se.selectionStatus 26 | 27 | if(t): 28 | self._text = ' '.join(t) 29 | 30 | def __str__(self): 31 | return self._text 32 | 33 | @property 34 | def confidence(self): 35 | return self._confidence 36 | 37 | @property 38 | def geometry(self): 39 | return self._geometry 40 | 41 | @property 42 | def id(self): 43 | return self._id 44 | 45 | @property 46 | def 
content(self): 47 | return self._content 48 | 49 | @property 50 | def text(self): 51 | return self._text 52 | 53 | @property 54 | def block(self): 55 | return self._block 56 | 57 | */ 58 | 59 | using System.Collections.Generic; 60 | using Amazon.Textract.Model; 61 | 62 | namespace Amazon.Textract.Model { 63 | public class FieldValue { 64 | public FieldValue(Block block, List children, Dictionary blocks) { 65 | this.Block = block; 66 | this.Confidence = block.Confidence; 67 | this.Geometry = block.Geometry; 68 | this.Id = block.Id; 69 | this.Text = string.Empty; 70 | this.Content = new List(); 71 | 72 | var words = new List(); 73 | if(children != null && children.Count > 0) { 74 | children.ForEach(c => { 75 | var wordBlock = blocks[c]; 76 | if(wordBlock.BlockType == "WORD") { 77 | var w = new Word(wordBlock, blocks); 78 | this.Content.Add(w); 79 | words.Add(w.Text); 80 | } else if(wordBlock.BlockType == "SELECTION_ELEMENT") { 81 | var selection = new SelectionElement(wordBlock, blocks); 82 | this.Content.Add(selection); 83 | words.Add(selection.SelectionStatus); 84 | } 85 | }); 86 | } 87 | 88 | if(words.Count > 0) { 89 | this.Text = string.Join(" ", words); 90 | } 91 | } 92 | public List Content { get; set; } 93 | public Block Block { get; set; } 94 | public float Confidence { get; set; } 95 | public Geometry Geometry { get; set; } 96 | public string Id { get; set; } 97 | public string Text { get; set; } 98 | 99 | public override string ToString() { 100 | return Text; 101 | } 102 | } 103 | } -------------------------------------------------------------------------------- /src-csharp/TextractExtensions/Form.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | 3 | namespace Amazon.Textract.Model { 4 | 5 | public class Form { 6 | public List Fields { get; set; } 7 | private Dictionary fieldMap; 8 | 9 | public Form() { 10 | this.Fields = new List(); 11 | this.fieldMap = new Dictionary(); 12 | } 13 
| 14 | public void AddField(Field field) { // Registers the field in the ordered Fields list and in the key-text lookup map. 15 | this.Fields.Add(field); 16 | this.fieldMap[field.Key.ToString()] = field; // indexer instead of Add: a repeated key label must not throw; the latest field wins 17 | } 18 | //public Field GetFieldByKey(string key) { 19 | // return this.fieldMap.GetValueOrDefault(key); 20 | //} 21 | 22 | public List SearchFieldsByKey(string key) { 23 | return this.Fields.FindAll(f => f.Key.ToString().ToLower().Contains(key.ToLower())); 24 | } 25 | 26 | public override string ToString() { 27 | return string.Join("\n", this.Fields); 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /src-csharp/TextractExtensions/Line.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using Amazon.Textract.Model; 4 | 5 | /* 6 | class Line: 7 | def __init__(self, block, blockMap): 8 | 9 | self._block = block 10 | self._confidence = block['Confidence'] 11 | self._geometry = Geometry(block['Geometry']) 12 | self._id = block['Id'] 13 | 14 | self._text = "" 15 | if(block['Text']): 16 | self._text = block['Text'] 17 | 18 | self._words = [] 19 | if('Relationships' in block and block['Relationships']): 20 | for rs in block['Relationships']: 21 | if(rs['Type'] == 'CHILD'): 22 | for cid in rs['Ids']: 23 | if(blockMap[cid]["BlockType"] == "WORD"): 24 | self._words.append(Word(blockMap[cid], blockMap)) 25 | def __str__(self): 26 | s = "Line\n==========\n" 27 | s = s + self._text + "\n" 28 | s = s + "Words\n----------\n" 29 | for word in self._words: 30 | s = s + "[{}]".format(str(word)) 31 | return s 32 | 33 | @property 34 | def confidence(self): 35 | return self._confidence 36 | 37 | @property 38 | def geometry(self): 39 | return self._geometry 40 | 41 | @property 42 | def id(self): 43 | return self._id 44 | 45 | @property 46 | def words(self): 47 | return self._words 48 | 49 | @property 50 | def text(self): 51 | return self._text 52 | 53 | @property 54 | def block(self): 55 | return self._block 56 | */ 
57 | 58 | namespace Amazon.Textract.Model { 59 | public class Line { 60 | public Line(Block block, Dictionary blocks) { // Builds a line from a LINE block, resolving its CHILD ids through 'blocks'. 61 | this.Block = block; 62 | this.Confidence = block.Confidence; 63 | this.Geometry = block.Geometry; 64 | this.Id = block.Id; 65 | this.Text = block.Text ?? string.Empty; // block is already dereferenced above, so only a missing Text needs a default 66 | this.Words = new List(); 67 | 68 | var relationships = block.Relationships; 69 | if(relationships != null && relationships.Count > 0) { 70 | relationships.ForEach(r => { 71 | if(r.Type == "CHILD") { 72 | r.Ids.ForEach(id => { 73 | var child = blocks[id]; // distinct name: does not shadow the constructor's 'block' parameter 74 | if(child.BlockType == "WORD") 75 | this.Words.Add(new Word(child, blocks)); 76 | // Children that are not WORD blocks are skipped; no empty 77 | // placeholder Word is added to the line. 78 | }); 79 | } 80 | }); 81 | } 82 | } 83 | 84 | public float Confidence { get; set; } 85 | public Geometry Geometry { get; set; } 86 | public string Id { get; set; } 87 | public List Words { get; set; } 88 | public string Text { get; set; } 89 | public Block Block { get; set; } 90 | 91 | public override string ToString() { 92 | return string.Format(@" 93 | Line{0}===={0} 94 | {1} {0} 95 | Words{0}----{0} 96 | {2}{0} 97 | ---- 98 | ", Environment.NewLine, this.Text, string.Join(", ", this.Words)); 99 | } 100 | } 101 | } -------------------------------------------------------------------------------- /src-csharp/TextractExtensions/NewBoundingBox.cs: -------------------------------------------------------------------------------- 1 | using Amazon.Textract.Model; 2 | 3 | namespace Amazon.Textract.Model { 4 | public class NewBoundingBox : BoundingBox { 5 | public NewBoundingBox(float width, float height, float left, float top) : base() { 6 | this.Width = width; 7 | this.Height = height; 8 | this.Left = left; 9 | this.Top = top; 10 | } 11 | 12 | public override string ToString() { 13 | return string.Format("width: {0}, height: {1}, left: {2}, top: {3}", Width, Height, Left, Top); 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- 
/src-csharp/TextractExtensions/NewGeometry.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using Amazon.Textract.Model; 4 | 5 | namespace Amazon.Textract.Model { 6 | public class NewGeometry : Geometry { 7 | 8 | public NewGeometry(Geometry geometry) : base() { 9 | this.BoundingBox = geometry.BoundingBox; 10 | this.Polygon = geometry.Polygon; 11 | var bb = new NewBoundingBox(this.BoundingBox.Width, this.BoundingBox.Height, this.BoundingBox.Left, this.BoundingBox.Top); 12 | var pgs = new List(); 13 | Polygon.ForEach(pg => pgs.Add(new Point { 14 | X = pg.X, 15 | Y = pg.Y 16 | })); 17 | 18 | BoundingBox = bb; 19 | Polygon = pgs; 20 | } 21 | 22 | public override string ToString() { 23 | return string.Format("BoundingBox: {0}{1}", BoundingBox, Environment.NewLine); 24 | } 25 | 26 | 27 | } 28 | } -------------------------------------------------------------------------------- /src-csharp/TextractExtensions/Page.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using Amazon.Textract.Model; 4 | 5 | namespace Amazon.Textract.Model { 6 | 7 | public class Page { 8 | public Page(List blocks, Dictionary blockMap) { 9 | this.Blocks = blocks; 10 | this.Text = string.Empty; 11 | this.Lines = new List(); 12 | this.Form = new Form(); 13 | this.Tables = new List(); 14 | this.Content = new List(); 15 | 16 | blocks.ForEach(b => { 17 | if(b.BlockType == "PAGE") { 18 | this.Geometry = new NewGeometry(b.Geometry); 19 | this.Id = b.Id; 20 | } else if(b.BlockType == "LINE") { 21 | var l = new Line(b, blockMap); 22 | this.Lines.Add(l); 23 | this.Content.Add(l); 24 | this.Text = this.Text + l.Text + Environment.NewLine; 25 | } else if(b.BlockType == "TABLE") { 26 | var t = new Table(b, blockMap); 27 | this.Tables.Add(t); 28 | this.Content.Add(t); 29 | } else if(b.BlockType == "KEY_VALUE_SET") { 30 | 
if(b.EntityTypes.Contains("KEY")) { 31 | var f = new Field(b, blockMap); 32 | if(f.Key != null) { 33 | this.Form.AddField(f); 34 | this.Content.Add(f); 35 | } 36 | } 37 | } 38 | }); 39 | 40 | } 41 | 42 | public List GetLinesInReadingOrder() { // Assigns each line to a column by horizontal overlap and returns the lines grouped column by column. 43 | var lines = new List(); 44 | var columns = new List(); 45 | this.Lines.ForEach(line => { 46 | var columnFound = false; 47 | for(var index = 0; index < columns.Count; index++) { 48 | var column = columns[index]; 49 | var bb = line.Geometry.BoundingBox; 50 | var bbLeft = bb.Left; 51 | var bbRight = bb.Left + bb.Width; 52 | var bbCentre = bb.Left + (bb.Width / 2); 53 | var columnCentre = column.Left + ((column.Right - column.Left) / 2); // Right is an absolute coordinate, so halve the column width, not Right itself 54 | 55 | if((bbCentre > column.Left && bbCentre < column.Right) || (columnCentre > bbLeft && columnCentre < bbRight)) { 56 | lines.Add(new IndexedText { ColumnIndex = index, Text = line.Text }); 57 | columnFound = true; 58 | break; 59 | } 60 | } 61 | if(!columnFound) { 62 | var bb = line.Geometry.BoundingBox; 63 | columns.Add(new Column { Left = bb.Left, Right = bb.Left + bb.Width }); 64 | lines.Add(new IndexedText { ColumnIndex = columns.Count - 1, Text = line.Text }); 65 | } 66 | }); 67 | lines.FindAll(line => line.ColumnIndex == 0).ForEach(line => Console.WriteLine(line)); 68 | var ordered = new List<IndexedText>(); for(var c = 0; c < columns.Count; c++) { ordered.AddRange(lines.FindAll(l => l.ColumnIndex == c)); } return ordered; // stable column-major order: all of column 0, then column 1, and so on 69 | } 70 | 71 | public string GetTextInReadingOrder() { 72 | var lines = this.GetLinesInReadingOrder(); 73 | var text = string.Empty; 74 | lines.ForEach(line => { 75 | text = text + line.Text + "\n"; 76 | }); 77 | return text; 78 | } 79 | 80 | 81 | public List Blocks { get; set; } 82 | public string Text { get; set; } 83 | public List Lines { get; set; } 84 | public Form Form { get; set; } 85 | public List
Tables { get; set; } 86 | public List Content { get; set; } 87 | public Geometry Geometry { get; set; } 88 | public string Id { get; set; } 89 | 90 | public override string ToString() { 91 | var result = new List(); 92 | result.Add(string.Format("Page{0}===={0}", Environment.NewLine)); 93 | this.Content.ForEach(c => { 94 | result.Add($"{Environment.NewLine}{c}"); 95 | }); 96 | return string.Join("", result); 97 | } 98 | 99 | public class Column { 100 | public float Left { get; set; } 101 | public float Right { get; set; } 102 | 103 | public override string ToString() { 104 | return string.Format("Left: {0}, Right :{1}", this.Left, this.Right); 105 | } 106 | } 107 | 108 | public class IndexedText { 109 | public int ColumnIndex { get; set; } 110 | public string Text { get; set; } 111 | 112 | public override string ToString() { 113 | return string.Format("[{0}] {1}", this.ColumnIndex, this.Text); 114 | } 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src-csharp/TextractExtensions/Row.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | 3 | namespace Amazon.Textract.Model { 4 | public class Row { 5 | public Row() { 6 | this.Cells = new List(); 7 | } 8 | public List Cells { get; set; } 9 | 10 | public override string ToString() { 11 | var result = new List(); 12 | this.Cells.ForEach(c => { 13 | result.Add(string.Format("[{0}]", c)); 14 | }); 15 | return string.Join("", result); 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /src-csharp/TextractExtensions/SelectionElement.cs: -------------------------------------------------------------------------------- 1 | /* 2 | class SelectionElement: 3 | def __init__(self, block, blockMap): 4 | self._confidence = block['Confidence'] 5 | self._geometry = Geometry(block['Geometry']) 6 | self._id = block['Id'] 7 | self._selectionStatus = 
block['SelectionStatus']

    @property
    def confidence(self):
        return self._confidence

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def selectionStatus(self):
        return self._selectionStatus
*/

using System.Collections.Generic;
using Amazon.Textract.Model;

namespace Amazon.Textract.Model {
    /// <summary>
    /// Read-only style view over a single Textract block that carries a
    /// selection status; copies the fields of interest off the block.
    /// </summary>
    public class SelectionElement {
        /// <summary>
        /// Copies confidence, geometry, id and selection status from <paramref name="block"/>.
        /// </summary>
        /// <param name="block">Source block; must not be null (it is dereferenced directly).</param>
        /// <param name="blocks">
        /// Block lookup keyed by block id. Not used by this type; accepted so the
        /// constructor has the same shape as the other wrappers in this namespace.
        /// </param>
        public SelectionElement(Block block, Dictionary<string, Block> blocks) {
            this.Confidence = block.Confidence;
            this.Geometry = block.Geometry;
            this.Id = block.Id;
            this.SelectionStatus = block.SelectionStatus;
        }

        public float Confidence { get; set; }
        public Geometry Geometry { get; set; }
        public string Id { get; set; }
        public string SelectionStatus { get; set; }
    }
}

--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/Table.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using Amazon.Textract.Model;

namespace Amazon.Textract.Model {
    /// <summary>
    /// Groups the child cells of a table block into <see cref="Row"/> objects.
    /// </summary>
    public class Table {
        /// <summary>
        /// Builds the table by walking the block's CHILD relationships and
        /// starting a new row every time a cell reports a higher row index
        /// than the row currently being filled.
        /// </summary>
        /// <param name="block">The table block; must not be null.</param>
        /// <param name="blocks">
        /// Block lookup keyed by block id. Every id listed in a CHILD
        /// relationship is resolved through the dictionary indexer, so a
        /// missing id raises <see cref="KeyNotFoundException"/>.
        /// </param>
        public Table(Block block, Dictionary<string, Block> blocks) {
            this.Block = block;
            this.Confidence = block.Confidence;
            this.Geometry = block.Geometry;
            this.Id = block.Id;
            this.Rows = new List<Row>();

            var currentRowIndex = 1;
            var currentRow = new Row();

            var relationships = block.Relationships;
            if (relationships != null) {
                foreach (var relationship in relationships) {
                    if (relationship.Type == "CHILD" && relationship.Ids != null) {
                        foreach (var childId in relationship.Ids) {
                            var cell = new Cell(blocks[childId], blocks);
                            if (cell.RowIndex > currentRowIndex) {
                                // Close the row being filled. Skip it when it is
                                // still empty (first cell already beyond row 1)
                                // so no phantom empty row is emitted.
                                if (currentRow.Cells.Count > 0)
                                    this.Rows.Add(currentRow);
                                currentRow = new Row();
                                currentRowIndex = cell.RowIndex;
                            }
                            currentRow.Cells.Add(cell);
                        }
                    }
                }
            }

            // Flush the trailing row exactly once, after every relationship has
            // been consumed. Doing this inside the relationship loop would add
            // the same Row instance again for each additional CHILD relationship.
            if (currentRow.Cells.Count > 0)
                this.Rows.Add(currentRow);
        }

        public List<Row> Rows { get; set; }
        public Block Block { get; set; }
        public float Confidence { get; set; }
        public Geometry Geometry { get; set; }
        public string Id { get; set; }

        /// <summary>
        /// Renders a "Table" header followed by one "Row" section per row,
        /// each using the row's own string representation.
        /// </summary>
        public override string ToString() {
            var parts = new List<string>();
            parts.Add(string.Format("Table{0}===={0}", Environment.NewLine));
            foreach (var row in this.Rows) {
                parts.Add(string.Format("Row{0}===={0}{1}{0}", Environment.NewLine, row));
            }
            return string.Join("", parts);
        }
    }
}

--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/TextractDocument.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using Amazon.Textract.Model;

namespace Amazon.Textract.Model {
    /// <summary>
    /// Wraps a document-analysis response: indexes every block by id and
    /// splits the flat block list into per-page groups, each exposed as a
    /// <see cref="Page"/>.
    /// </summary>
    public class TextractDocument {
        // Block id -> block, filled while scanning the response.
        private readonly Dictionary<string, Block> blockMap = new Dictionary<string, Block>();
        // One inner list per page, in the order the blocks were encountered.
        private readonly List<List<Block>> documentPages = new List<List<Block>>();

        /// <summary>
        /// Indexes and parses the given response.
        /// </summary>
        /// <param name="response">The analysis response to wrap.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="response"/> is null.
        /// </exception>
        public TextractDocument(GetDocumentAnalysisResponse response) {
            if (response == null)
                throw new ArgumentNullException(nameof(response));

            this.Pages = new List<Page>();
            this.ResponsePages = new List<GetDocumentAnalysisResponse>();
            this.ResponsePages.Add(response);
            this.ParseDocumentPagesAndBlockMap();
            this.Parse();
        }

        /// <summary>
        /// Registers every block in the id map and groups blocks into pages.
        /// A block of type "PAGE" starts a new group; any blocks seen before
        /// the first "PAGE" block are collected into a group of their own.
        /// </summary>
        private void ParseDocumentPagesAndBlockMap() {
            List<Block> currentPage = null;

            foreach (var response in this.ResponsePages) {
                // Nothing to index when the response carries no block list.
                if (response == null || response.Blocks == null)
                    continue;

                foreach (var block in response.Blocks) {
                    // Add (not the indexer) on purpose: a duplicate id raises
                    // ArgumentException instead of silently replacing a block.
                    this.blockMap.Add(block.Id, block);

                    if (block.BlockType == "PAGE") {
                        // Close the previous group before opening a new one.
                        if (currentPage != null)
                            this.documentPages.Add(currentPage);
                        currentPage = new List<Block>();
                    } else if (currentPage == null) {
                        currentPage = new List<Block>();
                    }
                    currentPage.Add(block);
                }
            }

            if (currentPage != null)
                this.documentPages.Add(currentPage);
        }

        /// <summary>
        /// Builds one <see cref="Page"/> per collected block group.
        /// </summary>
        private void Parse() {
            foreach (var pageBlocks in this.documentPages) {
                this.Pages.Add(new Page(pageBlocks, this.blockMap));
            }
        }

        /// <summary>
        /// Returns the block registered under <paramref name="blockId"/>.
        /// </summary>
        /// <exception cref="KeyNotFoundException">No block has that id.</exception>
        public Block GetBlockById(string blockId) {
            return this.blockMap[blockId];
        }

        public List<GetDocumentAnalysisResponse> ResponsePages { get; set; }
        public List<Page> Pages { get; set; }

        /// <summary>The raw per-page block groups backing <see cref="Pages"/>.</summary>
        public List<List<Block>> PageBlocks {
            get {
                return this.documentPages;
            }
        }
    }
}

--------------------------------------------------------------------------------
/src-csharp/TextractExtensions/Word.cs:
--------------------------------------------------------------------------------
/*
class Word:
    def __init__(self, block, blockMap):
        self._block = block
        self._confidence = block['Confidence']
        self._geometry = Geometry(block['Geometry'])
        self._id = block['Id']
        self._text = ""
        if(block['Text']):
            self._text = block['Text']

    def __str__(self):
        return self._text

    @property
    def confidence(self):
        return self._confidence

    @property
    def geometry(self):
        return self._geometry

    @property
    def id(self):
        return self._id

    @property
    def text(self):
        return self._text

    @property
    def block(self):
        return self._block
*/

using System.Collections.Generic;
using Amazon.Textract.Model;

namespace Amazon.Textract.Model {
    /// <summary>
    /// Wrapper around a single block exposing its text, id, geometry and
    /// confidence. Tolerates a null block by falling back to empty defaults.
    /// </summary>
    public class Word {
        /// <summary>
        /// Copies the fields of interest from <paramref name="block"/>.
        /// </summary>
        /// <param name="block">Source block; null yields an empty word.</param>
        /// <param name="blocks">
        /// Block lookup keyed by block id; null is replaced by an empty dictionary.
        /// </param>
        public Word(Block block, Dictionary<string, Block> blocks) {
            this.Block = block ?? new Block();
            this.Blocks = blocks ?? new Dictionary<string, Block>();
            this.Confidence = block == null ? 0 : block.Confidence;
            this.Geometry = block == null ? new Geometry() : block.Geometry;
            this.Id = block == null ? string.Empty : block.Id;
            // Mirror the Python reference above: a block without text yields
            // "", never null, so ToString() cannot return null.
            this.Text = (block == null ? null : block.Text) ?? string.Empty;
        }

        public Block Block { get; set; }
        public Dictionary<string, Block> Blocks { get; set; }
        public float Confidence { get; set; }
        public Geometry Geometry { get; set; }
        public string Id { get; set; }
        public string Text { get; set; }

        /// <summary>Returns the word's text.</summary>
        public override string ToString() {
            return Text;
        }
    }
}

--------------------------------------------------------------------------------
/src-csharp/appsettings.json:
--------------------------------------------------------------------------------
{
  "AWS": {
    "Profile": "default",
    "Region": "us-west-2"
  }
}

--------------------------------------------------------------------------------
/src-csharp/dotnet-core.csproj:
--------------------------------------------------------------------------------
Exe
netcoreapp2.2
Dotnet_Core

Always

Always

--------------------------------------------------------------------------------
/src-csharp/test-files/Amazon-Textract-Pdf.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/src-csharp/test-files/Amazon-Textract-Pdf.pdf

--------------------------------------------------------------------------------
/src-csharp/test-files/employmentapp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/src-csharp/test-files/employmentapp.png

--------------------------------------------------------------------------------
/src-csharp/test-files/expense.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/src-csharp/test-files/expense.png -------------------------------------------------------------------------------- /src-csharp/test-files/medical-notes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/src-csharp/test-files/medical-notes.png -------------------------------------------------------------------------------- /src-csharp/test-files/redacted-employmentapp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/src-csharp/test-files/redacted-employmentapp.png -------------------------------------------------------------------------------- /src-csharp/test-files/simple-document-image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/src-csharp/test-files/simple-document-image.jpg -------------------------------------------------------------------------------- /src-csharp/test-files/two-column-image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-code-samples/3fb9e1cd3c0a8035aa623f0efe79a16a310f758a/src-csharp/test-files/two-column-image.jpg --------------------------------------------------------------------------------