├── Statement20230428.pdf ├── pipeline_architecture.png ├── CODE_OF_CONDUCT.md ├── classification.csv ├── src ├── cbaSavingsRowProcessor.py ├── rowProcessor.py ├── startJob.py ├── inputHandler.py ├── cbaCCRowProcessor.py ├── apiRequestHandler.py └── getResults.py ├── LICENSE ├── README.md ├── CONTRIBUTING.md ├── frontend └── index.html └── cloudFormation.yml /Statement20230428.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/textract-bank-statement-processor/HEAD/Statement20230428.pdf -------------------------------------------------------------------------------- /pipeline_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/textract-bank-statement-processor/HEAD/pipeline_architecture.png -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
-------------------------------------------------------------------------------- /classification.csv: -------------------------------------------------------------------------------- 1 | key,type,subtype 2 | paypal,Online,Paypal 3 | subway,Entertainment,Eating out 4 | noodle & sushi,Entertainment,Eating out 5 | vic roads,Transport,Car 6 | citylink,Transport,Toll 7 | pizza,Entertainment,Eating out 8 | wage,Employment,Acme Corporation 9 | bp,Transport,Petrol 10 | red rooster,Entertainment,Eating out 11 | 7-eleven,Groceries,7-Eleven 12 | belmont wines,Entertainment,Liquor 13 | beatie service,Transport,Car 14 | kfc,Entertainment,Eating out 15 | telstra,Utilities,Mobile 16 | wdl,Cash,ATM 17 | safeway,Groceries,Safeway 18 | picture box,Home,Artwork 19 | salvos,Clothing,Salvos 20 | cignall,Entertainment,Tobacco 21 | sw petrol,Transport,Petrol 22 | service centre,Transport,Car 23 | coles,Groceries,Coles 24 | kmart,Home,KMart 25 | bunnings,Home,Bunnings 26 | fee,Fees,Transaction fee 27 | transfer,Transfer,Internal Transfer 28 | -------------------------------------------------------------------------------- /src/cbaSavingsRowProcessor.py: -------------------------------------------------------------------------------- 1 | """Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | SPDX-License-Identifier: MIT-0 3 | """ 4 | from src.rowProcessor import TableRowProcessor 5 | 6 | class CBASavingsRowProcessor(TableRowProcessor): 7 | 8 | def process_row(self, row, statement_type, statement_name): 9 | print("Processing a CBA savings row") 10 | print(row) 11 | row['Date'] = super().get_date(statement_type, statement_name, row['Date']) 12 | if row['Credit']: 13 | row['Credit'] = super().convert_str_to_float(row['Credit']) 14 | row['Debit'] = 0 15 | if row['Debit']: 16 | row['Debit'] = super().convert_str_to_float(row['Debit']) 17 | row['Credit'] = 0 18 | del row['Balance'] 19 | row_no_blank_keys = {k: v for k, v in row.items() if k} 20 | print(row_no_blank_keys) 21 | return row_no_blank_keys 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 4 | software and associated documentation files (the "Software"), to deal in the Software 5 | without restriction, including without limitation the rights to use, copy, modify, 6 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /src/rowProcessor.py: -------------------------------------------------------------------------------- 1 | """Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | SPDX-License-Identifier: MIT-0 3 | """ 4 | from abc import ABC, abstractmethod 5 | from re import sub 6 | 7 | class TableRowProcessor(ABC): 8 | 9 | @abstractmethod 10 | def process_row(self, row, statement_type, statement_name): 11 | """Processes a row in a table extracted 12 | by Textract, into an object in the standard 13 | transaction format 14 | """ 15 | pass 16 | 17 | def get_date(self, statement_type, filename, date): 18 | year = int(filename[9:13]) 19 | if (statement_type == 'cba_bank'): 20 | mon = date[3:6] 21 | mon_last_year = ['Oct', 'Nov', 'Dec'] 22 | if mon in mon_last_year: 23 | year = year - 1 24 | return date + ' ' + str(year) 25 | elif (statement_type == 'cba_cc'): 26 | year = int(filename[9:13]) 27 | return date + ' ' + str(year) 28 | else: 29 | return date 30 | 31 | def convert_str_to_float(self, value): 32 | try: 33 | return float(sub(r'[^\d.]', '', value)) 34 | except Exception as e: 35 | print(e) 36 | return 0.0 37 | -------------------------------------------------------------------------------- /src/startJob.py: -------------------------------------------------------------------------------- 1 | """Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | SPDX-License-Identifier: MIT-0 3 | """ 4 | import json 5 | import time 6 | import boto3 7 | import sys 8 | 9 | textract_client = boto3.client('textract') 10 | s3_client = boto3.client('s3') 11 | 12 | def startJob(bucket_name, object_name): 13 | response = textract_client.start_document_analysis( 14 | DocumentLocation={ 15 | 'S3Object': { 16 | 'Bucket': bucket_name, 17 | 'Name': object_name, 18 | } 19 | }, 20 | FeatureTypes=['TABLES'], 21 | ) 22 | 23 | return response["JobId"] 24 | 25 | def handle(event, context): 26 | if event['eventSource'] != 'aws:s3': 27 | print("ERROR: Unexpected event type") 28 | print(json.dumps(event)) 29 | raise ValueError("ERROR: Unexpected event type") 30 | 31 | bucket_name = event['s3']['bucket']['name'] 32 | key = event['s3']['object']['key'] 33 | 34 | response = s3_client.head_object(Bucket=bucket_name, Key=key) 35 | 36 | print('Response: {}'.format(response)) 37 | 38 | print(f"StartJob: s3://{bucket_name}/{key}") 39 | statement_type = response['Metadata']['statement_type'] 40 | 41 | job_id = startJob(bucket_name, key) 42 | print(f"JobId: {job_id}") 43 | 44 | return { 45 | "bucket_name": bucket_name, 46 | "object_name": key, 47 | "job_id": job_id, 48 | "job_start_timestamp": time.time(), 49 | "statement_type": statement_type, 50 | } 51 | 52 | if __name__ == "__main__": 53 | import sys 54 | with open(sys.argv[1], "rt") as f: 55 | event = json.load(f) 56 | ret = handler(event, {}) 57 | print(json.dumps(ret, indent=2)) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # textract-statement-processor 2 | 3 | A sample pipeline that takes as input bank statements, extracts transaction information from 4 | tables within the statements using Textract, stores, and classifies each transaction. 5 | 6 | 7 | ![Architecture](pipeline_architecture.png) 8 | 9 | 1. 
PDF bank statements that have been scanned, or downloaded from an online banking application, are uploaded to the Landing bucket in S3 10 | 2. The landing of the file in the S3 bucket triggers a Lambda function that starts the step function 11 | 3. The Lambda function starts the step function execution 12 | 4. The first step in the step function calls a Lambda to start a new Textract document analysis job 13 | 5. A new document analysis job is invoked with the uploaded PDF 14 | 6. The step function periodically calls a Lambda to get the job results 15 | 7. The Lambda checks with Textract, using the job identifier, whether the analysis job is complete 16 | 8. When the analysis job is complete the Lambda takes the output of the job, extracts the tabular data, and processes the transaction records into a JSON file which it then saves in the Processed bucket in S3. 17 | 9. An API Lambda queries the JSON files stored in the S3 bucket in response to a request from the API gateway. An additional classification step at this point classifies each transaction into a type and sub-type based on user configurable classification rules. 18 | 10. The API Gateway serves a RESTful API that a web frontend consumes to visualise transaction data 19 | 11. Finally, the visualisation output of multiple years worth of classified transaction data is visualised within a Sankey diagram as shown below, allowing users to see at a glance income vs expenditure. 20 | 12. ML models can be trained and run against historical transaction data 21 | -------------------------------------------------------------------------------- /src/inputHandler.py: -------------------------------------------------------------------------------- 1 | """Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | SPDX-License-Identifier: MIT-0 3 | """ 4 | import boto3 5 | import os 6 | import random 7 | import json 8 | import csv 9 | from re import sub 10 | 11 | client = boto3.client('stepfunctions') 12 | s3_client = boto3.client('s3') 13 | 14 | def convert_str_to_float(value): 15 | try: 16 | return float(sub(r'[^\d.]', '', value)) 17 | except Exception as e: 18 | print(e) 19 | return 0.0 20 | 21 | def handle(event, context): 22 | print(event) 23 | bucket_name = event['Records'][0]['s3']['bucket']['name'] 24 | key = event['Records'][0]['s3']['object']['key'] 25 | response = s3_client.head_object(Bucket=bucket_name, Key=key) 26 | print('Response: {}'.format(response)) 27 | statement_type = response['Metadata']['statement_type'] 28 | if statement_type == 'ing': 29 | handle_csv_statement(bucket_name, key) 30 | elif statement_type == 'cba_cc' or statement_type == 'cba_bank': 31 | stateMachineARN = os.environ['statemachine_arn'] 32 | response = client.start_execution( 33 | stateMachineArn=stateMachineARN, 34 | name='test-sf'+str(random.randint(10, 100000)), 35 | input=json.dumps(event['Records'][0]) 36 | ) 37 | 38 | def handle_csv_statement(bucket_name, key): 39 | print('Handle CSV statement') 40 | response = s3_client.get_object(Bucket=bucket_name, Key=key) 41 | contents = response['Body'].read().decode('utf-8').splitlines() 42 | 43 | result = [] 44 | csvReader = csv.DictReader(contents) 45 | for row in csvReader: 46 | result.append(row) 47 | 48 | for transaction in result: 49 | transaction['Debit'] = convert_str_to_float(transaction['Debit'].lstrip('-')) 50 | transaction['Credit'] = convert_str_to_float(transaction['Credit']) 51 | 52 | output_bucket = os.environ['OUTPUT_BUCKET'] 53 | output_prefix = os.environ['OUTPUT_PREFIX'] 54 | 55 | output_object_base = os.path.join(output_prefix, os.path.abspath(key)) 56 | 57 | output_object = f"{output_object_base}.json" 58 | s3_client.put_object( 59 | Bucket=output_bucket, 60 | Key=output_object, 61 | 
Body=(bytes(json.dumps(result).encode('UTF-8'))), 62 | ServerSideEncryption='AES256', 63 | ContentType='application/json', 64 | ) 65 | print(f"File saved to: s3://{output_bucket}/{output_object}") 66 | 67 | -------------------------------------------------------------------------------- /src/cbaCCRowProcessor.py: -------------------------------------------------------------------------------- 1 | """Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | SPDX-License-Identifier: MIT-0 3 | """ 4 | from src.rowProcessor import TableRowProcessor 5 | 6 | class CBACCRowProcessor(TableRowProcessor): 7 | 8 | def process_row(self, row, statement_type, statement_name): 9 | print(row) 10 | print("Processing a CBA CC row") 11 | row['Date'] = super().get_date(statement_type, statement_name, row['Date']) 12 | if 'Transaction Details' in row: 13 | row['Transaction'] = row['Transaction Details'] 14 | del row['Transaction Details'] 15 | if row['Amount (A$)'].endswith('-'): 16 | credit = row['Amount (A$)'][:-1] 17 | row['Credit'] = super().convert_str_to_float(credit) 18 | row['Debit'] = 0 19 | del row['Amount (A$)'] 20 | else: 21 | row['Debit'] = super().convert_str_to_float(row['Amount (A$)']) 22 | row['Credit'] = 0 23 | del row['Amount (A$)'] 24 | if 'Transaction details' in row: 25 | row['Transaction'] = row['Transaction details'] 26 | del row['Transaction details'] 27 | if row['Amount (A$)'].endswith('-'): 28 | credit = row['Amount (A$)'][:-1] 29 | row['Credit'] = super().convert_str_to_float(credit) 30 | row['Debit'] = 0 31 | del row['Amount (A$)'] 32 | else: 33 | row['Debit'] = super().convert_str_to_float(row['Amount (A$)']) 34 | row['Credit'] = 0 35 | del row['Amount (A$)'] 36 | if 'Transaction details Amount (A$)' in row: 37 | tokens = row['Transaction details Amount (A$)'].split() 38 | amount = tokens[-1] 39 | row['Debit'] = super().convert_str_to_float(amount) 40 | row['Credit'] = 0 41 | row['Transaction'] = row['Transaction details Amount (A$)'] 42 | del 
row['Transaction details Amount (A$)'] 43 | if 'Transaction Details Amount (A$)' in row: 44 | tokens = row['Transaction Details Amount (A$)'].split() 45 | amount = tokens[-1] 46 | row['Debit'] = super().convert_str_to_float(amount) 47 | row['Credit'] = 0 48 | row['Transaction'] = row['Transaction Details Amount (A$)'] 49 | del row['Transaction Details Amount (A$)'] 50 | 51 | row_no_blank_keys = {k: v for k, v in row.items() if k} 52 | print(row_no_blank_keys) 53 | return row_no_blank_keys 54 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. 
You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 
55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. -------------------------------------------------------------------------------- /src/apiRequestHandler.py: -------------------------------------------------------------------------------- 1 | """Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | SPDX-License-Identifier: MIT-0 3 | """ 4 | import json 5 | import csv 6 | import boto3 7 | import os 8 | import re 9 | 10 | s3 = boto3.client('s3') 11 | bucket_name = os.environ['DATA_BUCKET'] 12 | 13 | classifications = [] 14 | with open ("classification.csv") as classification_csv: 15 | csv_reader = csv.DictReader(classification_csv) 16 | for row in csv_reader: 17 | classifications.append(row) 18 | 19 | def handle(event, context): 20 | objects = s3.list_objects_v2(Bucket=bucket_name) 21 | 22 | json_contents = [] 23 | for obj in objects['Contents']: 24 | key = obj['Key'] 25 | print(key) 26 | if key.endswith('.json'): # Only process JSON files 27 | response = s3.get_object(Bucket=bucket_name, Key=key) 28 | contents = response['Body'].read().decode('utf-8') 29 | print(contents) 30 | json_contents.extend(json.loads(contents)) 31 | 32 | type_total_dict = {} 33 | for transaction in json_contents: 34 | if 'Transaction' in transaction: 35 | classification = classify_transaction(transaction['Transaction']) 36 | type = classification['type'] 37 | sub_type = classification['subtype'] 38 | transaction['Type'] = type 39 | transaction['Subtype'] = sub_type 40 | 41 | try: 42 | if 'Credit' in transaction and transaction['Credit'] != "": 43 | if type in type_total_dict: 44 | if sub_type in type_total_dict[type]: 45 | type_total_dict[type][sub_type] = float("{:.2f}".format(type_total_dict[type][sub_type])) + get_amount(transaction['Credit']) 46 | else: 47 | type_total_dict[type][sub_type] = get_amount(transaction['Credit']) 48 | else: 49 | 
type_total_dict[type] = {} 50 | type_total_dict[type][sub_type] = get_amount(transaction['Credit']) 51 | if 'Debit' in transaction and transaction['Debit'] != "": 52 | if type in type_total_dict: 53 | if sub_type in type_total_dict[type]: 54 | type_total_dict[type][sub_type] = float("{:.2f}".format(type_total_dict[type][sub_type])) + get_amount(transaction['Debit']) 55 | else: 56 | type_total_dict[type][sub_type] = get_amount(transaction['Debit']) 57 | else: 58 | type_total_dict[type] = {} 59 | type_total_dict[type][sub_type] = get_amount(transaction['Debit']) 60 | except ValueError as ve: 61 | print(ve) 62 | 63 | print(type_total_dict) 64 | print(json.dumps(type_total_dict, indent = 4)) 65 | 66 | result = {'summary': type_total_dict, 'transactions': json_contents} 67 | 68 | return { 69 | 'statusCode': 200, 70 | 'headers': { 71 | 'Access-Control-Allow-Origin': '*' 72 | }, 73 | 'body': json.dumps(result) 74 | } 75 | 76 | def get_amount(amount_str): 77 | non_decimal = re.compile(r'[^\d.]+') 78 | return float("{:.2f}".format(float(non_decimal.sub('',str(amount_str)).replace(",", "")))) 79 | 80 | 81 | def classify_transaction(description): 82 | for classification in classifications: 83 | if classification['key'] in description.lower(): 84 | return classification 85 | return {'key': '', 'type': 'Unknown', 'subtype': 'Unclassified'} -------------------------------------------------------------------------------- /frontend/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Textract Statement Insight 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 |
17 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 |
DateDescriptionTypeSubtypeDebitCredit
34 |
35 | 36 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /src/getResults.py: -------------------------------------------------------------------------------- 1 | """Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | SPDX-License-Identifier: MIT-0 3 | """ 4 | import os 5 | import csv 6 | import io 7 | import json 8 | import time 9 | import boto3 10 | import csv 11 | from collections import defaultdict 12 | 13 | from src.cbaCCRowProcessor import CBACCRowProcessor 14 | from src.cbaSavingsRowProcessor import CBASavingsRowProcessor 15 | 16 | cba_cc_row_processor = CBACCRowProcessor() 17 | cba_savings_row_processor = CBASavingsRowProcessor() 18 | 19 | s3_client = boto3.client('s3') 20 | textract_client = boto3.client('textract') 21 | 22 | def generate_table_csv(table_result, blocks_map, table_index): 23 | rows = get_rows_columns_map(table_result, blocks_map) 24 | csv = '' 25 | for row_index, cols in rows.items(): 26 | for col_index, text in cols.items(): 27 | csv += '{}'.format(text).strip() + "|" 28 | csv += '\n' 29 | return csv 30 | 31 | def get_rows_columns_map(table_result, blocks_map): 32 | rows = {} 33 | for relationship in table_result['Relationships']: 34 | if relationship['Type'] == 'CHILD': 35 | for child_id in relationship['Ids']: 36 | cell = blocks_map[child_id] 37 | if cell['BlockType'] == 'CELL': 38 | row_index = cell['RowIndex'] 39 | col_index = cell['ColumnIndex'] 40 | if row_index not in rows: 41 | # create new row 42 | rows[row_index] = {} 43 | # get the text value 44 | rows[row_index][col_index] = get_text(cell, blocks_map) 45 | return rows 46 | 47 | def get_text(result, blocks_map): 48 | text = '' 49 | if 'Relationships' in result: 50 | for relationship in result['Relationships']: 51 | if relationship['Type'] == 'CHILD': 52 | for child_id in relationship['Ids']: 53 | word = blocks_map[child_id] 54 | if word['BlockType'] == 'WORD': 55 | text += word['Text'] + ' ' 56 | if 
word['BlockType'] == 'SELECTION_ELEMENT': 57 | if word['SelectionStatus'] =='SELECTED': 58 | text += 'X ' 59 | return text 60 | 61 | def getJobResults(job_id, next_token = None): 62 | kwargs = {} 63 | if next_token: 64 | kwargs['NextToken'] = next_token 65 | 66 | response = textract_client.get_document_analysis(JobId=job_id, **kwargs) 67 | 68 | return response 69 | 70 | 71 | 72 | def handle(event, context): 73 | blocks_map = {} 74 | table_blocks = [] 75 | statement_name = event['object_name'] 76 | job_id = event['job_id'] 77 | statement_type = event['statement_type'] 78 | 79 | results = getJobResults(job_id) 80 | event['job_status'] = results['JobStatus'] 81 | event['job_update_timestamp'] = time.time() 82 | 83 | if event['job_status'] != "SUCCEEDED": 84 | if event['job_status'] != "IN_PROGRESS": 85 | event['results'] = results 86 | return event 87 | 88 | # Job succeeded - retrieve the results 89 | input_bucket = event['bucket_name'] 90 | input_object = event['object_name'] 91 | 92 | output_bucket = os.getenv('OUTPUT_BUCKET', input_bucket) 93 | output_prefix = os.environ['OUTPUT_PREFIX'] 94 | output_object_base = output_prefix + input_object 95 | 96 | event['output_bucket'] = output_bucket 97 | blocks = [] 98 | 99 | while True: 100 | if 'Blocks' in results: 101 | blocks.extend(results['Blocks']) 102 | for block in results['Blocks']: 103 | blocks_map[block['Id']] = block 104 | if block['BlockType'] == "TABLE": 105 | table_blocks.append(block) 106 | 107 | if 'NextToken' not in results: 108 | break 109 | 110 | print(f"NextToken: {results['NextToken']}") 111 | results = getJobResults(job_id, next_token=results['NextToken']) 112 | 113 | rows = [] 114 | for index, table in enumerate(table_blocks): 115 | table_csv = generate_table_csv(table, blocks_map, index +1) 116 | 117 | print(table_csv) 118 | 119 | if (table_csv.startswith('Date')): 120 | data = io.StringIO(table_csv.strip()) 121 | 122 | for row in csv.DictReader(data, delimiter="|", quoting=csv.QUOTE_NONE): 123 | 
try: 124 | #Use statement type from S3 metadata to determine which row processor to use 125 | if statement_type == 'cba_cc': 126 | rows.append(cba_cc_row_processor.process_row(row, statement_type, statement_name)) 127 | elif statement_type == 'cba_bank': 128 | rows.append(cba_savings_row_processor.process_row(row, statement_type, statement_name)) 129 | except Exception as e: 130 | print(e) 131 | 132 | output_object = f"{output_object_base}.json" 133 | s3_client.put_object( 134 | Bucket=output_bucket, 135 | Key=output_object, 136 | Body=(bytes(json.dumps(rows).encode('UTF-8'))), 137 | ServerSideEncryption='AES256', 138 | ContentType='application/json', 139 | ) 140 | print(f"Blocks file saved to: s3://{output_bucket}/{output_object}") 141 | event['blocks'] = output_object 142 | 143 | return event -------------------------------------------------------------------------------- /cloudFormation.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: The AWS CloudFormation template for the statement processing pipeline 3 | 4 | Parameters: 5 | Stage: 6 | Type: String 7 | Description: The deployment stage 8 | BucketPostfix: 9 | Type: String 10 | Description: A postfix to ensure uniqueness of bucket naming 11 | DeploymentBucket: 12 | Type: String 13 | Description: Name of the bucket containing the lambda deployment package zip 14 | Resources: 15 | StartStateMachineLogGroup: 16 | Type: 'AWS::Logs::LogGroup' 17 | Properties: 18 | LogGroupName: !Sub '/aws/lambda/statement-insight-${Stage}-startStateMachine' 19 | RestAPILogGroup: 20 | Type: 'AWS::Logs::LogGroup' 21 | Properties: 22 | LogGroupName: !Sub '/aws/lambda/statement-insight-${Stage}-restAPI' 23 | StartJobLogGroup: 24 | Type: 'AWS::Logs::LogGroup' 25 | Properties: 26 | LogGroupName: !Sub '/aws/lambda/statement-insight-${Stage}-startJob' 27 | GetResultsLogGroup: 28 | Type: 'AWS::Logs::LogGroup' 29 | Properties: 30 | LogGroupName: !Sub 
'/aws/lambda/statement-insight-${Stage}-getResults' 31 | IamRoleLambdaExecution: 32 | Type: 'AWS::IAM::Role' 33 | Properties: 34 | AssumeRolePolicyDocument: 35 | Version: 2012-10-17 36 | Statement: 37 | - Effect: Allow 38 | Principal: 39 | Service: 40 | - lambda.amazonaws.com 41 | Action: 42 | - 'sts:AssumeRole' 43 | Policies: 44 | - PolicyName: !Join 45 | - '-' 46 | - - statement-insight 47 | - lambda 48 | PolicyDocument: 49 | Version: 2012-10-17 50 | Statement: 51 | - Effect: Allow 52 | Action: 53 | - 'logs:CreateLogStream' 54 | - 'logs:CreateLogGroup' 55 | - 'logs:TagResource' 56 | Resource: 57 | - !Sub >- 58 | arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/statement-insight-${Stage}* 59 | - Effect: Allow 60 | Action: 61 | - 'logs:PutLogEvents' 62 | Resource: 63 | - !Sub >- 64 | arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/statement-insight-${Stage}* 65 | - Effect: Allow 66 | Action: 67 | - 'textract:StartDocumentTextDetection' 68 | - 'textract:StartDocumentAnalysis' 69 | - 'textract:GetDocumentTextDetection' 70 | - 'textract:GetDocumentAnalysis' 71 | Resource: 72 | - '*' 73 | - Effect: Allow 74 | Action: 75 | - 'states:StartExecution' 76 | Resource: 77 | - !Sub 'arn:aws:states:${AWS::Region}:${AWS::AccountId}:stateMachine:ProcessPDFStepFunction-${Stage}' 78 | - Effect: Allow 79 | Action: 80 | - 's3:PutObject' 81 | - 's3:GetObject' 82 | - 's3:ListBucket' 83 | Resource: 84 | - !Sub 'arn:aws:s3:::statement-insight-${BucketPostfix}' 85 | - !Sub 'arn:aws:s3:::statement-insight-${BucketPostfix}/*' 86 | - !Sub 'arn:aws:s3:::statement-insight-${BucketPostfix}-output' 87 | - !Sub 'arn:aws:s3:::statement-insight-${BucketPostfix}-output/*' 88 | Path: / 89 | RoleName: !Join 90 | - '-' 91 | - - statement-insight 92 | - !Sub '${Stage}' 93 | - !Ref 'AWS::Region' 94 | - lambdaRole 95 | StartStateMachineLambdaFunction: 96 | Type: 'AWS::Lambda::Function' 97 | Properties: 98 | Code: 99 | S3Bucket: !Sub 
'${DeploymentBucket}' 100 | S3Key: >- 101 | statement-insight.zip 102 | Handler: src/inputHandler.handle 103 | Runtime: python3.10 104 | FunctionName: !Sub 'statement-insight-${Stage}-startStateMachine' 105 | MemorySize: 1024 106 | Timeout: 600 107 | Environment: 108 | Variables: 109 | statemachine_arn: !Ref ProcessPDFStepFunction 110 | OUTPUT_BUCKET: !Sub 'statement-insight-${BucketPostfix}-output' 111 | OUTPUT_PREFIX: output 112 | Role: !GetAtt 113 | - IamRoleLambdaExecution 114 | - Arn 115 | DependsOn: 116 | - StartStateMachineLogGroup 117 | RestAPILambdaFunction: 118 | Type: 'AWS::Lambda::Function' 119 | Properties: 120 | Code: 121 | S3Bucket: !Sub '${DeploymentBucket}' 122 | S3Key: >- 123 | statement-insight.zip 124 | Handler: src/apiRequestHandler.handle 125 | Runtime: python3.10 126 | FunctionName: !Sub 'statement-insight-${Stage}-restAPI' 127 | MemorySize: 1024 128 | Timeout: 6 129 | Environment: 130 | Variables: 131 | DATA_BUCKET: !Sub 'statement-insight-${BucketPostfix}-output' 132 | Role: !GetAtt 133 | - IamRoleLambdaExecution 134 | - Arn 135 | DependsOn: 136 | - RestAPILogGroup 137 | StartJobLambdaFunction: 138 | Type: 'AWS::Lambda::Function' 139 | Properties: 140 | Code: 141 | S3Bucket: !Sub '${DeploymentBucket}' 142 | S3Key: >- 143 | statement-insight.zip 144 | Handler: src/startJob.handle 145 | Runtime: python3.10 146 | FunctionName: !Sub statement-insight-${Stage}-startJob 147 | MemorySize: 1024 148 | Timeout: 600 149 | Role: !GetAtt 150 | - IamRoleLambdaExecution 151 | - Arn 152 | DependsOn: 153 | - StartJobLogGroup 154 | GetResultsLambdaFunction: 155 | Type: 'AWS::Lambda::Function' 156 | Properties: 157 | Code: 158 | S3Bucket: !Sub '${DeploymentBucket}' 159 | S3Key: >- 160 | statement-insight.zip 161 | Handler: src/getResults.handle 162 | Runtime: python3.10 163 | FunctionName: !Sub 'statement-insight-${Stage}-getResults' 164 | MemorySize: 1024 165 | Timeout: 600 166 | Environment: 167 | Variables: 168 | OUTPUT_BUCKET: !Sub 
'statement-insight-${BucketPostfix}-output' 169 | OUTPUT_PREFIX: output 170 | Role: !GetAtt 171 | - IamRoleLambdaExecution 172 | - Arn 173 | DependsOn: 174 | - GetResultsLogGroup 175 | ProcessPDFStepFunctionRole: 176 | Type: 'AWS::IAM::Role' 177 | Properties: 178 | AssumeRolePolicyDocument: 179 | Version: 2012-10-17 180 | Statement: 181 | - Effect: Allow 182 | Principal: 183 | Service: !Sub 'states.${AWS::Region}.amazonaws.com' 184 | Action: 'sts:AssumeRole' 185 | Policies: 186 | - PolicyName: !Sub '${Stage}-statement-insight-statemachine' 187 | PolicyDocument: 188 | Version: 2012-10-17 189 | Statement: 190 | - Effect: Allow 191 | Action: 192 | - 'lambda:InvokeFunction' 193 | Resource: 194 | - !Sub >- 195 | arn:aws:lambda:${AWS::Region}:${AWS::AccountId}:function:statement-insight-${Stage}-startJob 196 | - !Sub 197 | - '${functionArn}:*' 198 | - functionArn: !Sub >- 199 | arn:aws:lambda:${AWS::Region}:${AWS::AccountId}:function:statement-insight-${Stage}-startJob 200 | - !Sub >- 201 | arn:aws:lambda:${AWS::Region}:${AWS::AccountId}:function:statement-insight-${Stage}-getResults 202 | - !Sub 203 | - '${functionArn}:*' 204 | - functionArn: !Sub >- 205 | arn:aws:lambda:${AWS::Region}:${AWS::AccountId}:function:statement-insight-${Stage}-getResults 206 | ProcessPDFStepFunction: 207 | Type: 'AWS::StepFunctions::StateMachine' 208 | Properties: 209 | DefinitionString: !Sub 210 | - |- 211 | { 212 | "StartAt": "StartJob", 213 | "States": { 214 | "StartJob": { 215 | "Type": "Task", 216 | "Resource": "${startJobLambdaArn}", 217 | "Next": "Wait" 218 | }, 219 | "Wait": { 220 | "Type": "Wait", 221 | "Seconds": 5, 222 | "Next": "GetResults" 223 | }, 224 | "GetResults": { 225 | "Type": "Task", 226 | "Resource": "${getResultsLambdaArn}", 227 | "Next": "IsJobDone" 228 | }, 229 | "IsJobDone": { 230 | "Type": "Choice", 231 | "Choices": [ 232 | { 233 | "Variable": "$.job_status", 234 | "StringEquals": "IN_PROGRESS", 235 | "Next": "Wait" 236 | }, 237 | { 238 | "Variable": 
"$.job_status", 239 | "StringEquals": "SUCCEEDED", 240 | "Next": "Success" 241 | } 242 | ] 243 | }, 244 | "Success": { 245 | "Type": "Succeed" 246 | } 247 | } 248 | } 249 | - startJobLambdaArn: !Sub >- 250 | arn:aws:lambda:${AWS::Region}:${AWS::AccountId}:function:statement-insight-${Stage}-startJob 251 | getResultsLambdaArn: !Sub >- 252 | arn:aws:lambda:${AWS::Region}:${AWS::AccountId}:function:statement-insight-${Stage}-getResults 253 | RoleArn: !GetAtt 254 | - ProcessPDFStepFunctionRole 255 | - Arn 256 | StateMachineName: !Sub "ProcessPDFStepFunction-${Stage}" 257 | DependsOn: 258 | - ProcessPDFStepFunctionRole 259 | S3BucketStatementinsightOutput: 260 | Type: 'AWS::S3::Bucket' 261 | Properties: 262 | BucketName: !Sub "statement-insight-${BucketPostfix}-output" 263 | PublicAccessBlockConfiguration: 264 | BlockPublicAcls: true 265 | BlockPublicPolicy: true 266 | IgnorePublicAcls: true 267 | RestrictPublicBuckets: true 268 | S3BucketStatementinsight: 269 | Type: 'AWS::S3::Bucket' 270 | Properties: 271 | BucketName: !Sub "statement-insight-${BucketPostfix}" 272 | PublicAccessBlockConfiguration: 273 | BlockPublicAcls: true 274 | BlockPublicPolicy: true 275 | IgnorePublicAcls: true 276 | RestrictPublicBuckets: true 277 | NotificationConfiguration: 278 | LambdaConfigurations: 279 | - Event: 's3:ObjectCreated:*' 280 | Function: !GetAtt 281 | - StartStateMachineLambdaFunction 282 | - Arn 283 | DependsOn: 284 | - StartStateMachineLambdaPermissionStatementinsight 285 | StartStateMachineLambdaPermissionStatementinsight: 286 | Type: 'AWS::Lambda::Permission' 287 | Properties: 288 | FunctionName: !GetAtt 289 | - StartStateMachineLambdaFunction 290 | - Arn 291 | Action: 'lambda:InvokeFunction' 292 | Principal: s3.amazonaws.com 293 | SourceArn: !Join 294 | - '' 295 | - - 'arn:' 296 | - !Ref 'AWS::Partition' 297 | - !Sub ':s3:::statement-insight-${BucketPostfix}' 298 | SourceAccount: !Ref 'AWS::AccountId' 299 | ApiGatewayRestApi: 300 | Type: 'AWS::ApiGateway::RestApi' 301 | 
Properties: 302 | Name: !Sub "${Stage}-statement-insight" 303 | EndpointConfiguration: 304 | Types: 305 | - EDGE 306 | Policy: '' 307 | ApiGatewayMethodOptions: 308 | Type: 'AWS::ApiGateway::Method' 309 | Properties: 310 | AuthorizationType: NONE 311 | HttpMethod: OPTIONS 312 | MethodResponses: 313 | - StatusCode: '200' 314 | ResponseParameters: 315 | method.response.header.Access-Control-Allow-Origin: true 316 | method.response.header.Access-Control-Allow-Headers: true 317 | method.response.header.Access-Control-Allow-Methods: true 318 | ResponseModels: {} 319 | RequestParameters: {} 320 | Integration: 321 | Type: MOCK 322 | RequestTemplates: 323 | application/json: '{statusCode:200}' 324 | ContentHandling: CONVERT_TO_TEXT 325 | IntegrationResponses: 326 | - StatusCode: '200' 327 | ResponseParameters: 328 | method.response.header.Access-Control-Allow-Origin: '''*''' 329 | method.response.header.Access-Control-Allow-Headers: >- 330 | 'Content-Type,X-Amz-Date,Authorization,X-Api-Key,X-Amz-Security-Token,X-Amz-User-Agent,X-Amzn-Trace-Id' 331 | method.response.header.Access-Control-Allow-Methods: '''OPTIONS,DELETE,GET,HEAD,PATCH,POST,PUT''' 332 | ResponseTemplates: 333 | application/json: '' 334 | ResourceId: !GetAtt 335 | - ApiGatewayRestApi 336 | - RootResourceId 337 | RestApiId: !Ref ApiGatewayRestApi 338 | ApiGatewayMethodAny: 339 | Type: 'AWS::ApiGateway::Method' 340 | Properties: 341 | HttpMethod: ANY 342 | RequestParameters: {} 343 | ResourceId: !GetAtt 344 | - ApiGatewayRestApi 345 | - RootResourceId 346 | RestApiId: !Ref ApiGatewayRestApi 347 | ApiKeyRequired: true 348 | AuthorizationType: NONE 349 | Integration: 350 | IntegrationHttpMethod: POST 351 | Type: AWS_PROXY 352 | Uri: !Join 353 | - '' 354 | - - 'arn:' 355 | - !Ref 'AWS::Partition' 356 | - ':apigateway:' 357 | - !Ref 'AWS::Region' 358 | - ':lambda:path/2015-03-31/functions/' 359 | - !GetAtt 360 | - RestAPILambdaFunction 361 | - Arn 362 | - /invocations 363 | MethodResponses: [] 364 | DependsOn: 365 
| - RestAPILambdaPermissionApiGateway 366 | ApiGatewayDeployment: 367 | Type: 'AWS::ApiGateway::Deployment' 368 | Properties: 369 | RestApiId: !Ref ApiGatewayRestApi 370 | StageName: !Sub "${Stage}" 371 | DependsOn: 372 | - ApiGatewayMethodOptions 373 | - ApiGatewayMethodAny 374 | ApiGatewayApiKey: 375 | Type: 'AWS::ApiGateway::ApiKey' 376 | Properties: 377 | Enabled: true 378 | Name: !Sub "statement-insight-key-${Stage}" 379 | StageKeys: 380 | - RestApiId: !Ref ApiGatewayRestApi 381 | StageName: !Sub "${Stage}" 382 | DependsOn: ApiGatewayDeployment 383 | ApiGatewayUsagePlan: 384 | Type: 'AWS::ApiGateway::UsagePlan' 385 | DependsOn: ApiGatewayDeployment 386 | Properties: 387 | ApiStages: 388 | - ApiId: !Ref ApiGatewayRestApi 389 | Stage: !Sub "${Stage}" 390 | Description: !Sub "Usage plan for statement-insight ${Stage} stage" 391 | UsagePlanName: !Sub "statement-insight-${Stage}" 392 | ApiGatewayUsagePlanKey: 393 | Type: 'AWS::ApiGateway::UsagePlanKey' 394 | Properties: 395 | KeyId: !Ref ApiGatewayApiKey 396 | KeyType: API_KEY 397 | UsagePlanId: !Ref ApiGatewayUsagePlan 398 | RestAPILambdaPermissionApiGateway: 399 | Type: 'AWS::Lambda::Permission' 400 | Properties: 401 | FunctionName: !GetAtt 402 | - RestAPILambdaFunction 403 | - Arn 404 | Action: 'lambda:InvokeFunction' 405 | Principal: apigateway.amazonaws.com 406 | SourceArn: !Join 407 | - '' 408 | - - 'arn:' 409 | - !Ref 'AWS::Partition' 410 | - ':execute-api:' 411 | - !Ref 'AWS::Region' 412 | - ':' 413 | - !Ref 'AWS::AccountId' 414 | - ':' 415 | - !Ref ApiGatewayRestApi 416 | - /*/* 417 | Outputs: 418 | LandingBucket: 419 | Description: The landing bucket to upload data to to be processed 420 | Value: !Sub "statement-insight-${BucketPostfix}" 421 | CLIApiKey: 422 | Description: CLI command to get the api key value. 
423 | Value: !Sub "aws apigateway get-api-key --api-key ${ApiGatewayApiKey.APIKeyId} --include-value --query \"value\" --output text" 424 | ServiceEndpoint: 425 | Description: URL of the service endpoint 426 | Value: !Join 427 | - '' 428 | - - 'https://' 429 | - !Ref ApiGatewayRestApi 430 | - .execute-api. 431 | - !Ref 'AWS::Region' 432 | - . 433 | - !Ref 'AWS::URLSuffix' 434 | - !Sub "/${Stage}" 435 | 436 | 437 | --------------------------------------------------------------------------------