├── Statement20230428.pdf ├── pipeline_architecture.png ├── CODE_OF_CONDUCT.md ├── classification.csv ├── src ├── cbaSavingsRowProcessor.py ├── rowProcessor.py ├── startJob.py ├── inputHandler.py ├── cbaCCRowProcessor.py ├── apiRequestHandler.py └── getResults.py ├── LICENSE ├── README.md ├── CONTRIBUTING.md ├── frontend └── index.html └── cloudFormation.yml /Statement20230428.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/textract-bank-statement-processor/HEAD/Statement20230428.pdf -------------------------------------------------------------------------------- /pipeline_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/textract-bank-statement-processor/HEAD/pipeline_architecture.png -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
-------------------------------------------------------------------------------- /classification.csv: -------------------------------------------------------------------------------- 1 | key,type,subtype 2 | paypal,Online,Paypal 3 | subway,Entertainment,Eating out 4 | noodle & sushi,Entertainment,Eating out 5 | vic roads,Transport,Car 6 | citylink,Transport,Toll 7 | pizza,Entertainment,Eating out 8 | wage,Employment,Acme Corporation 9 | bp,Transport,Petrol 10 | red rooster,Entertainment,Eating out 11 | 7-eleven,Groceries,7-Eleven 12 | belmont wines,Entertainment,Liquor 13 | beatie service,Transport,Car 14 | kfc,Entertainment,Eating out 15 | telstra,Utilities,Mobile 16 | wdl,Cash,ATM 17 | safeway,Groceries,Safeway 18 | picture box,Home,Artwork 19 | salvos,Clothing,Salvos 20 | cignall,Entertainment,Tobacco 21 | sw petrol,Transport,Petrol 22 | service centre,Transport,Car 23 | coles,Groceries,Coles 24 | kmart,Home,KMart 25 | bunnings,Home,Bunnings 26 | fee,Fees,Transaction fee 27 | transfer,Transfer,Internal Transfer 28 | -------------------------------------------------------------------------------- /src/cbaSavingsRowProcessor.py: -------------------------------------------------------------------------------- 1 | """Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
class CBASavingsRowProcessor(TableRowProcessor):
    """Row processor for CBA savings statement tables.

    Rewrites a row that carries 'Date', 'Credit' and 'Debit' columns (and
    optionally 'Balance') into the standard transaction format.
    """

    def process_row(self, row, statement_type, statement_name):
        """Normalise one savings-statement row.

        Args:
            row: dict mapping column header to cell text. It must contain
                'Date', 'Credit' and 'Debit'; it is modified in place.
            statement_type: statement type identifier, forwarded to get_date.
            statement_name: statement file name, forwarded to get_date.

        Returns:
            A new dict holding the row's entries whose keys are non-empty.
        """
        print("Processing a CBA savings row")
        print(row)
        row['Date'] = super().get_date(statement_type, statement_name, row['Date'])

        # A row is either a credit or a debit: the populated column is
        # converted to a float and the opposite column is zeroed.
        if row['Credit']:
            row['Credit'] = super().convert_str_to_float(row['Credit'])
            row['Debit'] = 0
        elif row['Debit']:
            row['Debit'] = super().convert_str_to_float(row['Debit'])
            row['Credit'] = 0

        # The running balance is not part of the output; tolerate tables
        # that have no such column instead of raising KeyError.
        row.pop('Balance', None)

        # Drop cells that sit under an empty column header.
        row_no_blank_keys = {k: v for k, v in row.items() if k}
        print(row_no_blank_keys)
        return row_no_blank_keys
class TableRowProcessor(ABC):
    """Base class for turning Textract table rows into standard transactions.

    Subclasses implement process_row; the helpers here complete partial
    dates with a year and parse amount strings.
    """

    # For 'cba_bank' statements, rows dated in these months are assigned the
    # year before the one found in the statement file name.
    _MONTHS_IN_PREVIOUS_YEAR = ('Oct', 'Nov', 'Dec')

    @abstractmethod
    def process_row(self, row, statement_type, statement_name):
        """Processes a row in a table extracted
        by Textract, into an object in the standard
        transaction format
        """

    @staticmethod
    def _statement_year(filename):
        """Return the year stored at characters 9-12 of the statement name.

        The name as given is tried first; if that slice is not a number the
        part after the last '/' is tried, so a name carrying a key prefix
        such as 'landing/Statement20230428.pdf' still resolves.

        Raises:
            ValueError: if neither candidate holds a number at that position.
        """
        for candidate in (filename, filename.rsplit('/', 1)[-1]):
            try:
                return int(candidate[9:13])
            except ValueError:
                continue
        raise ValueError(
            f"Cannot determine statement year from file name: {filename!r}"
        )

    def get_date(self, statement_type, filename, date):
        """Append a year to a statement date that has none.

        Args:
            statement_type: 'cba_bank' or 'cba_cc'; any other value returns
                the date unchanged.
            filename: statement file name holding the year at characters
                9-12 (e.g. 'Statement20230428.pdf').
            date: date text from the row; for 'cba_bank' the month
                abbreviation is read from characters 3-5.

        Returns:
            'date year' for the two CBA statement types, otherwise date.

        Raises:
            ValueError: if a year is needed but cannot be read from filename.
        """
        if statement_type == 'cba_bank':
            year = self._statement_year(filename)
            if date[3:6] in self._MONTHS_IN_PREVIOUS_YEAR:
                year -= 1
            return date + ' ' + str(year)
        if statement_type == 'cba_cc':
            return date + ' ' + str(self._statement_year(filename))
        return date

    def convert_str_to_float(self, value):
        """Parse an amount string, ignoring everything but digits and dots.

        Returns:
            The parsed float, or 0.0 when value is not a string or holds no
            parsable number (the error is printed).
        """
        try:
            return float(sub(r'[^\d.]', '', value))
        except (TypeError, ValueError) as e:
            # Only the failures parsing can produce are swallowed; anything
            # else is a programming error and should surface.
            print(e)
            return 0.0
def startJob(bucket_name, object_name):
    """Start an asynchronous Textract document analysis for tables.

    Args:
        bucket_name: S3 bucket holding the document.
        object_name: S3 key of the document.

    Returns:
        The JobId from the start_document_analysis response.
    """
    response = textract_client.start_document_analysis(
        DocumentLocation={
            'S3Object': {
                'Bucket': bucket_name,
                'Name': object_name,
            }
        },
        FeatureTypes=['TABLES'],
    )

    return response["JobId"]


def handle(event, context):
    """Lambda entry point: start a Textract job for one S3 event record.

    Args:
        event: a single S3 event record; its 'eventSource' must be 'aws:s3'
            and it must carry 's3.bucket.name' and 's3.object.key'.
        context: Lambda context (unused).

    Returns:
        dict with bucket_name, object_name, job_id, job_start_timestamp and
        the statement_type read from the object's S3 metadata.

    Raises:
        ValueError: if the event does not come from S3.
    """
    from urllib.parse import unquote_plus

    if event['eventSource'] != 'aws:s3':
        print("ERROR: Unexpected event type")
        print(json.dumps(event))
        raise ValueError("ERROR: Unexpected event type")

    bucket_name = event['s3']['bucket']['name']
    # S3 event notifications URL-encode the object key (a space arrives as
    # '+'), so decode it before using it in API calls.
    key = unquote_plus(event['s3']['object']['key'])

    response = s3_client.head_object(Bucket=bucket_name, Key=key)

    print('Response: {}'.format(response))

    print(f"StartJob: s3://{bucket_name}/{key}")
    statement_type = response['Metadata']['statement_type']

    job_id = startJob(bucket_name, key)
    print(f"JobId: {job_id}")

    return {
        "bucket_name": bucket_name,
        "object_name": key,
        "job_id": job_id,
        "job_start_timestamp": time.time(),
        "statement_type": statement_type,
    }


if __name__ == "__main__":
    # Local run: the first argument is a JSON file holding one S3 event record.
    import sys
    with open(sys.argv[1], "rt") as f:
        event = json.load(f)
    # The entry point is `handle`; the previous call to `handler` raised
    # NameError.
    ret = handle(event, {})
    print(json.dumps(ret, indent=2))
PDF bank statements that have been scanned, or downloaded from an online banking application, are uploaded to the Landing bucket in S3 10 | 2. The landing of the file in the S3 bucket triggers a Lambda function that starts the step function 11 | 3. The Lambda function starts the step function execution 12 | 4. The first step in the step function calls a Lambda to start a new Textract document analysis job 13 | 5. A new document analysis job is invoked with the uploaded PDF 14 | 6. The step function periodically calls a Lambda to get the job results 15 | 7. The Lambda checks with Textract, using the job identifier, whether the analysis job is complete 16 | 8. When the analysis job is complete the Lambda takes the output of the job, extracts the tabular data, and processes the transaction records into a JSON file which it then saves in the Processed bucket in S3. 17 | 9. An API Lambda queries the JSON files stored in the S3 bucket in response to a request from the API gateway. An additional classification step at this point classifies each transaction into a type and sub-type based on user-configurable classification rules. 18 | 10. The API Gateway serves a RESTful API that a web frontend consumes to visualise transaction data 19 | 11. Finally, the visualisation output of multiple years' worth of classified transaction data is visualised within a Sankey diagram as shown below, allowing users to see at a glance income vs expenditure. 20 | 12. ML models can be trained and run against historical transaction data 21 | -------------------------------------------------------------------------------- /src/inputHandler.py: -------------------------------------------------------------------------------- 1 | """Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
def convert_str_to_float(value):
    """Parse an amount string, ignoring everything but digits and dots.

    Returns:
        The parsed float, or 0.0 when value is not a string or holds no
        parsable number (the error is printed).
    """
    try:
        return float(sub(r'[^\d.]', '', value))
    except (TypeError, ValueError) as e:
        print(e)
        return 0.0


def build_output_key(output_prefix, key):
    """Return the S3 key for the JSON produced from the statement at key.

    The prefix and the source key are joined with a single '/' and '.json'
    is appended; an empty prefix yields just 'key.json'.
    """
    parts = [part for part in (output_prefix.strip('/'), key.lstrip('/')) if part]
    return '/'.join(parts) + '.json'


def handle(event, context):
    """Lambda entry point for an S3 upload notification.

    Reads the 'statement_type' metadata of the first record's object.
    'ing' statements are converted directly from CSV; 'cba_cc' and
    'cba_bank' statements start the state machine named by the
    'statemachine_arn' environment variable, with the record as input.
    """
    import uuid
    from urllib.parse import unquote_plus

    print(event)
    record = event['Records'][0]
    bucket_name = record['s3']['bucket']['name']
    # S3 event notifications URL-encode the object key (a space arrives as
    # '+'), so decode it before using it in API calls.
    key = unquote_plus(record['s3']['object']['key'])
    response = s3_client.head_object(Bucket=bucket_name, Key=key)
    print('Response: {}'.format(response))
    statement_type = response['Metadata']['statement_type']
    if statement_type == 'ing':
        handle_csv_statement(bucket_name, key)
    elif statement_type in ('cba_cc', 'cba_bank'):
        stateMachineARN = os.environ['statemachine_arn']
        response = client.start_execution(
            stateMachineArn=stateMachineARN,
            # A UUID cannot collide the way a small random integer can;
            # starting an execution under a name already in use is rejected.
            name='test-sf' + uuid.uuid4().hex,
            input=json.dumps(record)
        )
    else:
        print(f"Unsupported statement type: {statement_type}")


def handle_csv_statement(bucket_name, key):
    """Convert a CSV statement in S3 to JSON and store it in the output bucket.

    Every row's 'Debit' and 'Credit' cells are converted to floats. The
    result is written to the bucket named by OUTPUT_BUCKET under the
    OUTPUT_PREFIX prefix, with '.json' appended to the source key.
    """
    print('Handle CSV statement')
    response = s3_client.get_object(Bucket=bucket_name, Key=key)
    contents = response['Body'].read().decode('utf-8').splitlines()

    result = list(csv.DictReader(contents))

    for transaction in result:
        transaction['Debit'] = convert_str_to_float(transaction['Debit'].lstrip('-'))
        transaction['Credit'] = convert_str_to_float(transaction['Credit'])

    output_bucket = os.environ['OUTPUT_BUCKET']
    output_prefix = os.environ['OUTPUT_PREFIX']

    # os.path.abspath(key) used to turn the key into an absolute local path,
    # which made os.path.join drop the prefix and embedded the function's
    # working directory in the object name.
    output_object = build_output_key(output_prefix, key)
    s3_client.put_object(
        Bucket=output_bucket,
        Key=output_object,
        Body=json.dumps(result).encode('UTF-8'),
        ServerSideEncryption='AES256',
        ContentType='application/json',
    )
    print(f"File saved to: s3://{output_bucket}/{output_object}")
class CBACCRowProcessor(TableRowProcessor):
    """Row processor for CBA credit-card statement tables.

    Handles the header variants seen in extracted tables: a description
    column ('Transaction Details' / 'Transaction details') next to a
    separate 'Amount (A$)' column, or a single merged column holding both.
    """

    _AMOUNT_COLUMN = 'Amount (A$)'
    # Description column spellings used when the amount has its own column.
    _DETAIL_COLUMNS = ('Transaction Details', 'Transaction details')
    # Column spellings used when description and amount share one cell.
    _MERGED_COLUMNS = (
        'Transaction details Amount (A$)',
        'Transaction Details Amount (A$)',
    )

    def _set_amounts(self, row, amount):
        """Store amount as 'Credit' if it ends with '-', else as 'Debit'.

        The opposite column is set to 0.
        """
        value = self.convert_str_to_float(amount)
        if amount.endswith('-'):
            row['Credit'] = value
            row['Debit'] = 0
        else:
            row['Debit'] = value
            row['Credit'] = 0

    def process_row(self, row, statement_type, statement_name):
        """Normalise one credit-card statement row.

        Args:
            row: dict mapping column header to cell text; must contain
                'Date'. It is modified in place.
            statement_type: statement type identifier, forwarded to get_date.
            statement_name: statement file name, forwarded to get_date.

        Returns:
            A new dict holding the row's entries whose keys are non-empty,
            with the description under 'Transaction' and the amount under
            'Debit' or 'Credit'.
        """
        print(row)
        print("Processing a CBA CC row")
        row['Date'] = super().get_date(statement_type, statement_name, row['Date'])

        for column in self._DETAIL_COLUMNS:
            if column in row:
                row['Transaction'] = row.pop(column)
                if self._AMOUNT_COLUMN in row:
                    self._set_amounts(row, row.pop(self._AMOUNT_COLUMN))

        for column in self._MERGED_COLUMNS:
            if column in row:
                merged = row.pop(column)
                # The amount is the last whitespace-separated token; an
                # empty cell yields no tokens and is treated as 0.0.
                tokens = merged.split()
                # A trailing '-' marks a credit here too, matching the
                # separate-column case above.
                self._set_amounts(row, tokens[-1] if tokens else '')
                row['Transaction'] = merged

        # Drop cells that sit under an empty column header.
        row_no_blank_keys = {k: v for k, v in row.items() if k}
        print(row_no_blank_keys)
        return row_no_blank_keys
You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 
# Matches every run of characters that is neither a digit nor a dot.
_NON_DECIMAL = re.compile(r'[^\d.]+')


def add_to_summary(summary, type_name, sub_type, amount):
    """Add amount to summary[type_name][sub_type], creating levels as needed.

    The running total is rounded to two decimals after each addition so
    that float noise does not accumulate in the reported totals.
    """
    sub_totals = summary.setdefault(type_name, {})
    sub_totals[sub_type] = round(sub_totals.get(sub_type, 0.0) + amount, 2)


def handle(event, context):
    """Lambda entry point: return all stored transactions plus a summary.

    Reads every '.json' object in the data bucket, tags each transaction
    with the 'Type' and 'Subtype' chosen by classify_transaction, and sums
    the 'Credit' and 'Debit' amounts per type and subtype.

    Returns:
        An API response dict with statusCode 200, a permissive CORS header
        and a JSON body holding 'summary' and 'transactions'.
    """
    json_contents = []
    # Paginate: a single list_objects_v2 call returns at most 1000 keys, and
    # its response has no 'Contents' entry when nothing matches.
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        for obj in page.get('Contents', []):
            key = obj['Key']
            print(key)
            if not key.endswith('.json'):  # Only process JSON files
                continue
            response = s3.get_object(Bucket=bucket_name, Key=key)
            contents = response['Body'].read().decode('utf-8')
            print(contents)
            json_contents.extend(json.loads(contents))

    type_total_dict = {}
    for transaction in json_contents:
        # Classify every transaction. One without a description becomes
        # Unknown/Unclassified instead of reusing the previous transaction's
        # classification (or failing on the first one).
        classification = classify_transaction(transaction.get('Transaction', ''))
        type_name = classification['type']
        sub_type = classification['subtype']
        transaction['Type'] = type_name
        transaction['Subtype'] = sub_type

        try:
            for field in ('Credit', 'Debit'):
                if field in transaction and transaction[field] != "":
                    add_to_summary(
                        type_total_dict, type_name, sub_type,
                        get_amount(transaction[field]),
                    )
        except ValueError as ve:
            # An amount that holds no number is reported and skipped.
            print(ve)

    print(type_total_dict)
    print(json.dumps(type_total_dict, indent = 4))

    result = {'summary': type_total_dict, 'transactions': json_contents}

    return {
        'statusCode': 200,
        'headers': {
            'Access-Control-Allow-Origin': '*'
        },
        'body': json.dumps(result)
    }


def get_amount(amount_str):
    """Return the amount as a float rounded to two decimals.

    Everything except digits and dots is removed first, so currency
    symbols, thousands separators and signs are ignored.

    Raises:
        ValueError: if no parsable number remains.
    """
    return float("{:.2f}".format(float(_NON_DECIMAL.sub('', str(amount_str)))))


def classify_transaction(description):
    """Return the first classification whose key occurs in description.

    The match is a case-insensitive substring test against the lower-cased
    description. When nothing matches (or description is empty or None) the
    Unknown/Unclassified fallback is returned.
    """
    lowered = (description or '').lower()
    for classification in classifications:
        if classification['key'] in lowered:
            return classification
    return {'key': '', 'type': 'Unknown', 'subtype': 'Unclassified'}
4 || Date | 25 |Description | 26 |Type | 27 |Subtype | 28 |Debit | 29 |Credit | 30 |
|---|