├── .DS_Store ├── .eslintrc.json ├── .flake8 ├── .gitattributes ├── .github ├── pull_request_template.md └── workflows │ ├── build.yml │ ├── pull-request-lint.yml │ ├── release.yml │ └── upgrade-main.yml ├── .gitignore ├── .mergify.yml ├── .npmignore ├── .projen ├── deps.json ├── files.json └── tasks.json ├── .projenrc.js ├── .vscode └── settings.json ├── API.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Config ├── LICENSE ├── README.md ├── THIRD-PARTY-LICENSES.txt ├── amazon-textract-idp-cdk-stack-samples.code-workspace ├── images ├── Workmail_Lambda.png ├── email_rule_1.png ├── email_rule_2.png └── email_rule_3.png ├── lambda ├── .DS_Store ├── a2i_postprocess │ ├── Dockerfile │ ├── app │ │ ├── main.py │ │ ├── requirements.txt │ │ └── test.sh │ ├── env.json │ ├── events │ │ ├── event.json │ │ └── simple-event.json │ ├── template.yaml │ └── test_sam_local.sh ├── a2i_preprocess │ ├── Dockerfile │ ├── app │ │ ├── main.py │ │ └── requirements.txt │ ├── env.json │ ├── events │ │ ├── event.json │ │ └── simple-event.json │ ├── template.yaml │ └── test_sam_local.sh ├── async_to_json │ ├── Dockerfile │ ├── app │ │ └── main.py │ ├── env.json │ ├── events │ │ └── event.json │ ├── template.yaml │ └── test_sam_local.sh ├── cfn_custom_configurator_prefill │ ├── Dockerfile │ ├── app │ │ ├── default_config.csv │ │ ├── entry.sh │ │ ├── generate_csv.py │ │ ├── generated_manifest │ │ ├── main.py │ │ └── requirements.txt │ ├── env.json │ ├── events │ │ ├── event.json │ │ └── simple-event.json │ ├── template.yaml │ └── test_sam_local.sh ├── classification_spacy │ ├── Dockerfile │ ├── app │ │ ├── entry.sh │ │ ├── requirements.txt │ │ └── sync_main.py │ ├── en_textcat_demo-0.0.0.tar.gz │ ├── env.json │ ├── events │ │ ├── event.json │ │ └── simple-event.json │ ├── template.yaml │ └── test_sam_local.sh ├── classification_spacy_image │ ├── Dockerfile │ ├── env.json │ ├── events │ │ ├── event.json │ │ └── simple-event.json │ ├── template.yaml │ └── test_sam_local.sh ├── comprehend_sync 
│ ├── Dockerfile │ ├── app │ │ ├── entry.sh │ │ ├── requirements.txt │ │ └── sync_main.py │ ├── env.json │ ├── events │ │ ├── event.json │ │ └── simple-event.json │ ├── template.yaml │ └── test_sam_local.sh ├── configurator │ ├── Dockerfile │ ├── app │ │ └── main.py │ ├── env.json │ ├── events │ │ └── event.json │ ├── template.yaml │ └── test_sam_local.sh ├── csv_to_aurora │ ├── Dockerfile │ ├── app │ │ ├── entry.sh │ │ ├── main.py │ │ └── requirements.txt │ ├── env.json │ ├── events │ │ └── event.json │ ├── template.yaml │ └── test_sam_local.sh ├── decider │ ├── Dockerfile │ ├── app │ │ ├── decider_main.py │ │ ├── entry.sh │ │ └── requirements.txt │ ├── env.json │ ├── events │ │ └── event.json │ ├── template.yaml │ ├── test_sam_local.sh │ └── tests │ │ ├── data │ │ ├── sample_manifest.json │ │ └── simple_feature_manifest.json │ │ └── test_decider.py ├── document_splitter │ ├── Dockerfile │ ├── app │ │ ├── documentsplitter │ │ │ ├── __init__.py │ │ │ └── documentsplitter.py │ │ └── main.py │ ├── env.json │ ├── events │ │ └── event.json │ ├── template.yaml │ ├── test_sam_local.sh │ └── tests │ │ ├── data │ │ └── sample_210_page_pdf.json │ │ └── test_document_splitter.py ├── executions_queue_worker │ ├── Dockerfile │ └── app │ │ └── main.py ├── executions_start_throttle │ ├── Dockerfile │ └── app │ │ └── start_execution.py ├── executions_throttle_counter_reset │ ├── Dockerfile │ └── app │ │ └── main.py ├── generatecsv │ ├── Dockerfile │ ├── app │ │ └── main.py │ ├── envs │ │ ├── env-bak.json │ │ ├── env-meta-data-lending.json │ │ ├── env-no-meta-lending.json │ │ ├── env.json │ │ └── linearizer.json │ ├── events │ │ ├── event-linearizer.json │ │ ├── event-meta-lending.json │ │ ├── event.json │ │ └── generate_csv_tables.json │ ├── template.yaml │ └── test_sam_local.sh ├── pdf_mapper_for_fhir │ ├── Dockerfile │ ├── app │ │ ├── __init__.py │ │ ├── entry.sh │ │ ├── fhir_doc_assembler.py │ │ ├── main.py │ │ ├── requirements.txt │ │ ├── send_to_healthlake.py │ │ └── trp.py 
│ ├── env.json │ ├── events │ │ └── event.json │ ├── template.yaml │ ├── test_sam_local.sh │ └── tests │ │ ├── data │ │ ├── sample_manifest.json │ │ └── simple_feature_manifest.json │ │ └── test_pdf_mapper_for_fhir.py ├── put_on_sqs │ ├── Dockerfile │ ├── app │ │ ├── entry.sh │ │ ├── main.py │ │ └── requirements.txt │ ├── env.json │ ├── events │ │ ├── event.json │ │ └── simple-event.json │ ├── template.yaml │ ├── test_sam_local.sh │ └── tests │ │ └── data │ │ ├── sample_manifest.json │ │ └── simple_feature_manifest.json ├── rds_serverless_init │ ├── Dockerfile │ ├── app │ │ ├── entry.sh │ │ ├── main.py │ │ └── requirements.txt │ ├── env.json │ ├── events │ │ ├── event.json │ │ └── simple-event.json │ ├── template.yaml │ └── test_sam_local.sh ├── searchablePDF │ ├── Dockerfile │ ├── app │ │ ├── entry.sh │ │ └── main.py │ ├── events │ │ └── event.json │ ├── template.yaml │ └── test_sam_local.sh ├── textract_async │ ├── Dockerfile │ ├── app │ │ ├── entry.sh │ │ ├── main.py │ │ └── requirements.txt │ ├── env.json │ ├── events │ │ └── event.json │ ├── template.yaml │ └── test_sam_local.sh ├── textract_async_sns_listener │ ├── Dockerfile │ └── app │ │ ├── entry.sh │ │ ├── main.py │ │ └── requirements.txt ├── textract_comprehend_medical │ ├── Dockerfile │ ├── app │ │ ├── __init__.py │ │ ├── entry.sh │ │ ├── main.py │ │ ├── requirements.txt │ │ └── trp.py │ ├── env.json │ ├── events │ │ └── event.json │ ├── template.yaml │ ├── test_sam_local.sh │ └── tests │ │ ├── data │ │ ├── sample_manifest.json │ │ └── simple_feature_manifest.json │ │ └── test_pdf_mapper_for_fhir.py ├── textract_sync │ ├── Dockerfile │ ├── app │ │ ├── entry.sh │ │ ├── requirements.txt │ │ └── sync_main.py │ ├── env.json │ ├── events │ │ ├── event.json │ │ └── simple-event.json │ ├── template.yaml │ └── test_sam_local.sh └── workmail_s3 │ ├── Dockerfile │ └── app │ ├── main.py │ └── requirements.txt ├── package.json ├── src ├── cfnCustomResourceConfiguratorPrefill.ts ├── comprehendClassification.ts ├── 
documentSplitter.ts ├── index.ts ├── rdsAuroraServerless.ts ├── rdsCSVToAurora.ts ├── rdsServerlessInit.ts ├── searchablePDF.ts ├── spacyClassification.ts ├── stepFunctionsExecutionsStartThrottle.ts ├── textractA2I.ts ├── textractClassificationConfigurator.ts ├── textractComprehendMedical.ts ├── textractDecider.ts ├── textractGenerateCSV.ts ├── textractOutputConfigToJSON.ts ├── textractPdfMapperForFhir.ts ├── textractSync.ts └── workmailS3IngestionPoint.ts ├── test ├── comprehendClassification.test.ts ├── rdsAuroraServerless.test.ts ├── rdsCSVToAurora.test.ts ├── spacyClassification.test.ts ├── test_csv_generator.py ├── textractA2I.test.ts ├── textractAsync.test.ts ├── textractClassificationConfigurator.test.ts ├── textractDecider.test.ts ├── textractGenerateCsv.test.ts ├── textractOutputConfigToJSON.test.ts ├── textractPdfMapperForFhir.test.ts ├── textractSync.test.ts └── workmailS3IngestionPoint.test.ts ├── tsconfig.dev.json ├── update_dependencies_local.sh └── yarn.lock /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-idp-cdk-constructs/a681ae7847812529525bb41aa39de5fe105d3aab/.DS_Store -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501,W503 3 | max-line-length = 120 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # ~~ Generated by projen. To modify, edit .projenrc.js and run "npx projen". 
2 | 3 | *.snap linguist-generated 4 | /.eslintrc.json linguist-generated 5 | /.gitattributes linguist-generated 6 | /.github/pull_request_template.md linguist-generated 7 | /.github/workflows/build.yml linguist-generated 8 | /.github/workflows/pull-request-lint.yml linguist-generated 9 | /.github/workflows/release.yml linguist-generated 10 | /.github/workflows/upgrade-main.yml linguist-generated 11 | /.gitignore linguist-generated 12 | /.mergify.yml linguist-generated 13 | /.npmignore linguist-generated 14 | /.projen/** linguist-generated 15 | /.projen/deps.json linguist-generated 16 | /.projen/files.json linguist-generated 17 | /.projen/tasks.json linguist-generated 18 | /API.md linguist-generated 19 | /LICENSE linguist-generated 20 | /package.json linguist-generated 21 | /tsconfig.dev.json linguist-generated 22 | /yarn.lock linguist-generated -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | Fixes # -------------------------------------------------------------------------------- /.github/workflows/pull-request-lint.yml: -------------------------------------------------------------------------------- 1 | # ~~ Generated by projen. To modify, edit .projenrc.js and run "npx projen". 
2 | 3 | name: pull-request-lint 4 | on: 5 | pull_request_target: 6 | types: 7 | - labeled 8 | - opened 9 | - synchronize 10 | - reopened 11 | - ready_for_review 12 | - edited 13 | jobs: 14 | validate: 15 | name: Validate PR title 16 | runs-on: ubuntu-latest 17 | permissions: 18 | pull-requests: write 19 | steps: 20 | - uses: amannn/action-semantic-pull-request@v5.4.0 21 | env: 22 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 23 | with: 24 | types: |- 25 | feat 26 | fix 27 | chore 28 | requireScope: false 29 | -------------------------------------------------------------------------------- /.github/workflows/upgrade-main.yml: -------------------------------------------------------------------------------- 1 | # ~~ Generated by projen. To modify, edit .projenrc.js and run "npx projen". 2 | 3 | name: upgrade-main 4 | on: 5 | workflow_dispatch: {} 6 | schedule: 7 | - cron: 0 0 * * * 8 | jobs: 9 | upgrade: 10 | name: Upgrade 11 | runs-on: ubuntu-latest 12 | permissions: 13 | contents: read 14 | outputs: 15 | patch_created: ${{ steps.create_patch.outputs.patch_created }} 16 | steps: 17 | - name: Checkout 18 | uses: actions/checkout@v4 19 | with: 20 | ref: main 21 | - name: Setup Node.js 22 | uses: actions/setup-node@v4 23 | with: 24 | node-version: 18.x 25 | - name: Install dependencies 26 | run: yarn install --check-files --frozen-lockfile 27 | - name: Upgrade dependencies 28 | run: npx projen upgrade 29 | - name: Find mutations 30 | id: create_patch 31 | run: |- 32 | git add . 
33 | git diff --staged --patch --exit-code > .repo.patch || echo "patch_created=true" >> $GITHUB_OUTPUT 34 | working-directory: ./ 35 | - name: Upload patch 36 | if: steps.create_patch.outputs.patch_created 37 | uses: actions/upload-artifact@v4 38 | with: 39 | name: .repo.patch 40 | path: .repo.patch 41 | overwrite: true 42 | pr: 43 | name: Create Pull Request 44 | needs: upgrade 45 | runs-on: ubuntu-latest 46 | permissions: 47 | contents: read 48 | if: ${{ needs.upgrade.outputs.patch_created }} 49 | steps: 50 | - name: Checkout 51 | uses: actions/checkout@v4 52 | with: 53 | ref: main 54 | - name: Download patch 55 | uses: actions/download-artifact@v4 56 | with: 57 | name: .repo.patch 58 | path: ${{ runner.temp }} 59 | - name: Apply patch 60 | run: '[ -s ${{ runner.temp }}/.repo.patch ] && git apply ${{ runner.temp }}/.repo.patch || echo "Empty patch. Skipping."' 61 | - name: Set git identity 62 | run: |- 63 | git config user.name "github-actions" 64 | git config user.email "github-actions@github.com" 65 | - name: Create Pull Request 66 | id: create-pr 67 | uses: peter-evans/create-pull-request@v6 68 | with: 69 | token: ${{ secrets.PROJEN_GITHUB_TOKEN }} 70 | commit-message: |- 71 | chore(deps): upgrade dependencies 72 | 73 | Upgrades project dependencies. See details in [workflow run]. 74 | 75 | [Workflow Run]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} 76 | 77 | ------ 78 | 79 | *Automatically created by projen via the "upgrade-main" workflow* 80 | branch: github-actions/upgrade-main 81 | title: "chore(deps): upgrade dependencies" 82 | body: |- 83 | Upgrades project dependencies. See details in [workflow run]. 
84 | 85 | [Workflow Run]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} 86 | 87 | ------ 88 | 89 | *Automatically created by projen via the "upgrade-main" workflow* 90 | author: github-actions 91 | committer: github-actions 92 | signoff: true 93 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ~~ Generated by projen. To modify, edit .projenrc.js and run "npx projen". 2 | !/.gitattributes 3 | !/.projen/tasks.json 4 | !/.projen/deps.json 5 | !/.projen/files.json 6 | !/.github/workflows/pull-request-lint.yml 7 | !/package.json 8 | !/LICENSE 9 | !/.npmignore 10 | logs 11 | *.log 12 | npm-debug.log* 13 | yarn-debug.log* 14 | yarn-error.log* 15 | lerna-debug.log* 16 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 17 | pids 18 | *.pid 19 | *.seed 20 | *.pid.lock 21 | lib-cov 22 | coverage 23 | *.lcov 24 | .nyc_output 25 | build/Release 26 | node_modules/ 27 | jspm_packages/ 28 | *.tsbuildinfo 29 | .eslintcache 30 | *.tgz 31 | .yarn-integrity 32 | .cache 33 | test/__snapshots__/ 34 | __pycache__ 35 | .python-version 36 | .aws-sam 37 | .nvmrc 38 | .vscode 39 | update_dependencies_local.sh 40 | .idea 41 | /test-reports/ 42 | junit.xml 43 | /coverage/ 44 | !/.github/workflows/build.yml 45 | /dist/changelog.md 46 | /dist/version.txt 47 | !/.github/workflows/release.yml 48 | !/.mergify.yml 49 | !/.github/workflows/upgrade-main.yml 50 | !/.github/pull_request_template.md 51 | !/test/ 52 | !/tsconfig.dev.json 53 | !/src/ 54 | /lib 55 | /dist/ 56 | !/.eslintrc.json 57 | .jsii 58 | tsconfig.json 59 | !/API.md 60 | !/.projenrc.js 61 | -------------------------------------------------------------------------------- /.mergify.yml: -------------------------------------------------------------------------------- 1 | # ~~ Generated by projen. To modify, edit .projenrc.js and run "npx projen". 
2 | 3 | queue_rules: 4 | - name: default 5 | update_method: merge 6 | conditions: 7 | - "#approved-reviews-by>=1" 8 | - -label~=(do-not-merge) 9 | - status-success=build 10 | - status-success=package-js 11 | - status-success=package-java 12 | - status-success=package-python 13 | pull_request_rules: 14 | - name: Automatic merge on approval and successful build 15 | actions: 16 | delete_head_branch: {} 17 | queue: 18 | method: squash 19 | name: default 20 | commit_message_template: |- 21 | {{ title }} (#{{ number }}) 22 | 23 | {{ body }} 24 | conditions: 25 | - "#approved-reviews-by>=1" 26 | - -label~=(do-not-merge) 27 | - status-success=build 28 | - status-success=package-js 29 | - status-success=package-java 30 | - status-success=package-python 31 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | # ~~ Generated by projen. To modify, edit .projenrc.js and run "npx projen". 
2 | /.projen/ 3 | /test-reports/ 4 | junit.xml 5 | /coverage/ 6 | permissions-backup.acl 7 | /dist/changelog.md 8 | /dist/version.txt 9 | /.mergify.yml 10 | /test/ 11 | /tsconfig.dev.json 12 | /src/ 13 | !/lib/ 14 | !/lib/**/*.js 15 | !/lib/**/*.d.ts 16 | dist 17 | /tsconfig.json 18 | /.github/ 19 | /.vscode/ 20 | /.idea/ 21 | /.projenrc.js 22 | tsconfig.tsbuildinfo 23 | /.eslintrc.json 24 | !.jsii 25 | /.gitattributes 26 | -------------------------------------------------------------------------------- /.projen/deps.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": [ 3 | { 4 | "name": "@types/jest", 5 | "type": "build" 6 | }, 7 | { 8 | "name": "@types/node", 9 | "version": "^18", 10 | "type": "build" 11 | }, 12 | { 13 | "name": "@typescript-eslint/eslint-plugin", 14 | "version": "^6", 15 | "type": "build" 16 | }, 17 | { 18 | "name": "@typescript-eslint/parser", 19 | "version": "^6", 20 | "type": "build" 21 | }, 22 | { 23 | "name": "aws-cdk-lib", 24 | "version": "^2.135.0", 25 | "type": "build" 26 | }, 27 | { 28 | "name": "eslint-import-resolver-typescript", 29 | "type": "build" 30 | }, 31 | { 32 | "name": "eslint-plugin-import", 33 | "type": "build" 34 | }, 35 | { 36 | "name": "eslint", 37 | "version": "^8", 38 | "type": "build" 39 | }, 40 | { 41 | "name": "jest", 42 | "type": "build" 43 | }, 44 | { 45 | "name": "jest-junit", 46 | "version": "^15", 47 | "type": "build" 48 | }, 49 | { 50 | "name": "jsii-diff", 51 | "type": "build" 52 | }, 53 | { 54 | "name": "jsii-docgen", 55 | "type": "build" 56 | }, 57 | { 58 | "name": "jsii-pacmak", 59 | "type": "build" 60 | }, 61 | { 62 | "name": "jsii-rosetta", 63 | "version": "~5.3.0", 64 | "type": "build" 65 | }, 66 | { 67 | "name": "jsii", 68 | "version": "~5.3.0", 69 | "type": "build" 70 | }, 71 | { 72 | "name": "projen", 73 | "type": "build" 74 | }, 75 | { 76 | "name": "standard-version", 77 | "version": "^9", 78 | "type": "build" 79 | }, 80 | { 81 | "name": 
"ts-jest", 82 | "type": "build" 83 | }, 84 | { 85 | "name": "typescript", 86 | "type": "build" 87 | }, 88 | { 89 | "name": "aws-cdk-lib", 90 | "version": "^2.135.0", 91 | "type": "peer" 92 | }, 93 | { 94 | "name": "constructs", 95 | "version": "^10.0.5", 96 | "type": "peer" 97 | } 98 | ], 99 | "//": "~~ Generated by projen. To modify, edit .projenrc.js and run \"npx projen\"." 100 | } 101 | -------------------------------------------------------------------------------- /.projen/files.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": [ 3 | ".eslintrc.json", 4 | ".gitattributes", 5 | ".github/pull_request_template.md", 6 | ".github/workflows/build.yml", 7 | ".github/workflows/pull-request-lint.yml", 8 | ".github/workflows/release.yml", 9 | ".github/workflows/upgrade-main.yml", 10 | ".gitignore", 11 | ".mergify.yml", 12 | ".projen/deps.json", 13 | ".projen/files.json", 14 | ".projen/tasks.json", 15 | "LICENSE", 16 | "tsconfig.dev.json" 17 | ], 18 | "//": "~~ Generated by projen. To modify, edit .projenrc.js and run \"npx projen\"." 
19 | } 20 | -------------------------------------------------------------------------------- /.projenrc.js: -------------------------------------------------------------------------------- 1 | const { awscdk } = require('projen'); 2 | const project = new awscdk.AwsCdkConstructLibrary({ 3 | author: 'Martin Schade', 4 | authorAddress: '45048633+schadem@users.noreply.github.com', 5 | cdkVersion: '2.135.0', 6 | jsiiVersion: '~5.3.0', 7 | defaultReleaseBranch: 'main', 8 | name: 'amazon-textract-idp-cdk-constructs', 9 | repositoryUrl: 'https://github.com/aws-samples/amazon-textract-idp-cdk-constructs.git', 10 | gitignore: ['test/__snapshots__/', '__pycache__', '.python-version', '.aws-sam', '.nvmrc', '.vscode', 'update_dependencies_local.sh', '.idea'], 11 | devDeps: ['aws-cdk-lib@^2.135.0', 'jsii-rosetta@^5.3.0'], 12 | peerDeps: ['aws-cdk-lib@^2.135.0'], 13 | keywords: ['aws-cdk', 'schadem', 'textract', 'amazon-textract', 'idp'], 14 | license: 'MIT-0', 15 | copyrightPeriod: '2022-', 16 | copyrightOwner: 'Amazon.com, Inc. or its affiliates. 
All Rights Reserved.', 17 | release: true, 18 | publishToPypi: { 19 | distName: 'amazon-textract-idp-cdk-constructs', 20 | module: 'amazon_textract_idp_cdk_constructs', 21 | prePublishSteps: [ 22 | { run: 'mv dist .repo' }, 23 | { run: 'cd .repo && yarn install --check-files --frozen-lockfile' }, 24 | { run: 'python -m pip install --upgrade pip' }, 25 | { run: 'pip install --upgrade setuptools' }, 26 | { run: 'cd .repo && npx projen package:python' }, 27 | { run: 'mv .repo/dist dist' }, 28 | ], 29 | }, 30 | python: { 31 | distName: 'amazon-textract-idp-cdk-constructs', 32 | module: 'amazon_textract_idp_cdk_constructs', 33 | }, 34 | publishToMaven: { 35 | javaPackage: 'software.amazon.textract.idp', 36 | mavenArtifactId: 'idp-cdk-constructs', 37 | mavenGroupId: 'software.amazon.textract.idp', 38 | mavenServerId: 'ossrh', 39 | serverId: 'MavenCentral', 40 | mavenEndpoint: 'https://aws.oss.sonatype.org/', 41 | mavenRepositoryUrl: 'https://aws.oss.sonatype.org/', 42 | }, 43 | }); 44 | project.synth(); 45 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "lambda" 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true, 7 | "python.defaultInterpreterPath": "~/.pyenv/versions/3.11.2/envs/samples/bin/python" 8 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. 
Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /Config: -------------------------------------------------------------------------------- 1 | package.Amazon-textract-idp-cdk-constructs = { 2 | interfaces = (1.0); 3 | 4 | # Use NoOpBuild. 
See https://w.amazon.com/index.php/BrazilBuildSystem/NoOpBuild 5 | build-system = no-op; 6 | build-tools = { 7 | 1.0 = { 8 | NoOpBuild = 1.0; 9 | }; 10 | }; 11 | 12 | # Use runtime-dependencies for when you want to bring in additional 13 | # packages when deploying. 14 | # Use dependencies instead if you intend for these dependencies to 15 | # be exported to other packages that build against you. 16 | dependencies = { 17 | 1.0 = { 18 | }; 19 | }; 20 | 21 | runtime-dependencies = { 22 | 1.0 = { 23 | }; 24 | }; 25 | 26 | }; 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022- Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
15 | 16 | -------------------------------------------------------------------------------- /amazon-textract-idp-cdk-stack-samples.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "../amazon-textract-idp-cdk-stack-samples" 5 | }, 6 | { 7 | "path": "." 8 | } 9 | ], 10 | "settings": { 11 | "python.analysis.autoImportCompletions": true, 12 | "python.analysis.extraPaths": [ 13 | "~/.pyenv/versions/3.11.2/envs/samples/bin/python" 14 | ], 15 | "python.analysis.include": [ 16 | "~/.pyenv/versions/3.11.2/envs/samples/bin/python" 17 | ] 18 | } 19 | } -------------------------------------------------------------------------------- /images/Workmail_Lambda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-idp-cdk-constructs/a681ae7847812529525bb41aa39de5fe105d3aab/images/Workmail_Lambda.png -------------------------------------------------------------------------------- /images/email_rule_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-idp-cdk-constructs/a681ae7847812529525bb41aa39de5fe105d3aab/images/email_rule_1.png -------------------------------------------------------------------------------- /images/email_rule_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-idp-cdk-constructs/a681ae7847812529525bb41aa39de5fe105d3aab/images/email_rule_2.png -------------------------------------------------------------------------------- /images/email_rule_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-idp-cdk-constructs/a681ae7847812529525bb41aa39de5fe105d3aab/images/email_rule_3.png 
-------------------------------------------------------------------------------- /lambda/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-idp-cdk-constructs/a681ae7847812529525bb41aa39de5fe105d3aab/lambda/.DS_Store -------------------------------------------------------------------------------- /lambda/a2i_postprocess/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9-x86_64 2 | 3 | RUN /var/lang/bin/python -m pip install --upgrade pip 4 | 5 | # Copy function code 6 | COPY app/* ${LAMBDA_TASK_ROOT}/ 7 | 8 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 9 | CMD [ "main.lambda_handler" ] 10 | -------------------------------------------------------------------------------- /lambda/a2i_postprocess/app/main.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | import os 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | dynamo_db_client = boto3.client("dynamodb") 9 | step_functions_client = boto3.client(service_name='stepfunctions') 10 | 11 | # s3 = boto3.resource('s3') 12 | 13 | 14 | #Post processing lambda function. Post annotated coordinates to the DDB after the human review. 
def lambda_handler(event, _):
    """Report the outcome of a finished A2I human loop back to Step Functions.

    The task token is read from the DynamoDB table named by the
    ``TOKEN_STORE_DDB`` environment variable (key ``ID`` = human loop name).
    A loop whose status is ``Failed`` is reported with ``send_task_failure``;
    every other status is reported with ``send_task_success``.

    Args:
        event: Dict with a ``detail`` mapping. ``humanLoopName`` and
            ``humanLoopStatus`` are always required. ``failureReason`` and
            ``failureCode`` are optional. ``humanLoopOutput.outputS3Uri`` and
            ``creationTime`` are required for any status other than
            ``Failed``.
        _: Lambda context (unused).

    Returns:
        None.

    Raises:
        Exception: If ``TOKEN_STORE_DDB`` is not set.
        KeyError: If a required event field is missing, or no token is stored
            for the human loop.
    """
    log_level = os.environ.get('LOG_LEVEL', 'INFO')
    logger.setLevel(log_level)
    logger.info(json.dumps(event))

    token_store_ddb = os.environ.get('TOKEN_STORE_DDB', None)
    if not token_store_ddb:
        raise Exception("no TOKEN_STORE_DDB set")

    logger.info(
        f"LOG_LEVEL: {log_level} \n TOKEN_STORE_DDB: {token_store_ddb} \n ")

    detail = event['detail']
    human_loop_name = detail['humanLoopName']
    human_loop_status = detail['humanLoopStatus']

    ddb_response = dynamo_db_client.get_item(
        TableName=token_store_ddb, Key={"ID": {
            'S': human_loop_name
        }})
    logger.debug(f"ddb_response: {ddb_response}")
    item = ddb_response.get('Item')
    if not item:
        # Keep the KeyError type of the original lookup, but say what is missing.
        raise KeyError(
            f"no task token stored for human loop '{human_loop_name}'")
    task_token = item['Token']['S']

    if human_loop_status == 'Failed':
        # Step Functions expects the short error code in `error` (max 256
        # chars) and the detailed explanation in `cause` (max 32768 chars);
        # this also matches how the a2i_preprocess handler reports failures.
        step_functions_client.send_task_failure(
            taskToken=task_token,
            error=detail.get('failureCode', "")[:256],
            cause=detail.get('failureReason', "")[:32768])
        return

    # Only the success payload needs the result location and creation time,
    # so they are read after the failure branch: a Failed event that carries
    # no output can still release the waiting task.
    response = {
        'humanLoopStatus': human_loop_status,
        'humanLoopResultPath': detail['humanLoopOutput']['outputS3Uri'],
        'humanLoopCreationTime': detail['creationTime'],
    }
    step_functions_client.send_task_success(taskToken=task_token,
                                            output=json.dumps(response))
"AQEBFsALOXYR+wBoaOuipqckuug4nzo9nZu3kQn6/29y3fGqgCZA4qC2rmBQh4Gwk1aMRZw1Ln36WAuua11krVJk965MLskbZX+ssop0PmoOfKIO/amBDVKii1GoONjSDf4iEjskylhl6hsWerMbZVV9EtB7Ns3DwjfSGW/KjkrFXq0xSU3E6kJQyvT4mL/QYyZxMQioLmcFVIEUEXjJAiXGdkCvA07WA2ZxGxiMNg5xGCuQJZweG8Lu/jGVuz8OGP3pra6rI2PsLBYuQyi2wuXjPC08hThV82O/FMPwMK8q/LgoHQ0LqSHPIP4tzC2C6qwc9POPxcVStMKcwXmqFVAWu8mi9Qeuen/OYi7LE7hqsSWBopKmtH+EKOuXufqvzH6oIbHJbTYe8tMA9Wo/k3vMpZdoyy1YTU5NQRxBMP3JT0dFdUSRlLug/teEzVQphqwS78xsVArIflkEaOIVRMnQ2bF/WelPLW8AxEEZii4eI4E=", 6 | "body": "{\"Token\": \"AQCcAAAAKgAAAAMAAAAAAAAAAYLAykXaD0L6sxF5tTZdGf4fFXbifBuRyL9GYJQgebAV6p/Z6RWmH3SA+sXA9AnHizaVC/1Hk8WJ+9M0v2+ns1rKhG+VRqc0C+zdxZApsXgiqVXczaDMdAZtR7Tl16WY5sgPKIc=5bN+WqJIdPClk/vy94JveNPfoS2k4dvoTRBknpSWkEIluL1PftteQID4q7FpVhdgtemfWLgZGh+quOA+8ZL3hJVVCYvlsU8/R3jv2XpTCvWTB3+F1okR4v27I/37V8C5MppyGmM4IrHmiYQcUjxAI8oks+RAZNVXaQbzg/qWadyx1KOmXG55iA9FDSx3Z04mNOh3WoVKZXqGmjjqw6q1S/zZXor+KIZO4ZwygwGnR93o4uaxhbG+nzMWyIRtEdqHKVx5SWGDbVysSj1p7h4ITM/pAq2SKj6aNtbjpulkuN/YTaNSk3PqYYLnWKP37+1KZr457zh8kCEwhoSfGwSHb0KXobkag9BnUZHADcvSvm0tePNr1ucdWYJrnEq1El6AfjOagYdEF5fMGVB7Koqy4PtP1wh4IWPaklbdQDZ5LNlpNNfUVRqTiR2pbhb+myW8BGamIm+KSES4W5/U6GxyCk1cAQovCM4BMTFOuv5uvbJMGe9BYEn+Vqq2a/6+yzUUS3CvKoVtqmx8wgdNdyOd\", \"Payload\": {\"manifest\": {\"textractFeatures\": [\"FORMS\", \"QUERIES\"], \"s3Path\": \"s3://schademcdkstackpaystuban-schademcdkidpstackpaystu-bt0j5wq0zftu/uploads/000429.pdf-1.jpg\", \"queriesConfig\": [{\"alias\": \"W2_FORM_YEAR\", \"text\": \"What is the form year ?\"}, {\"alias\": \"W2_FORM_TYPE\", \"text\": \"What is the form type ?\"}, {\"alias\": \"W2_EMPLOYEE_SSN\", \"text\": \"What is the Employee SSN ?\"}, {\"alias\": \"W2_EMPLOYER_NAME\", \"text\": \"What is the Employer Name ?\"}, {\"alias\": \"W2_WAGES_TIPS_OTHER\", \"text\": \"What is wages, tips, other compensation amount ?\"}, {\"alias\": \"W2_FEDERAL_INCOME_TAX\", \"text\": \"What is the Federal Income Tax withheld amount ?\"}, {\"alias\": \"W2_SS_WAGES\", \"text\": \"What is the social security 
wages amount ?\"}, {\"alias\": \"W2_SS_TAX\", \"text\": \"What is the social security Taxes withheld amount ?\"}, {\"alias\": \"W2_12a_VALUE_TYPE\", \"text\": \"What is the value type in Box 12 a ?\"}, {\"alias\": \"W2_12a_VALUE_AMOUNT\", \"text\": \"What is the value amount in Box 12 a ?\"}, {\"alias\": \"W2_12b_VALUE_TYPE\", \"text\": \"What is the value type in Box 12 b ?\"}, {\"alias\": \"W2_12b_VALUE_AMOUNT\", \"text\": \"What is the value amount in Box 12 b ?\"}, {\"alias\": \"W2_12c_VALUE_TYPE\", \"text\": \"What is the value type in Box 12 c ?\"}, {\"alias\": \"W2_13_STATUTORY\", \"text\": \"Is Box 13 Statutory employee selected ?\"}, {\"alias\": \"W2_13_RETIREMENT_PLAN\", \"text\": \"Is Box 13 Retirement plan selected ?\"}, {\"alias\": \"W2_13_THIRD_PARTY_SICK_PAY\", \"text\": \"Is Box 13 Third - party sick pay selected ?\"}]}, \"mime\": \"image/jpeg\", \"classification\": {\"documentType\": \"AWS_W2\"}, \"numberOfPages\": 1, \"Random\": {\"randomNumber\": 20}, \"textract_result\": {\"TextractOutputJsonPath\": \"s3://schademcdkstackpaystuban-schademcdkidpstackpaystu-bt0j5wq0zftu/textract-output/000429.pdf-12022-06-14T01:19:14.763537/000429.pdf-1.json\"}, \"txt_output_location\": {\"TextractOutputCSVPath\": \"s3://schademcdkstackpaystuban-schademcdkidpstackpaystu-bt0j5wq0zftu/txt_output/2022-06-14T01:19:17+00:00/000429.pdf-1.txt\"}}, \"ExecutionId\": \"arn:aws:states:us-east-1:913165245630:execution:PaystubW2WorkflowPythonFCA0DA8F-FBUPIeYaS6Qb:000429pdf-1jpg2022-06-14T011907975165\"}", 7 | "attributes": { 8 | "ApproximateReceiveCount": "1", 9 | "SentTimestamp": "1655169563470", 10 | "SenderId": "AROA5JHHD3S7JITUAYAMG:SchademCdkStackPaystubAnd-textractsynctaskwithconf-hmuPNa9kDDSw", 11 | "ApproximateFirstReceiveTimestamp": "1655169563479" 12 | }, 13 | "messageAttributes": {}, 14 | "md5OfBody": "f65e89960a426fbe9ca091f2690a52b5", 15 | "eventSource": "aws:sqs", 16 | "eventSourceARN": 
"arn:aws:sqs:us-east-1:913165245630:SchademCdkStackPaystubAndW2Stack-textractsynctaskwithconfigSyncRequ-WeRe6ALWgek7", 17 | "awsRegion": "us-east-1" 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /lambda/a2i_postprocess/events/simple-event.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest": { 3 | "S3Path": "s3://sdx-textract-us-east-1/employeeapp20210510.png" 4 | }, 5 | "mime": "image/png", 6 | "numberOfPages": 1 7 | } 8 | -------------------------------------------------------------------------------- /lambda/a2i_postprocess/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for sam-app 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | SyncFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: Image 17 | Architectures: 18 | - x86_64 19 | Environment: 20 | Variables: 21 | S3_OUTPUT_PREFIX: textract-output 22 | S3_OUTPUT_BUCKET: schademcdkstackpaystubst-schademcdkidpstackpython-1fvi0dqoz24lj 23 | SQS_QUEUE_URL: SchademCdkStackPaystubStack-textractsynctaskSyncRequestsBC26E72B-kID9dZUtmCZM 24 | Metadata: 25 | Dockerfile: Dockerfile 26 | DockerContext: . 
def lambda_handler(event, _):
    """Start an A2I human review loop for a waiting Step Functions task.

    The task token is stored in the DynamoDB table named by ``TOKEN_STORE_DDB``
    under a freshly generated UUID, and that same UUID is used as the human
    loop name. When ``A2I_FLOW_DEFINITION_ARN`` or the input path is the
    literal ``"DEV"``, or the ARN is the literal ``"None"``, no loop is started
    and the task is completed immediately.

    Args:
        event: Dict with ``Token`` (Step Functions task token) and
            ``ExecutionId``; ``Payload.a2iInputPath`` is optional.
        _: Lambda context (unused).

    Returns:
        The response dict that was sent with ``send_task_success``, or None
        when an error was reported with ``send_task_failure``.

    Raises:
        Exception: If ``TOKEN_STORE_DDB`` is not set.
        KeyError: If ``Token`` or ``ExecutionId`` is missing from the event.
    """
    log_level = os.environ.get('LOG_LEVEL', 'INFO')
    logger.setLevel(log_level)
    logger.info(json.dumps(event))

    a2i_flow_definition_arn = os.environ.get('A2I_FLOW_DEFINITION_ARN')
    logger.info(f"A2I_FLOW_DEFINITION_ARN: {a2i_flow_definition_arn}")
    token_store_ddb = os.environ.get('TOKEN_STORE_DDB', None)
    if not token_store_ddb:
        raise Exception("no TOKEN_STORE_DDB set")

    logger.info(f"LOG_LEVEL: {log_level} \n \
        TOKEN_STORE_DDB: {token_store_ddb} \n \
        A2I_FLOW_DEFINITION_ARN: {a2i_flow_definition_arn}")

    task_token = event['Token']
    execution_id = event['ExecutionId']

    try:
        if 'Payload' in event and 'a2iInputPath' in event['Payload']:
            # FIXME: hard coded result location
            a2i_input_path = event['Payload']['a2iInputPath']
        else:
            a2i_input_path = ""

        if a2i_flow_definition_arn == "DEV" or a2i_input_path == "DEV":
            response = {
                'HumanLoopCreation':
                'skipped (a2i_input_path and/or a2i_flow_definitiona are "None". doing development testing)',
                'human_loop_result': "DEV"
            }
            step_functions_client.send_task_success(
                taskToken=task_token, output=json.dumps(response))
            # The token is consumed at this point. Without this return the
            # handler would go on to store the token, try to start a loop with
            # the "DEV" placeholder and call back a second time.
            return response

        uuid_key = str(uuid.uuid4())
        loop = {
            'humanLoopName': uuid_key,
            'imageTime': str(datetime.now().timestamp()).replace(".", ""),
            'humanLoopStatus': 'Pending',
            'humanAnswers': [],
            'taskObject': a2i_input_path,
        }

        logger.debug(f"uuid_key: {uuid_key}")
        # The `ttltimestamp` attribute is set seven days ahead of now.
        ttl_timestamp = int(time.time()) + int(
            timedelta(days=7).total_seconds())
        ddb_response = dynamo_db_client.put_item(
            TableName=token_store_ddb,
            Item={
                "ID": {
                    'S': uuid_key
                },
                "Type": {
                    'S': "A2I"
                },
                "Token": {
                    'S': task_token
                },
                "WorkflowId": {
                    'S': execution_id
                },
                "ttltimestamp": {
                    'N': str(ttl_timestamp)
                }
            })
        logger.debug(f"ddb_response: {ddb_response}")
        logger.debug(f"loop: {json.dumps(loop)}")

        if a2i_flow_definition_arn != "None":
            res = a2i.start_human_loop(
                HumanLoopName=uuid_key,
                FlowDefinitionArn=a2i_flow_definition_arn,
                HumanLoopInput={'InputContent': json.dumps(loop)})
            logger.info(f'A2I Response: {res}')
            response = {'HumanLoopCreation': 'Success'}
        else:
            response = {
                'HumanLoopCreation':
                'skipped (doing development testing, passing through Textract post processing json)',
                'human_loop_result': "some text"
            }
            step_functions_client.send_task_success(
                taskToken=task_token, output=json.dumps(response))
        return response
    except Exception as e:
        logger.error(e, exc_info=True)
        step_functions_client.send_task_failure(taskToken=task_token,
                                                error=str(type(e)),
                                                cause=str(e)[:250])
"AQCIAAAAKgAAAAMAAAAAAAAAAc3tS2DnL8l0YMCb6ZtHTTJFB+QkEbcrpuWuPJqbi6cD+pIwVfyDoY8mf4C6ElMgGkX4/JM7TJw33EzoxPo6oW66eSYclQE3afRqp5DOQPfaokkp+xqpvumWg7MYcXYofDyoVRoUrNcd3TJw/eGwevGyrAVhV/rn+ushFk/NiXNHD1fJQv6qPrX4AykbC6pHQk8DSnxElhy+JjO/Ncou2RPWAkZzkzSX7s1Ozu005owMoGbCN1IGAuQTa2Tz6vn2OikOrkiXGRTJOSbc9OSYeopalvRgjTgnQznNXnPs9e1q4IGCJM9PIsXrcnFBTXhBtpvYCArMj+RvduQt9L/wrfJg6IC0HCHKHkoXH5jRQ5km7BGk7WUskqjyJTF/W9+7cVoEPA6eKyxhCTjjpe9qdNxZMUhM2ysaqfXO+whM74EN2zgNqOVesVnKD6SBr306E/O3B2KSMCBgTkRKJy3ANGA2ZZnDxazza24/Tdbrqcpx2m1nrcQLTF7g4qNmNiMRGJoNBVj8HAccf+PUKHDuOOzrW/h1I3whsYtp6e0OxUhP5cBfnGisTOlk8LKbpp7X9XByoRDRhEV6qKxa+EjVBG1P8HvDUqkCyYMzQ9YDobBurxLmbByLK2u/ADPy9Lw2RGKngNtB", 3 | "Payload": { 4 | "manifest": { 5 | "s3Path": "s3://poc-schademcdkidpstackpaystubw2b23e1d7e-1tdayat9yter1/insurance-uploads/Paystub_1_reMars.png" 6 | }, 7 | "mime": "image/png", 8 | "classification": null, 9 | "numberOfPages": 1, 10 | "textract_result": { 11 | "TextractOutputJsonPath": "s3://poc-schademcdkidpstackpaystubw2b23e1d7e-1tdayat9yter1/textract-output/Paystub_1_reMars2022-06-30T23:20:29.549375/Paystub_1_reMars.json" 12 | } 13 | }, 14 | "ExecutionId": "arn:aws:states:us-east-1:913165245630:execution:InsuranceB9D5EC45-EJHxIZQo3SAu:Paystub_1_reMarspng2022-06-30T232022920390" 15 | } 16 | -------------------------------------------------------------------------------- /lambda/a2i_preprocess/events/simple-event.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest": { 3 | "S3Path": "s3://sdx-textract-us-east-1/employeeapp20210510.png" 4 | }, 5 | "mime": "image/png", 6 | "numberOfPages": 1 7 | } 8 | -------------------------------------------------------------------------------- /lambda/a2i_preprocess/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for 
sam-app 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | A2IFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: Image 17 | Architectures: 18 | - x86_64 19 | Environment: 20 | Variables: 21 | A2I_FLOW_DEFINITION_ARN: arn:aws:sagemaker:us-east-1:913165245630:flow-definition/simplea2iworkflow 22 | LOG_LEVEL: DEBUG 23 | TOKEN_STORE_DDB: POC-TextractA2IA2ITaskTokenTable63B01979-OV3F8H9GOZ7O 24 | Metadata: 25 | Dockerfile: Dockerfile 26 | DockerContext: . 27 | DockerTag: python3.9-v1 28 | 29 | -------------------------------------------------------------------------------- /lambda/a2i_preprocess/test_sam_local.sh: -------------------------------------------------------------------------------- 1 | sam build 2 | sam local invoke -e events/event.json -n env.json 3 | -------------------------------------------------------------------------------- /lambda/async_to_json/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9-x86_64 2 | RUN /var/lang/bin/python -m pip install --upgrade pip 3 | RUN python -m pip install amazon-textract-caller==0.0.29 amazon-textract-idp-cdk-manifest marshmallow --upgrade --target "${LAMBDA_TASK_ROOT}" 4 | 5 | # Copy function code 6 | COPY app/* ${LAMBDA_TASK_ROOT}/ 7 | 8 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 9 | CMD [ "main.lambda_handler" ] 10 | -------------------------------------------------------------------------------- /lambda/async_to_json/app/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
def lambda_handler(event, _):
    """Store the result of an asynchronous Textract job as one JSON object.

    The job output location is taken from
    ``event['textract_result']['TextractTempOutputJsonPath']``: the URL host is
    the bucket, the directory part of the path is the prefix, and the last
    path segment is the job id. The JSON is fetched through ``textractcaller``
    and written to
    ``s3://<S3_OUTPUT_BUCKET>/<S3_OUTPUT_PREFIX>/<uuid4>/<input file stem>.json``.

    Environment:
        LOG_LEVEL: logger level, default ``INFO``.
        S3_OUTPUT_BUCKET / S3_OUTPUT_PREFIX: required output location.
        TEXTRACT_API: ``GENERIC`` (default) or ``LENDING``.

    Args:
        event: Dict with ``manifest`` (loaded through ``IDPManifestSchema``)
            and ``textract_result.TextractTempOutputJsonPath``.
        _: Lambda context (unused).

    Returns:
        The same ``event`` dict with
        ``event['textract_result']['TextractOutputJsonPath']`` set to the S3
        URI of the written object.

    Raises:
        Exception: If a required environment variable is missing or the
            caller library returned no JSON.
        ValueError: If ``TEXTRACT_API`` is neither ``GENERIC`` nor ``LENDING``.
    """
    log_level = os.environ.get('LOG_LEVEL', 'INFO')
    logger.setLevel(log_level)
    logger.info(json.dumps(event))
    logger.info(f"version: {__version__}\n \
        textractmanifest version: {tm.__version__}\n \
        boto3 version: {boto3.__version__}\n \
        textractcaller version: {tc.__version__}.")

    textract_api = os.environ.get('TEXTRACT_API', "GENERIC")
    s3_output_bucket = os.environ.get('S3_OUTPUT_BUCKET', None)
    if not s3_output_bucket:
        raise Exception("no S3_OUTPUT_BUCKET set")

    s3_output_prefix = os.environ.get('S3_OUTPUT_PREFIX', None)
    if not s3_output_prefix:
        raise Exception("no S3_OUTPUT_PREFIX set")

    logger.info(f"LOG_LEVEL: {log_level} \n \
        S3_OUTPUT_PREFIX: {s3_output_prefix} \n \
        S3_OUTPUT_BUCKET: {s3_output_bucket} \n \
        TEXTRACT_API: {textract_api} \
        ")

    # Fail fast on a misconfigured API name. Previously an unknown value
    # skipped both branches and surfaced later as "no JSON was generated".
    if textract_api not in ('GENERIC', 'LENDING'):
        raise ValueError(
            f"unsupported TEXTRACT_API: {textract_api!r} (expected 'GENERIC' or 'LENDING')"
        )

    manifest = tm.IDPManifestSchema().load(event['manifest'])  # type: ignore
    output_location = urlparse(
        event['textract_result']['TextractTempOutputJsonPath'])
    job_id = os.path.basename(output_location.path)
    output_config = tc.OutputConfig(
        s3_bucket=output_location.netloc,
        s3_prefix=os.path.dirname(output_location.path))

    start_time = round(time.time() * 1000)
    if textract_api == 'GENERIC':
        full_json = tc.get_full_json_from_output_config(
            output_config=output_config, job_id=job_id, s3_client=s3)
    else:
        full_json = tc.get_full_json_lending_from_output_config(
            output_config=output_config,
            job_id=job_id,
            s3_client=s3,
            subfolder="detailedResponse")

    # Indexing instead of unpacking into `_` avoids rebinding the context arg.
    s3_filename = os.path.splitext(os.path.basename(manifest.s3_path))[0]

    call_duration = round(time.time() * 1000) - start_time
    logger.info(f"textract_async_to_json_call_duration_in_ms: {call_duration}")
    output_bucket_key = os.path.join(s3_output_prefix, str(uuid.uuid4()),
                                     s3_filename + ".json")

    logger.info("before saving to S3")
    if not full_json:
        raise Exception("no JSON was generated")
    s3.put_object(Body=json.dumps(full_json, indent=4).encode('UTF-8'),
                  Bucket=s3_output_bucket,
                  Key=output_bucket_key)
    logger.info("after saving to S3")

    event["textract_result"][
        "TextractOutputJsonPath"] = f"s3://{s3_output_bucket}/{output_bucket_key}"

    return event
"numberOfPages": 488, 8 | "textract_result": { 9 | "TextractTempOutputJsonPath": "s3://simplesearchpdf-textractsimpleasyncworkflow2d7d5b-j392r3sveake/textract-temp-output/1a016bc3203ac32c0b8c5f587cc58949f2abb0f8612057168dda877a351eff3a" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /lambda/async_to_json/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for sam-app 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | MemorySize: 10240 12 | 13 | Resources: 14 | SyncFunction: 15 | Type: AWS::Serverless::Function 16 | Properties: 17 | PackageType: Image 18 | Architectures: 19 | - x86_64 20 | Environment: 21 | Variables: 22 | S3_OUTPUT_PREFIX: textract-output 23 | S3_OUTPUT_BUCKET: simplesearchpdf-textractsimpleasyncworkflow2d7d5b-j392r3sveake 24 | LOG_LEVEL: DEBUG 25 | Metadata: 26 | Dockerfile: Dockerfile 27 | DockerContext: . 
28 | DockerTag: python3.9-v1 29 | 30 | -------------------------------------------------------------------------------- /lambda/async_to_json/test_sam_local.sh: -------------------------------------------------------------------------------- 1 | sam build 2 | sam local invoke -e events/event.json -n env.json 3 | -------------------------------------------------------------------------------- /lambda/cfn_custom_configurator_prefill/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9-x86_64 2 | 3 | RUN /var/lang/bin/python -m pip install --upgrade pip 4 | 5 | # Copy function code 6 | COPY app/* ${LAMBDA_TASK_ROOT}/ 7 | 8 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 9 | CMD [ "main.lambda_handler" ] 10 | -------------------------------------------------------------------------------- /lambda/cfn_custom_configurator_prefill/app/default_config.csv: -------------------------------------------------------------------------------- 1 | AWS_PAYSTUBS,"{""queriesConfig"": [{""alias"": ""PAYSTUB_PERIOD_START_DATE"", ""text"": ""What is the Pay Period Start Date?""}, {""alias"": ""PAYSTUB_PERIOD_END_DATE"", ""text"": ""What is the Pay Period End Date?""}, {""alias"": ""PAYSTUB_PERIOD_PAY_DATE"", ""text"": ""What is the Pay Date?""}, {""alias"": ""PAYSTUB_PERIOD_EMPLOYEE_NAME"", ""text"": ""What is the Employee Name?""}, {""alias"": ""PAYSTUB_PERIOD_COMPANY_NAME"", ""text"": ""What is the company Name?""}, {""alias"": ""PAYSTUB_PERIOD_CURRENT_GROSS_PAY"", ""text"": ""What is the Current Gross Pay?""}, {""alias"": ""PAYSTUB_PERIOD_YTD_GROSS_PAY"", ""text"": ""What is the YTD Gross Pay?""}, {""alias"": ""PAYSTUB_PERIOD_REGULAR_HOURLY_RATE"", ""text"": ""What is the regular hourly rate?""}, {""alias"": ""PAYSTUB_PERIOD_HOLIDAY_RATE"", ""text"": ""What is the holiday rate?""}], ""textractFeatures"": [""QUERIES""]}" 2 | 
AWS_W2,"{""queriesConfig"": [{""alias"": ""W2_FORM_YEAR"", ""text"": ""What is the form year?""}, {""alias"": ""W2_EMPLOYEE_SSN"", ""text"": ""What is the Employee SSN?""}, {""alias"": ""W2_EMPLOYER_NAME"", ""text"": ""What is the Employer Name?""}, {""alias"": ""W2_WAGES_TIPS_OTHER"", ""text"": ""What is wages, tips, other compensation amount?""}, {""alias"": ""W2_FEDERAL_INCOME_TAX"", ""text"": ""What is the Federal Income Tax withheld amount?""}, {""alias"": ""W2_SS_WAGES"", ""text"": ""What is the social security wages amount?""}, {""alias"": ""W2_SS_TAX"", ""text"": ""What is the social security Taxes withheld amount?""}, {""alias"": ""W2_12a_VALUE_TYPE"", ""text"": ""What is the value type in Box 12a?""}, {""alias"": ""W2_12a_VALUE_AMOUNT"", ""text"": ""What is the value amount in Box 12a?""}, {""alias"": ""W2_12b_VALUE_TYPE"", ""text"": ""What is the value type in Box 12b?""}, {""alias"": ""W2_12b_VALUE_AMOUNT"", ""text"": ""What is the value amount in Box 12b?""}, {""alias"": ""W2_12c_VALUE_TYPE"", ""text"": ""What is the value type in Box 12c?""}, {""alias"": ""W2_13_STATUTORY"", ""text"": ""Is Box 13 Statutory employee selected?""}, {""alias"": ""W2_13_RETIREMENT_PLAN"", ""text"": ""Is Box 13 Retirement plan selected?""}, {""alias"": ""W2_13_THIRD_PARTY_SICK_PAY"", ""text"": ""Is Box 13 Third-party sick pay selected?""}], ""textractFeatures"": [""FORMS"", ""QUERIES""]}" 3 | AWS_BANK_STATEMENTS,"{""queriesConfig"": [{""alias"": ""AWS_BANK_STATEMENTS_ACCOUNT_NAME"", ""text"": ""What is the Customer/Account Name?""}, {""alias"": ""AWS_BANK_STATEMENTS_BANK_NAME"", ""text"": ""What is the Bank Name?""}, {""alias"": ""AWS_BANK_STATEMENTS_ACCOUNT_NUMBER"", ""text"": ""What is the Account Number?""}, {""alias"": ""AWS_BANK_STATEMENTS_ACCOUNT_TYPE"", ""text"": ""What is the Account Type?""}, {""alias"": ""AWS_BANK_STATEMENTS_BEGINNING_BALANCE"", ""text"": ""What is the Beginning Balance?""}, {""alias"": ""AWS_BANK_STATEMENTS_TOTAL_DEPOSITS"", ""text"": ""What is 
"""Generate default_config.csv: one row per document type, holding the
document type name and its serialized IDPManifest."""
import csv
from typing import List

import textractmanifest as tm


def _build_manifest(query_list: List[List[str]], textract_features: List[str]):
    """Build an IDPManifest from ``[alias, text]`` pairs and a feature list."""
    queries = [tm.Query(alias=alias, text=text) for alias, text in query_list]
    return tm.IDPManifest(queries_config=queries,
                          textract_features=textract_features)


# AWS_PAYSTUBS
paystub_query_list: List[List[str]] = [
    ["PAYSTUB_PERIOD_START_DATE", "What is the Pay Period Start Date?"],
    ["PAYSTUB_PERIOD_END_DATE", "What is the Pay Period End Date?"],
    ["PAYSTUB_PERIOD_PAY_DATE", "What is the Pay Date?"],
    ["PAYSTUB_PERIOD_EMPLOYEE_NAME", "What is the Employee Name?"],
    ["PAYSTUB_PERIOD_COMPANY_NAME", "What is the company Name?"],
    ["PAYSTUB_PERIOD_CURRENT_GROSS_PAY", "What is the Current Gross Pay?"],
    ["PAYSTUB_PERIOD_YTD_GROSS_PAY", "What is the YTD Gross Pay?"],
    ["PAYSTUB_PERIOD_REGULAR_HOURLY_RATE", "What is the regular hourly rate?"],
    ["PAYSTUB_PERIOD_HOLIDAY_RATE", "What is the holiday rate?"]
]
paystub_manifest = _build_manifest(paystub_query_list, ['QUERIES'])

# AWS_W2
# NOTE(review): the checked-in default_config.csv has no W2_FORM_TYPE query;
# confirm whether the CSV needs regenerating or this entry should be dropped.
w2_query_list: List[List[str]] = [
    ['W2_FORM_YEAR', 'What is the form year?'],
    ['W2_FORM_TYPE', 'What is the form type?'],
    ['W2_EMPLOYEE_SSN', 'What is the Employee SSN?'],
    ['W2_EMPLOYER_NAME', 'What is the Employer Name?'],
    ['W2_WAGES_TIPS_OTHER', 'What is wages, tips, other compensation amount?'],
    [
        'W2_FEDERAL_INCOME_TAX',
        'What is the Federal Income Tax withheld amount?'
    ],
    ['W2_SS_WAGES', 'What is the social security wages amount?'],
    ['W2_SS_TAX', 'What is the social security Taxes withheld amount?'],
    ['W2_12a_VALUE_TYPE', 'What is the value type in Box 12a?'],
    ['W2_12a_VALUE_AMOUNT', 'What is the value amount in Box 12a?'],
    ['W2_12b_VALUE_TYPE', 'What is the value type in Box 12b?'],
    ['W2_12b_VALUE_AMOUNT', 'What is the value amount in Box 12b?'],
    ['W2_12c_VALUE_TYPE', 'What is the value type in Box 12c?'],
    ['W2_13_STATUTORY', 'Is Box 13 Statutory employee selected?'],
    ['W2_13_RETIREMENT_PLAN', 'Is Box 13 Retirement plan selected?'],
    ['W2_13_THIRD_PARTY_SICK_PAY', 'Is Box 13 Third-party sick pay selected?'],
]
w2_manifest = _build_manifest(w2_query_list, ['FORMS', 'QUERIES'])

# AWS_BANK_STATEMENTS
bank_statements_list: List[List[str]] = [
    ['AWS_BANK_STATEMENTS_ACCOUNT_NAME', 'What is the Customer/Account Name?'],
    ['AWS_BANK_STATEMENTS_BANK_NAME', 'What is the Bank Name?'],
    ['AWS_BANK_STATEMENTS_ACCOUNT_NUMBER', 'What is the Account Number?'],
    ['AWS_BANK_STATEMENTS_ACCOUNT_TYPE', 'What is the Account Type?'],
    [
        'AWS_BANK_STATEMENTS_BEGINNING_BALANCE',
        'What is the Beginning Balance?'
    ],
    ['AWS_BANK_STATEMENTS_TOTAL_DEPOSITS', 'What is the Total Deposits?'],
    ['AWS_BANK_STATEMENTS_TOTAL_WITHDRAWALS', 'What is the Total Withdrawal?'],
    [
        'AWS_BANK_STATEMENTS_ENDING_BALANCE',
        'What is the Ending balance?',
    ],
    [
        'AWS_BANK_STATEMENTS_AVERAGE_BALANCE',
        'What is the Average balance?',
    ],
]
bank_statements_manifest = _build_manifest(bank_statements_list, ['QUERIES'])

# WRITE default_config.csv
# newline='' lets the csv module control line endings itself, as its
# documentation requires for files handed to csv.writer.
with open("default_config.csv", 'w', newline='') as output_csv:
    csv_writer = csv.writer(output_csv, quoting=csv.QUOTE_MINIMAL)
    schema = tm.IDPManifestSchema()
    for document_type, manifest in (
        ('AWS_PAYSTUBS', paystub_manifest),
        ('AWS_W2', w2_manifest),
        ('AWS_BANK_STATEMENTS', bank_statements_manifest),
    ):
        csv_writer.writerow([document_type, schema.dumps(manifest)])
def on_event(event, context):
    """Alternative entry point that dispatches on ``event['RequestType']``.

    Accepts the exact values ``Create``, ``Update`` and ``Delete`` and forwards
    to the matching handler with the arguments that handler requires.
    (The Dockerfile's CMD points at ``lambda_handler``, not at this function.)

    Raises:
        ValueError: On ``Create`` when ``CONFIGURATION_TABLE`` is not set.
        Exception: If the request type is not one of the three above.
    """
    print(f"event: {event}")
    print(f"context: {context}")
    request_type = event['RequestType']
    if request_type == 'Create':
        table_name = os.environ.get('CONFIGURATION_TABLE', '')
        if not table_name:
            raise ValueError('no CONFIGURATION_TABLE defined')
        return on_create(event, context, table_name)
    if request_type == 'Update':
        return on_update(event, context)
    if request_type == 'Delete':
        return on_delete(event, context)
    raise Exception("Invalid request type: %s" % request_type)


def put_item(table, document_type: str, manifest: str):
    """Write one ``DOCUMENT_TYPE`` / ``CONFIG`` item to ``table``."""
    ddb_response = table.put_item(Item={
        "DOCUMENT_TYPE": document_type,
        "CONFIG": manifest
    })
    logger.debug(ddb_response)


def on_create(event, context, table_name):
    """Load every row of ``default_config.csv`` into the DynamoDB table.

    The file is opened relative to the current working directory. Column 0 is
    stored as the document type and column 1 as its configuration string.

    Returns:
        ``{'PhysicalResourceId': 'initConfiguration'}``.
    """
    logger.info(f"event: {event}\ncontext: {context}")
    physical_id = 'initConfiguration'
    table = dynamodb.Table(table_name)
    # newline='' is what the csv module asks for when reading files.
    with open('default_config.csv', newline='') as default_config_file:
        for row in csv.reader(default_config_file):
            if len(row) < 2:
                # Blank or truncated line: nothing to store.
                continue
            put_item(table, row[0], row[1])
    return {'PhysicalResourceId': physical_id}


def on_update(event, context):
    """Print the update request; no table change is made. Returns None."""
    physical_id = event["PhysicalResourceId"]
    props = event["ResourceProperties"]
    # Both placeholders need a value; the original passed only one and raised
    # TypeError on every update.
    print("update resource %s with props %s" % (physical_id, props))


def on_delete(event, context):
    """Print the delete request; no table change is made. Returns None."""
    physical_id = event["PhysicalResourceId"]
    print("delete resource %s" % physical_id)


def lambda_handler(event, context):
    """Lambda entry point: dispatch a request by its type.

    ``event['RequestType']`` is compared case-insensitively against
    ``create`` / ``update`` / ``delete``. The target table name comes from the
    ``CONFIGURATION_TABLE`` environment variable.

    Raises:
        ValueError: If ``CONFIGURATION_TABLE`` is not set.
        Exception: If the request type is not recognised.
    """
    log_level = os.environ.get('LOG_LEVEL', 'INFO')
    logger.setLevel(log_level)
    logger.info(json.dumps(event))
    logger.debug(f"version: {__version__}")
    logger.debug(f"boto3 version: {boto3.__version__}")
    logger.info(event)
    configuration_table = os.environ.get('CONFIGURATION_TABLE', '')
    logger.info(f'CONFIGURATION_TABLE: {configuration_table}')
    if not configuration_table:
        raise ValueError('no CONFIGURATION_TABLE defined')
    request_type = event['RequestType'].lower()
    if request_type == 'create':
        return on_create(event=event,
                         context=context,
                         table_name=configuration_table)
    if request_type == 'update':
        return on_update(event=event, context=context)
    if request_type == 'delete':
        return on_delete(event=event, context=context)
    raise Exception(f'Invalid request type: {request_type}')
"AQCUAAAAKgAAAAMAAAAAAAAAAScySGVHlgFmBDVYD6a8ltN6T9sIC2E2c+pzSZMimkamYs7yQqUgz3QBGHVmaWxPE19esi6AhnOy9VAoyLPlY8Mp41xuEOE790EpAnGOpD/dytQLwUiYvRrQ3xc39l4=3OtkLO6CRweuCEt3aL8aLSYaOaBdxzIUP5TXmZcAqZkFrW7v8hrHCmJnfVqyqQnWH+gDmhCG6chzGSt+2TkPmiT7Pzol6bkvzWYjpuytO9drcJMJne58Vd6oLqQ3jUxqd9t01WeTndbHwyV6wE1vp1AuQuSL/3I6PpuQpuoMvJOd82QlQ9rpR8DaY4T4FIH50fyHF6mMARsteDcTzNSuriyKRIFjPhYFNw5MnGPbc5eYvHIxikRzq+fiYCr6bYlXjn5XrG3uz8hYz48EWcHScdNvc3IraqVwf+oXsGO/74x0E+a6StVrP1DmCk0Siof8hCPSehdomEt0+ivXW0mD+GcsfjOOnlDIZ9E2EYrNjl1aCMe7225AgwsqFb48cbh5ekHQMtZhy3VUPhmaygyqMTujGT7kU4JlFRumtPs9D6FW+NkNiE6VjCCNz4Q0/6uH3jtwRkEon4ozbnxkJLQsXujSA52Iig7/0H9vL7ZNe3oeKFIyD0OGQeryetRwkC68xKtbZc626ZLEhXRRSR0Z", 3 | "Payload": { 4 | "manifest": { 5 | "S3Path": "s3://idp-stack-python-sample-schademcdkidpstackpython-pha5ty3i0jbi/uploads/98.blabla" 6 | }, 7 | "mime": "image/jpeg", 8 | "classification": null, 9 | "numberOfPages": 1 10 | }, 11 | "ExecutionId": "arn:aws:states:us-east-1:913165245630:execution:IDPWorkflowPython85B937F9-Pnw2MyHSOXR8:1992869c-cc18-4b3c-91ca-d15fcebdcc14" 12 | } 13 | -------------------------------------------------------------------------------- /lambda/cfn_custom_configurator_prefill/events/simple-event.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest": { 3 | "S3Path": "s3://sdx-textract-us-east-1/employeeapp20210510.png" 4 | }, 5 | "mime": "image/png", 6 | "numberOfPages": 1 7 | } 8 | -------------------------------------------------------------------------------- /lambda/cfn_custom_configurator_prefill/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for sam-app 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | PutOnSQSFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: 
Image 17 | Architectures: 18 | - x86_64 19 | Environment: 20 | Variables: 21 | SQS_QUEUE_URL: https://sqs.us-east-1.amazonaws.com/913165245630/testqueue 22 | LOG_LEVEL: DEBUG 23 | Metadata: 24 | Dockerfile: Dockerfile 25 | DockerContext: . 26 | DockerTag: python3.9-v1 27 | 28 | -------------------------------------------------------------------------------- /lambda/cfn_custom_configurator_prefill/test_sam_local.sh: -------------------------------------------------------------------------------- 1 | sam build 2 | sam local invoke -e events/event.json -n env.json 3 | -------------------------------------------------------------------------------- /lambda/classification_spacy/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9-x86_64 2 | 3 | RUN /var/lang/bin/python -m pip install --upgrade pip 4 | # RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" 5 | # RUN yum -y install unzip 6 | # RUN unzip awscliv2.zip 7 | # RUN ./aws/install 8 | # RUN aws s3 cp s3://amazon-textract-public-content/constructs/en_textcat_demo-0.0.0.tar.gz . 
9 | 10 | COPY en_textcat_demo-0.0.0.tar.gz ${LAMBDA_TASK_ROOT}/ 11 | 12 | RUN python -m pip install amazon-textract-idp-cdk-manifest ${LAMBDA_TASK_ROOT}/en_textcat_demo-0.0.0.tar.gz --target "${LAMBDA_TASK_ROOT}" 13 | 14 | # Copy function code 15 | COPY app/* ${LAMBDA_TASK_ROOT}/ 16 | 17 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 18 | CMD [ "sync_main.lambda_handler" ] 19 | -------------------------------------------------------------------------------- /lambda/classification_spacy/app/entry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ -z "${AWS_LAMBDA_RUNTIME_API}" ]; then 3 | exec /usr/bin/aws-lambda-rie /usr/local/bin/python -m awslambdaric $1 4 | else 5 | exec /usr/local/bin/python -m awslambdaric $1 6 | fi 7 | -------------------------------------------------------------------------------- /lambda/classification_spacy/app/requirements.txt: -------------------------------------------------------------------------------- 1 | amazon-textract-caller 2 | -------------------------------------------------------------------------------- /lambda/classification_spacy/en_textcat_demo-0.0.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-idp-cdk-constructs/a681ae7847812529525bb41aa39de5fe105d3aab/lambda/classification_spacy/en_textcat_demo-0.0.0.tar.gz -------------------------------------------------------------------------------- /lambda/classification_spacy/env.json: -------------------------------------------------------------------------------- 1 | { 2 | "SpacyFunction": { 3 | "LOG_LEVEL": "DEBUG" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /lambda/classification_spacy/events/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "Token": 
"AQCIAAAAKgAAAAMAAAAAAAAAAe1qOReYPEUtzfwSG2UPbC+eb6CHB9/DsZ0HFTlCLT2MPRL/CiqnGT4vBFQVqZzJ1fNzXrI5ndI5ztDrbcCc4htMCJHvuHPXFNkQEF0/+SjC/GRbUox6xj7PzS/dzhHfmarzzsWTv79SMPHJdlFd/AqCrXNxS2jz8z7LC0SzTN7Mj7eoQzHZk69tlfl/yYGkFjkWjqQurRJiCVzOorV1C+x+9bvK+nsgxUzluymWhh23AIP69T+eg3udJicOyFbVCpPONDFoOani6g1EhjpqowUzSoSBWeBimaANi4Lyz0MF+rjpQ7o2W+0M5Hk2TE4CwaZKmFUdyapbQyMSmlZvDlzmJNft/V18QKUKMIf3V2EydwQJYjArSznUCINJjgM/0NnqvwD3KetUCClsZmr2+v++Bv9i9A/G/h3v3WemBwVQlt9P28DwAXRhSH07KnQ4DBoR1UglDxhKF2AqHfCX8BJJybKLpFGF9ZJ9UW13k6C+M2cfwvLSUVWlK3Z4GhomCqOERT6ijDsyMHgJ8VoKHgdZnGMBUjgamUnPJw06jkUBPHdHz8PFmIdzo0s+0SBS5m6O8F8v+GVxqmu7N21q/ckl3tvE2Gxuna65V8s4307wn0MFlDSlomoH/4UitD1Z2OJ1dZ7j", 3 | "Payload": { 4 | "manifest": { 5 | "s3Path": "s3://paystubandw2-s3paystubw2e6324d42-jxb5y4ugc9o5/uploads/W2_5_reMars.png" 6 | }, 7 | "mime": "image/png", 8 | "classification": null, 9 | "numberOfPages": 1, 10 | "Random": { 11 | "randomNumber": 54 12 | }, 13 | "textract_result": { 14 | "TextractTempOutputJsonPath": "s3://paystubandw2-s3paystubw2e6324d42-jxb5y4ugc9o5/textract-temp-output/527320fd13b25c2a7c8a65ead65f23733befe028f369b44c17c60791d43808e6", 15 | "TextractOutputJsonPath": "s3://paystubandw2-s3paystubw2e6324d42-jxb5y4ugc9o5/textract-output/W2_5_reMars2022-07-06T19:46:33.419788/W2_5_reMars.json" 16 | }, 17 | "txt_output_location": { 18 | "TextractOutputCSVPath": "s3://paystubandw2-s3paystubw2e6324d42-jxb5y4ugc9o5/txt_output/2022-07-06T19:46:34+00:00/W2_5_reMars.txt" 19 | } 20 | }, 21 | "ExecutionId": "arn:aws:states:us-east-1:913165245630:execution:PaystubW2ABF3B728-NyjDY8QtG7Qc:W2_5_reMarspng2022-07-06T194617676172" 22 | } 23 | -------------------------------------------------------------------------------- /lambda/classification_spacy/events/simple-event.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest": { 3 | "S3Path": "s3://sdx-textract-us-east-1/employeeapp20210510.png" 4 | }, 5 | "mime": "image/png", 6 | "numberOfPages": 1 7 | 
} 8 | -------------------------------------------------------------------------------- /lambda/classification_spacy/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for sam-app 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | SpacyFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: Image 17 | Architectures: 18 | - x86_64 19 | Environment: 20 | Variables: 21 | LOG_LEVEL: DEBUG 22 | Metadata: 23 | Dockerfile: Dockerfile 24 | DockerContext: . 25 | DockerTag: python3.9-v1 26 | 27 | -------------------------------------------------------------------------------- /lambda/classification_spacy/test_sam_local.sh: -------------------------------------------------------------------------------- 1 | sam build 2 | sam local invoke -e events/event.json -n env.json 3 | -------------------------------------------------------------------------------- /lambda/classification_spacy_image/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/j4z6l8i6/amazon-textract-idp-cdk-construct-classification-spacy:latest 2 | 3 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 4 | CMD [ "sync_main.lambda_handler" ] 5 | -------------------------------------------------------------------------------- /lambda/classification_spacy_image/env.json: -------------------------------------------------------------------------------- 1 | { 2 | "SpacyFunction": { 3 | "LOG_LEVEL": "DEBUG" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /lambda/classification_spacy_image/events/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "Token": 
"AQCIAAAAKgAAAAMAAAAAAAAAAe1qOReYPEUtzfwSG2UPbC+eb6CHB9/DsZ0HFTlCLT2MPRL/CiqnGT4vBFQVqZzJ1fNzXrI5ndI5ztDrbcCc4htMCJHvuHPXFNkQEF0/+SjC/GRbUox6xj7PzS/dzhHfmarzzsWTv79SMPHJdlFd/AqCrXNxS2jz8z7LC0SzTN7Mj7eoQzHZk69tlfl/yYGkFjkWjqQurRJiCVzOorV1C+x+9bvK+nsgxUzluymWhh23AIP69T+eg3udJicOyFbVCpPONDFoOani6g1EhjpqowUzSoSBWeBimaANi4Lyz0MF+rjpQ7o2W+0M5Hk2TE4CwaZKmFUdyapbQyMSmlZvDlzmJNft/V18QKUKMIf3V2EydwQJYjArSznUCINJjgM/0NnqvwD3KetUCClsZmr2+v++Bv9i9A/G/h3v3WemBwVQlt9P28DwAXRhSH07KnQ4DBoR1UglDxhKF2AqHfCX8BJJybKLpFGF9ZJ9UW13k6C+M2cfwvLSUVWlK3Z4GhomCqOERT6ijDsyMHgJ8VoKHgdZnGMBUjgamUnPJw06jkUBPHdHz8PFmIdzo0s+0SBS5m6O8F8v+GVxqmu7N21q/ckl3tvE2Gxuna65V8s4307wn0MFlDSlomoH/4UitD1Z2OJ1dZ7j", 3 | "Payload": { 4 | "manifest": { 5 | "s3Path": "s3://paystubandw2-s3paystubw2e6324d42-jxb5y4ugc9o5/uploads/W2_5_reMars.png" 6 | }, 7 | "mime": "image/png", 8 | "classification": null, 9 | "numberOfPages": 1, 10 | "Random": { 11 | "randomNumber": 54 12 | }, 13 | "textract_result": { 14 | "TextractTempOutputJsonPath": "s3://paystubandw2-s3paystubw2e6324d42-jxb5y4ugc9o5/textract-temp-output/527320fd13b25c2a7c8a65ead65f23733befe028f369b44c17c60791d43808e6", 15 | "TextractOutputJsonPath": "s3://paystubandw2-s3paystubw2e6324d42-jxb5y4ugc9o5/textract-output/W2_5_reMars2022-07-06T19:46:33.419788/W2_5_reMars.json" 16 | }, 17 | "txt_output_location": { 18 | "TextractOutputCSVPath": "s3://paystubandw2-s3paystubw2e6324d42-jxb5y4ugc9o5/txt_output/2022-07-06T19:46:34+00:00/W2_5_reMars.txt" 19 | } 20 | }, 21 | "ExecutionId": "arn:aws:states:us-east-1:913165245630:execution:PaystubW2ABF3B728-NyjDY8QtG7Qc:W2_5_reMarspng2022-07-06T194617676172" 22 | } 23 | -------------------------------------------------------------------------------- /lambda/classification_spacy_image/events/simple-event.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest": { 3 | "S3Path": "s3://sdx-textract-us-east-1/employeeapp20210510.png" 4 | }, 5 | "mime": "image/png", 6 | "numberOfPages": 
1 7 | } 8 | -------------------------------------------------------------------------------- /lambda/classification_spacy_image/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for sam-app 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | SpacyFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: Image 17 | Architectures: 18 | - x86_64 19 | Environment: 20 | Variables: 21 | LOG_LEVEL: DEBUG 22 | Metadata: 23 | Dockerfile: Dockerfile 24 | DockerContext: . 25 | DockerTag: python3.9-v1 26 | 27 | -------------------------------------------------------------------------------- /lambda/classification_spacy_image/test_sam_local.sh: -------------------------------------------------------------------------------- 1 | sam build 2 | sam local invoke -e events/event.json -n env.json 3 | -------------------------------------------------------------------------------- /lambda/comprehend_sync/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9-x86_64 2 | 3 | RUN /var/lang/bin/python -m pip install --upgrade pip 4 | RUN python -m pip install amazon-textract-idp-cdk-manifest marshmallow --target "${LAMBDA_TASK_ROOT}" 5 | 6 | # Copy function code 7 | COPY app/* ${LAMBDA_TASK_ROOT}/ 8 | 9 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 10 | CMD [ "sync_main.lambda_handler" ] 11 | -------------------------------------------------------------------------------- /lambda/comprehend_sync/app/entry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ -z "${AWS_LAMBDA_RUNTIME_API}" ]; then 3 | exec /usr/bin/aws-lambda-rie /usr/local/bin/python -m awslambdaric 
$1 4 | else 5 | exec /usr/local/bin/python -m awslambdaric $1 6 | fi 7 | -------------------------------------------------------------------------------- /lambda/comprehend_sync/app/requirements.txt: -------------------------------------------------------------------------------- 1 | amazon-textract-caller 2 | -------------------------------------------------------------------------------- /lambda/comprehend_sync/env.json: -------------------------------------------------------------------------------- 1 | { 2 | "HelloWorldFunction": { 3 | "S3_OUTPUT_PREFIX": "textract-output", 4 | "S3_OUTPUT_BUCKET": "schademcdkstackpaystubst-schademcdkidpstackpython-1fvi0dqoz24lj", 5 | "SQS_QUEUE_URL": "SchademCdkStackPaystubStack-textractsynctaskSyncRequestsBC26E72B-kID9dZUtmCZM" 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /lambda/comprehend_sync/events/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "Records": [{ 3 | "messageId": "0a4d510e-c7ee-4ad4-95ca-991bd68f7094", 4 | "receiptHandle": "AQEBFAexgmSAITHX4YH6um/VgEB16q/pbqq6QY5yuOesYk6ieCpuWK1NAu4IQR8/r6gulmKreKJ5YZF940MGOvVkRYYfBNakTg7+cm5SOJzyxtIGYotZQrmj/YteauCsu+dLiWh4+4x0/qv71zbC3BMb5ed7WWf12IqgM7kuRPcO5ImPsR5x9XNfuQVAs0AlPBTsYADRxYyTT8dBWRCpKPjOSofxTk6z5zBFanax/cUEu2kl790Bor5HjkeCbfIeovA75g6QRf0llAk3YVzG1hS/cJqqdF51uSDWOV87/iB3C4Ef8dA6O9xmNkXroXgnTtLQBGme7nOnJ2HDvGjw09ZonvzChWmYHdFIC9YNUjV6sUsVFhG1GvfJ++31zMxbB9zFkogG7cH95wSrJSscjtMtb2uUeSHW33e+b5FprYa+YzFJd73fCNlFA+0er3PBmhmXi6dAqNOezIlAfu97YAqptnwm4Tn54l1JkLp1bt6ryOk=", 5 | "body": "{\"Token\": 
\"AQCYAAAAKgAAAAMAAAAAAAAAAcCPkWoecuOS78QC0lLFx2yIR7vdIN/AfqV25jZoUWxPX5eJFrJ83w03ml16IYFZ7YiPh7WP3PZQcHI+b/LVU4L/1FYj5X3E4zFwPjuc0ZLc2XrwgaKAd3tyPyAaLV9WN2EBxlwa77MZlebDhjznzDLGru5DUDVekJpb/wyRajbqgJ2FC4SNg9Uw3NgRHLM31Pm81VxmIpq3QMRx5h5OaRuTNtrrAjWlsfOebkx5r1wqRgAR9r5Y4WtHUBOQQM2p8iNpHoXTXGLrYX3EXJh8bMCS8/QZejdMR6cttvV4s014p3QBvp3e8PcxDsFvFp80wyMM9kJj+T15yJi93r/scXDwGt3OJ5ADRhKqh5OIO1UIl9dhBJNR8+FCGOtd3B8/GpdF0kur8Aqhe+iWnM+W/B1nAtKVSz3L20DkhOSzZUZ9esmUYai31PS7MwxDZqHR7v4qKPSu5MVJpedCVoiBlUYLTwW3tl26e2iZGHP0tTsNJADSFZwB9b1H6Gr7j7Uhih3zOtlexC6HsbK7vvuPshSIySf0Dy1bKvLzDMSeJrMI6B8MW+TLqNP9JmjelMZGjpwTI7Q0Pf5NvKie7br5JrPSRFqEZuX1cVPJW/mgOyHCteE52KPaeMDiH9ZxZTJQxa4XpImH71kmjt1A8muXmbGw\", \"Payload\": {\"manifest\": {\"s3Path\": \"s3://schademcdkstackpaystubst-schademcdkidpstackpython-1fvi0dqoz24lj/uploads/w2-example.png\"}}, \"ExecutionId\": \"arn:aws:states:us-east-1:913165245630:execution:PaystubWorkflowPython6780DE1E-MbkSvRTQL8tZ:9f19f384-91cc-4c9c-ba0b-dfb987349df4\"}", 6 | "attributes": { 7 | "ApproximateReceiveCount": "7", 8 | "SentTimestamp": "1654192484978", 9 | "SenderId": "AROA5JHHD3S7C7EVBI2MB:SchademCdkStackPaystubSta-textractsynctaskPutOnSQS-pweSVBLajnrH", 10 | "ApproximateFirstReceiveTimestamp": "1654192484978" 11 | }, 12 | "messageAttributes": {}, 13 | "md5OfBody": "7b73daf59eafd8a4dea1945bae6914aa", 14 | "eventSource": "aws:sqs", 15 | "eventSourceARN": "arn:aws:sqs:us-east-1:913165245630:SchademCdkStackPaystubStack-textractsynctaskSyncRequestsBC26E72B-kID9dZUtmCZM", 16 | "awsRegion": "us-east-1" 17 | }] 18 | } 19 | -------------------------------------------------------------------------------- /lambda/comprehend_sync/events/simple-event.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest": { 3 | "S3Path": "s3://sdx-textract-us-east-1/employeeapp20210510.png" 4 | }, 5 | "mime": "image/png", 6 | "numberOfPages": 1 7 | } 8 | 
-------------------------------------------------------------------------------- /lambda/comprehend_sync/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for sam-app 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | SyncFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: Image 17 | Architectures: 18 | - x86_64 19 | Environment: 20 | Variables: 21 | S3_OUTPUT_PREFIX: textract-output 22 | S3_OUTPUT_BUCKET: schademcdkstackpaystubst-schademcdkidpstackpython-1fvi0dqoz24lj 23 | SQS_QUEUE_URL: SchademCdkStackPaystubStack-textractsynctaskSyncRequestsBC26E72B-kID9dZUtmCZM 24 | Metadata: 25 | Dockerfile: Dockerfile 26 | DockerContext: . 27 | DockerTag: python3.9-v1 28 | 29 | -------------------------------------------------------------------------------- /lambda/comprehend_sync/test_sam_local.sh: -------------------------------------------------------------------------------- 1 | sam build 2 | sam local invoke -e events/event.json -n env.json 3 | -------------------------------------------------------------------------------- /lambda/configurator/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9-x86_64 2 | RUN /var/lang/bin/python -m pip install --upgrade pip 3 | RUN python -m pip install schadem-tidp-manifest marshmallow --target "${LAMBDA_TASK_ROOT}" 4 | 5 | # Copy function code 6 | COPY app/* ${LAMBDA_TASK_ROOT}/ 7 | 8 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 9 | CMD [ "main.lambda_handler" ] 10 | -------------------------------------------------------------------------------- /lambda/configurator/app/main.py: 
def lambda_handler(event, _):
    """Merge the stored configuration for the event's document type into its manifest.

    Reads event['classification']['documentType'], looks up the item keyed by
    DOCUMENT_TYPE in the table named by the CONFIGURATION_TABLE environment
    variable and, when the item has a CONFIG attribute, merges that manifest
    into event['manifest'] and sets event['numberOfQueries'] to the number of
    configured queries (only when there are any).

    Returns:
        The same `event` dict, updated in place when a configuration was found.

    Raises:
        ValueError: if the event carries no classification.documentType, or
            if CONFIGURATION_TABLE is not defined.
    """
    log_level = os.environ.get('LOG_LEVEL', 'INFO')
    table_name = os.environ.get('CONFIGURATION_TABLE')

    logger.setLevel(log_level)
    logger.info(f"version: {version}")
    logger.info(f"amazon-textract-idp-cdk-manifest version: {tm.__version__}")
    logger.info(f"table_name: {table_name}")
    logger.info(json.dumps(event))

    # 'classification' may be present but null; a plain `in` test on None
    # raised TypeError instead of the intended ValueError.
    classification = event.get('classification')
    if not isinstance(classification,
                      dict) or 'documentType' not in classification:
        raise ValueError(
            f'no [classification][documentType] given in event: {event}')
    document_type = classification['documentType']
    logger.debug(f"document_type: {document_type}")

    # Fail with a clear message rather than handing None to dynamodb.Table.
    if not table_name:
        raise ValueError('no CONFIGURATION_TABLE defined')
    table = dynamodb.Table(table_name)

    ddb_response = table.get_item(Key={"DOCUMENT_TYPE": document_type})
    logger.debug(f"ddb_response: {ddb_response}")
    input_manifest: tm.IDPManifest = tm.IDPManifestSchema().load(
        event['manifest'])  #type: ignore

    item = ddb_response.get('Item') if 'Item' in ddb_response else None
    if item is not None and 'CONFIG' in item:
        configuration_manifest: tm.IDPManifest = tm.IDPManifestSchema().loads(
            item['CONFIG'])  #type: ignore
        input_manifest.merge(configuration_manifest)
        if configuration_manifest and configuration_manifest.queries_config:
            event['numberOfQueries'] = len(
                configuration_manifest.queries_config)
        logger.debug(f"merged manifest: {input_manifest}")
        event['manifest'] = tm.IDPManifestSchema().dump(input_manifest)
    else:
        logger.warning("no config found")
    return event
. 25 | DockerTag: python3.9-v1 26 | 27 | -------------------------------------------------------------------------------- /lambda/configurator/test_sam_local.sh: -------------------------------------------------------------------------------- 1 | sam build 2 | sam local invoke -e events/event.json -n env.json 3 | -------------------------------------------------------------------------------- /lambda/csv_to_aurora/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9-x86_64 2 | RUN /var/lang/bin/python -m pip install --upgrade pip 3 | 4 | # Copy function code 5 | COPY app/* ${LAMBDA_TASK_ROOT}/ 6 | 7 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 8 | CMD [ "main.lambda_handler" ] 9 | -------------------------------------------------------------------------------- /lambda/csv_to_aurora/app/entry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ -z "${AWS_LAMBDA_RUNTIME_API}" ]; then 3 | exec /usr/bin/aws-lambda-rie /usr/local/bin/python -m awslambdaric $1 4 | else 5 | exec /usr/local/bin/python -m awslambdaric $1 6 | fi 7 | -------------------------------------------------------------------------------- /lambda/csv_to_aurora/app/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-idp-cdk-constructs/a681ae7847812529525bb41aa39de5fe105d3aab/lambda/csv_to_aurora/app/requirements.txt -------------------------------------------------------------------------------- /lambda/csv_to_aurora/env.json: -------------------------------------------------------------------------------- 1 | { 2 | "HelloWorldFunction": { 3 | "SECRET_ARN": "somearn", 4 | "CLUSTER_ARN": "somearn", 5 | "LOG_LEVEL": "DEBUG", 6 | "SQS_QUEUE_URL": "somequeue" 7 | } 8 | } 9 | 
-------------------------------------------------------------------------------- /lambda/csv_to_aurora/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for sam-app 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | SyncFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: Image 17 | Architectures: 18 | - x86_64 19 | Environment: 20 | Variables: 21 | SECRET_ARN: "somearn" 22 | CLUSTER_ARN: "somearn" 23 | LOG_LEVEL: "DEBUG" 24 | SQS_QUEUE_URL: "somequeue" 25 | Metadata: 26 | Dockerfile: Dockerfile 27 | DockerContext: . 28 | DockerTag: python3.9-v1 29 | 30 | -------------------------------------------------------------------------------- /lambda/csv_to_aurora/test_sam_local.sh: -------------------------------------------------------------------------------- 1 | sam build 2 | sam local invoke -e events/event.json -n env.json 3 | -------------------------------------------------------------------------------- /lambda/decider/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9-x86_64 2 | RUN /var/lang/bin/python -m pip install --upgrade pip 3 | RUN python -m pip install pypdf[full] Pillow filetype amazon-textract-idp-cdk-manifest marshmallow --target "${LAMBDA_TASK_ROOT}" 4 | 5 | # Copy function code 6 | COPY app/* ${LAMBDA_TASK_ROOT}/ 7 | 8 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 9 | CMD [ "decider_main.lambda_handler" ] 10 | -------------------------------------------------------------------------------- /lambda/decider/app/entry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ -z "${AWS_LAMBDA_RUNTIME_API}" ]; then 3 | exec 
/usr/bin/aws-lambda-rie /usr/local/bin/python -m awslambdaric $1 4 | else 5 | exec /usr/local/bin/python -m awslambdaric $1 6 | fi 7 | -------------------------------------------------------------------------------- /lambda/decider/app/requirements.txt: -------------------------------------------------------------------------------- 1 | filetype 2 | pypdf[full] 3 | Pillow -------------------------------------------------------------------------------- /lambda/decider/env.json: -------------------------------------------------------------------------------- 1 | { 2 | "DeciderFunction": { 3 | "S3_OUTPUT_BUCKET": "my-stack-dev-documentbucket04c71448-7en8gx904sk5", 4 | "S3_OUTPUT_PREFIX": "textract-output" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /lambda/decider/events/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://idp-stack-python-sample-schademcdkidpstackpython-pha5ty3i0jbi/uploads/employmentapp.png" 3 | } 4 | -------------------------------------------------------------------------------- /lambda/decider/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for sam-app 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | DeciderFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: Image 17 | Architectures: 18 | - x86_64 19 | Environment: 20 | Variables: 21 | S3_OUTPUT_PREFIX: textract-output 22 | S3_OUTPUT_BUCKET: my-stack-dev-documentbucket04c71448-7en8gx904sk5 23 | Metadata: 24 | Dockerfile: Dockerfile 25 | DockerContext: . 
def test_serializer_manifest(caplog):
    """Fetch sample_manifest.json from S3 and check its basic structure.

    The manifest must be non-empty, carry an 's3Path', and list exactly three
    entries under 'textractFeatures'.
    """
    response = boto3.client('s3').get_object(Bucket='sdx-textract-us-east-1',
                                             Key='sample_manifest.json')
    manifest = json.loads(response.get('Body').read().decode('utf-8'))

    assert manifest
    assert manifest['s3Path']
    features = manifest['textractFeatures']
    assert features
    assert len(features) == 3
def split_s3_path_to_bucket_and_key(s3_path: str) -> Tuple[str, str]:
    """Split an ``s3://bucket/key`` URI into ``(bucket, key)``.

    This module-level definition takes precedence over the function of the
    same name imported from ``documentsplitter.documentsplitter``.

    Args:
        s3_path: URI of the form ``s3://bucket/key``. The scheme is matched
            case-insensitively.

    Returns:
        Tuple of ``(bucket, key)``; the key keeps every character after the
        first ``/`` that follows the bucket name.

    Raises:
        ValueError: if the scheme is not ``s3://`` or the bucket or key part
            is missing.
    """
    scheme = "s3://"
    # Strip the scheme by length rather than with str.replace(): replace() is
    # case-sensitive (so "S3://..." slipped through the check un-stripped) and
    # would also remove any later "s3://" that is part of the key.
    if s3_path[:len(scheme)].lower() == scheme:
        s3_bucket, _, s3_key = s3_path[len(scheme):].partition("/")
        if s3_bucket and s3_key:
            return (s3_bucket, s3_key)
    raise ValueError(
        f"s3_path: {s3_path} is no s3_path in the form of s3://bucket/key.")


def get_file_from_s3(s3_path: str, range=None) -> bytes:
    """Download an S3 object, or a byte range of it, and return its bytes.

    Args:
        s3_path: ``s3://bucket/key`` URI of the object.
        range: optional value for the ``Range`` argument of ``get_object``
            (the handler passes ``'bytes=0-2000'``). When falsy the whole
            object is read.

    Returns:
        The bytes read from the response body.
    """
    s3_bucket, s3_key = split_s3_path_to_bucket_and_key(s3_path)
    request = {'Bucket': s3_bucket, 'Key': s3_key}
    if range:
        request['Range'] = range
    return s3_client.get_object(**request).get('Body').read()


def get_mime_for_file(file_bytes: bytes) -> Optional[str]:
    """Guess the MIME type of a file from its leading bytes.

    The handler in this module accepts image/tiff, image/jpeg,
    application/pdf and image/png.

    Returns:
        The MIME string reported by ``filetype.guess`` or ``None`` when the
        type could not be determined.
    """
    kind = filetype.guess(file_bytes)
    return None if kind is None else kind.mime


def parse_manifest(s3_path: str) -> "tm.IDPManifest":
    """Read a manifest JSON document from S3 and deserialize it.

    Args:
        s3_path: ``s3://bucket/key`` URI of the UTF-8 encoded manifest.

    Returns:
        The result of ``tm.IDPManifestSchema().loads`` on the file content.
    """
    s3_bucket, s3_key = split_s3_path_to_bucket_and_key(s3_path)
    o = s3_client.get_object(Bucket=s3_bucket, Key=s3_key)
    file_content = o.get('Body').read().decode('utf-8')
    return tm.IDPManifestSchema().loads(file_content)  #type: ignore


def lambda_handler(event, _):
    """Split the document referenced by a manifest into page files.

    The event is either a manifest itself or a dict carrying it under the
    ``manifest`` key. An optional ``mime`` entry in the event skips MIME
    detection; otherwise the first bytes of the object are fetched from S3
    and inspected.

    Environment variables read:
        LOG_LEVEL (default ``INFO``), S3_OUTPUT_BUCKET (required),
        S3_OUTPUT_PREFIX (required), MAX_NUMBER_OF_PAGES_PER_DOC
        (default ``1``).

    Returns:
        Dict with the output prefix and bucket, the list returned by
        ``split_and_save_pages``, the MIME type and the origin file URI.

    Raises:
        Exception: when a required environment variable is missing or the
            MIME type is not one of the supported types.
    """
    log_level = os.environ.get('LOG_LEVEL', 'INFO')
    logger.setLevel(log_level)
    logger.info(f"version: {version}")
    logger.info(f"amazon-textract-idp-cdk-manifest: {tm.__version__}")
    logger.info(json.dumps(event))

    s3_output_bucket = os.environ.get('S3_OUTPUT_BUCKET', None)
    if not s3_output_bucket:
        raise Exception("no S3_OUTPUT_BUCKET set")

    s3_output_prefix = os.environ.get('S3_OUTPUT_PREFIX', None)
    if not s3_output_prefix:
        raise Exception("no S3_OUTPUT_PREFIX set")

    max_number_of_pages_per_doc = int(
        os.environ.get('MAX_NUMBER_OF_PAGES_PER_DOC', "1"))

    # Adjacent literals instead of backslash continuations, so the log line
    # no longer carries the source file's indentation inside the message.
    logger.debug(f"S3_OUTPUT_BUCKET: {s3_output_bucket} "
                 f"S3_OUTPUT_PREFIX: {s3_output_prefix} "
                 f"MAX_NUMBER_OF_PAGES_PER_DOC: {max_number_of_pages_per_doc}")

    supported_mime_types = [
        'application/pdf', 'image/png', 'image/jpeg', 'image/tiff'
    ]

    # The manifest is either nested under 'manifest' or is the event itself.
    manifest_source = event['manifest'] if 'manifest' in event else event
    manifest: tm.IDPManifest = tm.IDPManifestSchema().load(
        manifest_source)  #type: ignore

    s3_path = manifest.s3_path

    if 'mime' in event:
        mime = event['mime']
    else:
        # Only the head of the object is fetched for type detection.
        first_file_bytes = get_file_from_s3(s3_path=s3_path,
                                            range='bytes=0-2000')
        mime = get_mime_for_file(file_bytes=first_file_bytes)

    if not mime or mime not in supported_mime_types:
        raise Exception(f"not supported Mime type: {mime}")

    # Output goes below <prefix>/<file name without extension>/<timestamp>.
    timestamp = datetime.utcnow().isoformat()
    s3_filename = os.path.splitext(os.path.basename(manifest.s3_path))[0]
    full_output_prefix = os.path.join(s3_output_prefix, s3_filename,
                                      timestamp)
    output_file_list = split_and_save_pages(
        s3_path=s3_path,
        mime=mime,
        s3_output_bucket=s3_output_bucket,
        s3_output_prefix=full_output_prefix,
        max_number_of_pages=max_number_of_pages_per_doc)
    logger.info(f"return: {manifest}")

    result_value = {
        "documentSplitterS3OutputPath": full_output_prefix,
        "documentSplitterS3OutputBucket": s3_output_bucket,
        "pages": output_file_list,
        "mime": mime,
        "originFileURI": manifest.s3_path
    }

    return result_value
# Bucket that holds the sample documents and also receives the split pages.
_TEST_BUCKET = 'sdx-textract-us-east-1'


def _split_sample(caplog,
                  file_name,
                  mime,
                  output_prefix='document_splitter_test'):
    """Configure log capture and split one sample object from the test bucket.

    DEBUG is captured in general while botocore and boto3 are limited to
    WARNING. Returns whatever ``split_and_save_pages`` returns.
    """
    caplog.set_level(logging.DEBUG)
    for noisy_logger in ('botocore', 'boto3'):
        caplog.set_level(logging.WARNING, logger=noisy_logger)

    return split_and_save_pages(s3_path=f's3://{_TEST_BUCKET}/{file_name}',
                                mime=mime,
                                s3_output_bucket=_TEST_BUCKET,
                                s3_output_prefix=output_prefix)


def test_splitter_multi_page_pdf(caplog):
    """The 50-page sample PDF is expected to produce 51 entries."""
    page_list = _split_sample(caplog, '50-page-book.pdf', 'application/pdf')
    assert page_list
    assert len(page_list) == 51


def test_splitter_201_page_pdf(caplog):
    """The large sample PDF is expected to produce 226 entries.

    NOTE(review): the test name says 201, the object is called
    210-page-book.pdf and the assertion expects 226 - confirm which is right.
    """
    page_list = _split_sample(caplog,
                              '210-page-book.pdf',
                              'application/pdf',
                              output_prefix='document_splitter_test_210')
    assert page_list
    assert len(page_list) == 226


def test_splitter_multi_page_tiff(caplog):
    """The multi-page TIFF sample is expected to produce 2 entries."""
    page_list = _split_sample(caplog, 'multi_page_tiff.tiff', 'image/tiff')
    assert page_list
    assert len(page_list) == 2


def test_splitter_single_page_tiff(caplog):
    """A single-page TIFF is expected to produce exactly 1 entry."""
    page_list = _split_sample(caplog, 'employmentapp.tiff', 'image/tiff')
    assert page_list
    assert len(page_list) == 1


def test_splitter_single_page_jpeg(caplog):
    """A single-page JPEG is expected to produce exactly 1 entry."""
    page_list = _split_sample(caplog, 'w2-example.jpeg', 'image/jpeg')
    assert page_list
    assert len(page_list) == 1


def test_splitter_single_page_png(caplog):
    """A single-page PNG is expected to produce exactly 1 entry."""
    page_list = _split_sample(caplog, 'w2-example.png', 'image/png')
    assert page_list
    assert len(page_list) == 1
def get_number_of_executions(state_machine_arn) -> int:
    """Count the RUNNING executions of a Step Functions state machine.

    Calls ``list_executions`` with ``statusFilter="RUNNING"`` and follows
    ``nextToken`` until the response no longer carries one, summing the
    number of entries in each page's ``executions`` list.

    Args:
        state_machine_arn: ARN passed as ``stateMachineArn``.

    Returns:
        Total number of executions across all pages.
    """
    request = {
        'stateMachineArn': state_machine_arn,
        'statusFilter': "RUNNING",
    }
    total_number_of_executions = 0
    while True:
        response = step_functions_client.list_executions(**request)
        total_number_of_executions += len(response['executions'])

        # A missing OR empty token ends the pagination; treating an empty
        # token as "continue" would repeat the first request forever.
        next_token = response.get('nextToken')
        if not next_token:
            return total_number_of_executions
        request['nextToken'] = next_token
# Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 9 | CMD [ "main.lambda_handler" ] 10 | -------------------------------------------------------------------------------- /lambda/generatecsv/envs/env-bak.json: -------------------------------------------------------------------------------- 1 | { 2 | "GenerateCSVFunction": { 3 | "CSV_S3_OUTPUT_BUCKET": "generatecsvworkflow-textractsimpleasyncworkflow2d-5za69kjbyg6r", 4 | "CSV_S3_OUTPUT_PREFIX": "textract-csv-output", 5 | "LOG_LEVEL": "DEBUG", 6 | "OUTPUT_TYPE": "CSV", 7 | "OUTPUT_FEATURES": "TABLES,FORMS,SIGNATURES", 8 | "TEXTRACT_API": "GENERIC" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /lambda/generatecsv/envs/env-meta-data-lending.json: -------------------------------------------------------------------------------- 1 | { 2 | "Function": { 3 | "CSV_S3_OUTPUT_BUCKET": "lendingworkflow-textractsimplesyncworkflow5c83a6b-1ieddf0hrnpxc", 4 | "CSV_S3_OUTPUT_PREFIX": "textract-csv-output", 5 | "LOG_LEVEL": "DEBUG", 6 | "OUTPUT_TYPE": "CSV", 7 | "TEXTRACT_API": "LENDING", 8 | "META_DATA_TO_APPEND": "DOCUMENT_ID" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /lambda/generatecsv/envs/env-no-meta-lending.json: -------------------------------------------------------------------------------- 1 | { 2 | "Function": { 3 | "CSV_S3_OUTPUT_BUCKET": "lendingworkflow-textractsimplesyncworkflow5c83a6b-1ieddf0hrnpxc", 4 | "CSV_S3_OUTPUT_PREFIX": "textract-csv-output", 5 | "LOG_LEVEL": "DEBUG", 6 | "OUTPUT_TYPE": "CSV", 7 | "TEXTRACT_API": "LENDING", 8 | "META_DATA_TO_APPEND": "" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /lambda/generatecsv/envs/env.json: -------------------------------------------------------------------------------- 1 | { 2 | "Function": { 3 | "CSV_S3_OUTPUT_BUCKET": "sdx-textract-us-east-1", 4 | 
"CSV_S3_OUTPUT_PREFIX": "textract-linearizer-output", 5 | "LOG_LEVEL": "DEBUG", 6 | "OUTPUT_TYPE": "LINEARIZED", 7 | "TEXTRACT_API": "GENERIC", 8 | "META_DATA_TO_APPEND": "" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /lambda/generatecsv/envs/linearizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "Function": { 3 | "CSV_S3_OUTPUT_BUCKET": "sdx-textract-us-east-1", 4 | "CSV_S3_OUTPUT_PREFIX": "textract-linearizer-output", 5 | "LOG_LEVEL": "DEBUG", 6 | "OUTPUT_TYPE": "LINEARIZED", 7 | "TEXTRACT_API": "GENERIC", 8 | "META_DATA_TO_APPEND": "" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /lambda/generatecsv/events/event-linearizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "Token": "AQCEAAAAKgAAAAMAAAAAAAAAAW8XUJ9AL4pq0yByewwTwQxZZ0QbQ5xkGoZYM9YdWY0XeUmfCJXY8qGJlhAhvuwkvwOWHSlTv6zWIAwHv5igqE0yVQPRI37JbkMDXh8Jnqi/aKGc0+HBHFPSgWOkpAMVf6MhvetrsgsKAqOikDKi24TlQK+qDck6E9YwnNG2aOy3XJ5WpA830ryMacbk4IfBsgQUbbfOSPanQ31osR1NOtsIHqStmGGAkTu7ITAK0l+54asl01sBt6sOcyKWl5w+mn8CVd3jO9EmU8NBsjT+0qaN2cw/ChFj6QJhwp3b/deja70tGm+F2RgWfEwZTLyhVsBMR0T/X4RsVtNyM7IzYLOJzRgwGZRpm7PGwXZCiLJTxavUFK1Cmy7n6c9AK3DS/DHPF9F1mHqa+IRMFZyC/KttCxDxT6uVVKcy5QsLpSUzSC7gKoiRXV8JNhcbtmFP8zmh4tk6Mcmemq6gfFhYhSveJakan4XzMSiLW/3rcXbqX0Pffg59YGYJLGwbiTrg/te9qEnIUhdniWYZEqqO4AEfyny7WumxXe5i3aN2YKPUGH8uaZSLF/X8u4QDja68SahiaMTN7HyqK5LIukabadrCsdBfnkC4cnMjOtsizCbOKZa7FMVwtQYHim0RsR4xRsS4", 3 | "Payload": { 4 | "manifest": { 5 | "s3Path": "s3://sdx-textract-us-east-1/uploads/sample_page-14-file.png", 6 | "textractFeatures": [ 7 | "QUERIES", 8 | "FORMS", 9 | "SIGNATURES", 10 | "TABLES", 11 | "LAYOUT" 12 | ], 13 | "queriesConfig": [ 14 | { "text": "What is the address?", "pages": ["*"], "alias": "ADDRESS" }, 15 | { "text": "What is the name?", "pages": ["*"], "alias": "NAME" } 16 | ] 17 | }, 18 | "mime": 
"image/png", 19 | "classification": null, 20 | "numberOfPages": 1, 21 | "fileSize": 77338, 22 | "numberOfQueries": 2, 23 | "textract_result": { 24 | "TextractTempOutputJsonPath": "s3://sdx-textract-us-east-1/textract-temp-output/b539dde4cdaa56cf71804eb8939723ae17ff5baf9421dd9003f034064a2c2539", 25 | "TextractOutputJsonPath": "s3://sdx-textract-us-east-1/textract-output/68e57ac7-8441-413c-90d1-b868fbe64263/sample_page-14-file.json" 26 | } 27 | }, 28 | "ExecutionId": "arn:aws:states:us-east-1:913165245630:execution:AIM30620038C3B-CqhgnC8p3gEM:sample_page-14-filepng2023-10-26T133044538669" 29 | } 30 | -------------------------------------------------------------------------------- /lambda/generatecsv/events/event-meta-lending.json: -------------------------------------------------------------------------------- 1 | { 2 | "Token": "AQCQAAAAKgAAAAMAAAAAAAAAAdneV23bj2R21/LWiG+zCa3j3rBLNgIDWJNn5KA4AKRaQcrmFdts9pmNM5zbEF8jCd57UFLQUb7YRVBitULL9FnKNMehsa0hD9j0iL2HD8OOW7lw96zBewW+YmwSjIRhZB5NBUA8v4i22Q6YfuX/K+gT1k5MibBaH248ITFyqCb3aehQDStMdqI+lGs+Wk1B2SZCnrsB1Bzx8GmzIAPJJRw/TWp4E5lriT5sc26sodvERpSrhSWx0fbompLJKc9F1waCn0Wyus7ZJKiFkjaJKCR688FFiL69/3dATcKRecWbzJhfIsGzSWVe+PQEfQ6BtfSCqOsYDDSA5pryTH52qn7ZrI2F4CUxVkwFQ+2YqFbu2Nxp9zryyxmb4GaGyne5LsA1g8JNVk5bMZBeuuqW2bc9UIawVTgPKtJjIZnsq89jZiGyTOvuSItts6yIXmPbh6sLp7QK3AGFxxPIYtAGtHQwZPXacz+gD3OAUPit5RspT9HpUonbS+Rf5IWdZMWwrp2NOgQN6aBZfhazWYJHQcWRHp287DOJARHSQ6yoqS2GkxgiQbyTTrU0hXs/+H7t9n9F+lF0qKgCdNhukBwBzw/h3ig9H40t53AlKyJNy0rQlpwIf2x8bQpj1w2EyDOsT5VzeTGwxHycij5I", 3 | "Payload": { 4 | "manifest": { 5 | "s3Path": "s3://lendingworkflow-textractsimplesyncworkflow5c83a6b-1ieddf0hrnpxc/uploads/lending_console_demo_with_contacts copy.pdf", 6 | "metaData": [{ 7 | "value": "lending_console_demo_with_contacts copy", 8 | "key": "DOCUMENT_ID" 9 | }] 10 | }, 11 | "mime": "application/pdf", 12 | "classification": null, 13 | "numberOfPages": 8, 14 | "textract_result": { 15 | "TextractTempOutputJsonPath": 
"s3://lendingworkflow-textractsimplesyncworkflow5c83a6b-1ieddf0hrnpxc/textract-temp-output/4e6e18bd643f23acfc771dba396121079be19dbc357ad2741159ae4143a1e025", 16 | "TextractOutputJsonPath": "s3://lendingworkflow-textractsimplesyncworkflow5c83a6b-1ieddf0hrnpxc/textract-output/lending_console_demo_with_contacts copy2022-12-30T21:57:56.591963/lending_console_demo_with_contacts copy.json" 17 | } 18 | }, 19 | "ExecutionId": "arn:aws:states:us-east-1:913165245630:execution:LendingWorkflow109AF6E5-Z3ztsLpGPLXA:lending_console_demo_with_contactscopypdf2022-12-30T215736388516" 20 | } 21 | -------------------------------------------------------------------------------- /lambda/generatecsv/events/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "Token": "AQCYAAAAKgAAAAMAAAAAAAAAAYr5arqMuyfHoY6pmbwpOw9LP3sGWJ2722fNgxa5PpUj4B4OFjoNo1BDJRiX6FZb3RwD1qQxtAIeDFGGg+yiXZZIxDb4mbLPXNY3+4yZEmV+xzIMT+lbUMl5yPHGR5VdaGuSHda6IUHNDJRiJZXcfcLk51tHp5/YFPgvzFc2zA4FYcvJSPsAPWYXjujp0+6tr+YL8cYKUdX3ogg+NkGugLSpb2CshBv1diH32OixFOFrsl1wgvWqewL+ZhpZRbPDvGNX2m+78z6GcBqHjMqS6WWbl8+uKgLqOZoFr3dAYWAUEcSoKMWgQn1XhRJSuHvmlngBLyAXn5lwgLUdAvwpq/rAT56tHAigRp4Xh5qNUE8o9jX1dsHsyzcx/fk6ztEtyZr46KZkheUpLrYHVAfB0bOCMx5ebBwMl8dC5bjaGxpOZY4yXL+5S1xq0+KuhydQrZ3hCZ7CA91G4prmGfXfxcrrr0frTTUQc3U+lBzUYhyu/Gex317Gs/qXg4YgqgoGKfJAY6AcdCxzM+2xRiD5zzYc/FZIsK8Eb2YO0q/g8RD0K+Jfd8SvYgxf3QbEZscEe9PuTuCcRvIGpDAAam70gTBAzQnEuSUOlLfznEma3aGIq9QLqz4NLR0zVj0rmzBAQaEMxN+zMXOptri734BDpbf+", 3 | "Payload": { 4 | "manifest": { 5 | "queriesConfig": [ 6 | { 7 | "alias": "PAYSTUB_PAY_PERIOD_START_DATE", 8 | "text": "What is the Pay Period Start Date?" 9 | }, 10 | { 11 | "alias": "PAYSTUB_PAY_PERIOD_END", 12 | "text": "What is the Pay Period End Date?" 13 | }, 14 | { 15 | "alias": "PAYSTUB_PAY_DATE", 16 | "text": "What is the Pay Date?" 17 | }, 18 | { 19 | "alias": "PAYSTUB_EMPLOYEE_NAME", 20 | "text": "What is the Employee Name?" 
21 | }, { 22 | "alias": "PAYSTUB_EMPLOYEE_ADDRESS", 23 | "text": "What is the Employee Address?" 24 | }, { 25 | "alias": "PAYSTUB_COMPANY_NAME", 26 | "text": "What is the company Name?" 27 | }, { 28 | "alias": "PAYSTUB_CURRENT_GROSS", 29 | "text": "What is the Current Gross Pay?" 30 | }, { 31 | "alias": "PAYSTUB_YTD_GROSS", 32 | "text": "What is the YTD Gross Pay?" 33 | }, { 34 | "alias": "PAYSTUB_REGULAR_HOURLY_RATE", 35 | "text": "What is the regular hourly rate?" 36 | }, { 37 | "alias": "PAYSTUB_HOLIDAY_RATE", 38 | "text": "What is the holiday rate?" 39 | } 40 | ], 41 | "s3Path": "s3://schademcdkstackpaystubst-schademcdkidpstackpython-1fvi0dqoz24lj/uploads/w2-example.png", 42 | "textractFeatures": ["QUERIES"], 43 | "metaData": [{ 44 | "key": "DOCUMENT_ID", 45 | "value": "document_id_value_1" 46 | },{ 47 | "key": "meta_data_key_2", 48 | "value": "meta_data_value_2" 49 | }] 50 | }, 51 | "mime": "image/png", 52 | "classification": null, 53 | "numberOfPages": 1, 54 | "textract_result": { 55 | "TextractOutputJsonPath": "s3://schademcdkstackpaystubst-schademcdkidpstackpython-1fvi0dqoz24lj/textract-output/w2-example2022-06-09T18:01:06.973523/w2-example.json" 56 | } 57 | }, 58 | "ExecutionId": "arn:aws:states:us-east-1:913165245630:execution:PaystubWorkflowPython6780DE1E-MbkSvRTQL8tZ:f62e1132-6e31-4310-9d56-975245267b1c" 59 | } 60 | -------------------------------------------------------------------------------- /lambda/generatecsv/events/generate_csv_tables.json: -------------------------------------------------------------------------------- 1 | { 2 | "Token": 
"AQCYAAAAKgAAAAMAAAAAAAAAAXYRfnURt30xLbFyrPzvSu0F/MzTZPaB7OcE0oG9ZJbfkG5qJhCmLpoVVPvMae2np7t0hSAO16ApqotmGSQWITWblmpb71IlMaqrBtxnVVG/eSpsgtYlIkHIhGo5+nFaIQ==R8NjOTyDECmXkx62V/mR1DTl4i7weSfiqzZMauiwaOIPRuf19+ljL0cBdUlG/tx8IaFWpqBqt5lix2O8ff+8Ia6vuVdM413DGlrszM//+KaUZl3+rfeAQix5ttHOZTR99v/rq1HNxmnRwWZsM+JRqL5s8LNGiRvPZsOpNPlvSoLCTOlDRd3WOmq9FQf55WLb53Mjv4RC2R68kv6bZtq+1lf0sWFZGrDUN07HMAJYpNGVCPfM4OmrB9D0da57DN25HMLqNeV9fYOumddQaEQcFU6bw1RpnE+tQyYbfm4kGGOXrPPM6bZFk6hp/nkEyrBgecnKOAHUC6RrYHBcVfMh2hGLrdLzOGHFECuAdkG1/FuE+oYKAjnXD7YD2YUVxZisAKE6NvyRUqBWEOeOgwcY0a7V7WjJ2uW9RicFXgs8xNMQfRUBInoZRfhVZ2DGehz12Q/QIqWDRzoUdjnkZvkkEqiZOcyizXVpNxaJ/M+c+Nruu7l9TlD5PYnOW25F/LxcRiOV6AKBKzRaKeJRxI6W", 3 | "Payload": { 4 | "manifest": { 5 | "s3Path": "s3://generatecsvworkflow-textractsimpleasyncworkflow2d-5za69kjbyg6r/uploads/paystub-small.png" 6 | }, 7 | "mime": "image/png", 8 | "classification": null, 9 | "numberOfPages": 1, 10 | "textract_result": { 11 | "TextractTempOutputJsonPath": "s3://generatecsvworkflow-textractsimpleasyncworkflow2d-5za69kjbyg6r/textract-temp-output/d7badd34c3dd14350c20be65459f143fcf8398723f4e962aa63f07feb0f07cb2", 12 | "TextractOutputJsonPath": "s3://generatecsvworkflow-textractsimpleasyncworkflow2d-5za69kjbyg6r/textract-output/paystub-small2023-03-22T23:35:20.108256/paystub-small.json" 13 | } 14 | }, 15 | "ExecutionId": "arn:aws:states:us-east-1:913165245630:execution:GenerateCSVWorkflowF9F71ED0-uV6wzLRfulLT:paystub-smallpng2023-03-22T233507050420" 16 | } 17 | -------------------------------------------------------------------------------- /lambda/generatecsv/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for sam-app 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | GenerateCSVFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 
# Build the image, then invoke the function locally once per event/env pair.
# The env-var files for this function are listed under envs/ in the
# repository, so they are referenced there instead of next to this script.
# NOTE(review): confirm there is no top-level env.json that was intended.
sam build
sam local invoke -e events/event.json -n envs/env.json
sam local invoke -e events/event-meta-lending.json -n envs/env-no-meta-lending.json
sam local invoke -e events/event-meta-lending.json -n envs/env-meta-data-lending.json
class FhirDocAssembler():
    """Builds FHIR ``DocumentReference`` payloads that carry plain text."""

    def __init__(self):
        # Kept as instance state; no method of this class reads it while
        # add_attachment below stays commented out.
        self.attachments = []

    # def add_attachment(self, raw_text):
    #     encoded_text = base64.b64encode(str.encode(raw_text)).decode('utf-8')
    #     self.attachments.append({
    #         'attachment': {
    #             'data': encoded_text,
    #             'contentType': 'text/plain'
    #         }
    #     })

    def get_fhir_doc(self, raw_text, doc_id):
        """Return a ``DocumentReference`` dict for one piece of text.

        Args:
            raw_text: text placed in the single attachment, base64 encoded.
            doc_id: identifier interpolated into the ``relatesTo`` target
                reference as ``DocumentReference/<doc_id>``.

        Returns:
            Dict with the keys resourceType, status, subject, content and
            relatesTo, in that order.
        """
        text_bytes = str.encode(raw_text)
        encoded_text = base64.b64encode(text_bytes).decode('utf-8')

        attachment_entry = {
            'attachment': {
                'data': encoded_text,
                'contentType': 'text/plain'
            }
        }
        appends_relation = {
            "code": "appends",
            "target": {
                "reference": f"DocumentReference/{doc_id}"
            }
        }

        fhir_doc = {
            'resourceType': 'DocumentReference',
            'status': 'superseded',
            'subject': {
                "reference": ' '
            },
        }
        fhir_doc['content'] = [attachment_entry]
        fhir_doc["relatesTo"] = [appends_relation]
        return fhir_doc
def handler(event, context):
    """Send every page of a Textract result to HealthLake as a FHIR document.

    The event must contain ``textract_result.TextractOutputJsonPath``, a URL
    whose host is used as the S3 bucket and whose path (minus the leading
    slash) is used as the object key. The JSON stored there is parsed with
    ``trp.Document`` and each page's text is posted through
    ``send_to_healthlake``. All pages share one freshly generated UUID.

    Raises:
        RuntimeError: if the event has no (truthy) ``textract_result`` entry.
        Exception: any other failure is printed with its traceback and re-raised.
    """
    logger.debug(event)
    try:
        if not event.get('textract_result'):
            raise RuntimeError('Invalid lambda event.')

        doc_id = uuid.uuid4()
        location = urlparse(event['textract_result']['TextractOutputJsonPath'])
        bucket = location.hostname
        # Drop the leading "/" so the remainder is a usable object key.
        object_key = location.path[1:]
        logger.debug(f'Bucket: {bucket}')
        logger.debug(f'Key: {object_key}')

        s3_object = client.get_object(Bucket=bucket, Key=object_key)
        document = trp.Document(json.loads(s3_object['Body'].read()))
        assembler = FhirDocAssembler()
        logger.info(f'The document has {len(document.pages)} pages')
        logger.info(f'Unique ID for this document is: {doc_id}')

        for page in document.pages:
            fhir_doc = assembler.get_fhir_doc(page.text, doc_id)
            send_to_healthlake(fhir_doc)
            logger.debug(f"Adding attachment {page.text}")
    except Exception as exc:
        traceback.print_exc()
        raise exc
def send_to_healthlake(fhir_doc):
    """POST a FHIR ``DocumentReference`` to the configured HealthLake data store.

    The base URL comes from the ``HEALTHLAKE_ENDPOINT`` environment variable.
    The request is signed with SigV4 for the ``healthlake`` service using a
    default boto3 session.

    Args:
        fhir_doc: JSON-serialisable dict sent as the request body.

    Raises:
        ValueError: if ``HEALTHLAKE_ENDPOINT`` is unset or empty.
        requests.HTTPError: if the data store answers with a 4xx/5xx status.
        requests.Timeout: if the data store does not respond in time.
    """
    endpoint = os.environ.get('HEALTHLAKE_ENDPOINT')
    # Validate configuration first so a misconfigured function fails before
    # any credential/session work is done.
    if not endpoint:
        logger.error('Missing Healthlake Endpoint. Unable to continue')
        raise ValueError('Missing Healthlake Endpoint env.var.')

    # Accept the endpoint with or without a trailing slash; plain string
    # concatenation would otherwise yield ".../r4DocumentReference".
    data_store_url = endpoint.rstrip('/') + '/DocumentReference'

    session = boto3.session.Session()
    auth = AWSSigV4("healthlake", session=session)

    # Calling data store FHIR endpoint using SigV4 auth
    logger.debug(f"Sending to HealthLake {json.dumps(fhir_doc)}")
    # (connect, read) timeouts: without them a stalled connection would block
    # until the Lambda itself times out.
    resp = requests.post(data_store_url, json=fhir_doc, auth=auth, timeout=(10, 60))
    resp.raise_for_status()
| "textract_result": 10 | { 11 | "TextractTempOutputJsonPath": "s3://pdfmappertofhirworkflow-textractsimplesyncworkflo-13ne9s3betcpj/temp/a8cdb100b8414882978cef68cf49c7ed1e47a68680ffa9fd442a69ae8c05bf4c", 12 | "TextractOutputJsonPath": "s3://pdfmappertofhirworkflow-textractsimplesyncworkflo-13ne9s3betcpj/textract-output/fax-190517121115-6784593217-11_Redacted2023-04-19T18:49:29.225383/fax-190517121115-6784593217-11_Redacted.json" 13 | } 14 | } -------------------------------------------------------------------------------- /lambda/pdf_mapper_for_fhir/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Lambda function for PDF Mapper for FHIR 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | PdfMapperForFhirFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: Image 17 | Architectures: 18 | - x86_64 19 | Environment: 20 | Variables: 21 | HEALTHLAKE_ENDPOINT: https://healthlake.us-east-2.amazonaws.com/datastore/83c10afe8566667ac2489c8d989b2c14/r4/ 22 | LOG_LEVEL: INFO 23 | Metadata: 24 | Dockerfile: Dockerfile 25 | DockerContext: . 
26 | DockerTag: python3.9-v1 27 | 28 | -------------------------------------------------------------------------------- /lambda/pdf_mapper_for_fhir/test_sam_local.sh: -------------------------------------------------------------------------------- 1 | sam build 2 | sam local invoke -n env.json -e events/event.json 3 | -------------------------------------------------------------------------------- /lambda/pdf_mapper_for_fhir/tests/data/sample_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png", 3 | "textractFeatures": [ 4 | "FORMS", 5 | "TABLES", 6 | "QUERIES" 7 | ], 8 | "queriesConfig": [{ 9 | "text": "What is the applicant full name?", 10 | "alias": "FULL_NAME", 11 | "pages": "[*]" 12 | }], 13 | "classification": "APPLICATION", 14 | } 15 | -------------------------------------------------------------------------------- /lambda/pdf_mapper_for_fhir/tests/data/simple_feature_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png", 3 | "textractFeatures": [ 4 | "FORMS", 5 | "TABLES", 6 | "QUERIES" 7 | ], 8 | "queries_config": [{ 9 | "text": "What is the applicant full name?", 10 | "alias": "FULL_NAME", 11 | "pages": "[*]" 12 | }] 13 | } 14 | -------------------------------------------------------------------------------- /lambda/pdf_mapper_for_fhir/tests/test_pdf_mapper_for_fhir.py: -------------------------------------------------------------------------------- 1 | import json 2 | import io 3 | import os 4 | import boto3 5 | 6 | current_folder = os.path.dirname(os.path.realpath(__file__)) 7 | 8 | 9 | def test_serializer_manifest(caplog): 10 | s3_bucket = 'sdx-textract-us-east-1' 11 | s3_key = 'sample_manifest.json' 12 | s3_client = boto3.client('s3') 13 | o = s3_client.get_object(Bucket=s3_bucket, Key=s3_key) 
14 | file_content = o.get('Body').read().decode('utf-8') 15 | json_content = json.loads(file_content) 16 | assert json_content 17 | assert json_content['s3Path'] 18 | assert json_content['textractFeatures'] 19 | assert len(json_content['textractFeatures']) == 3 20 | -------------------------------------------------------------------------------- /lambda/put_on_sqs/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9-x86_64 2 | RUN /var/lang/bin/python -m pip install --upgrade pip 3 | 4 | # Copy function code 5 | COPY app/* ${LAMBDA_TASK_ROOT}/ 6 | 7 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 8 | CMD [ "main.lambda_handler" ] 9 | -------------------------------------------------------------------------------- /lambda/put_on_sqs/app/entry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ -z "${AWS_LAMBDA_RUNTIME_API}" ]; then 3 | exec /usr/bin/aws-lambda-rie /usr/local/bin/python -m awslambdaric $1 4 | else 5 | exec /usr/local/bin/python -m awslambdaric $1 6 | fi 7 | -------------------------------------------------------------------------------- /lambda/put_on_sqs/app/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
def lambda_handler(event, _):
    """Forward the incoming Lambda event, serialised as JSON, to an SQS queue.

    Environment:
        LOG_LEVEL: logging level applied to this module's logger (default ``INFO``).
        SQS_QUEUE_URL: URL of the destination queue (required).

    Args:
        event: JSON-serialisable invocation payload; it becomes the message body.
        _: Lambda context object (unused).

    Raises:
        ValueError: if ``SQS_QUEUE_URL`` is unset or empty.
    """
    log_level = os.environ.get('LOG_LEVEL', 'INFO')
    logger.setLevel(log_level)

    # Serialise once and reuse the result for both the log line and the message.
    message_body = json.dumps(event)
    logger.debug(message_body)
    logger.debug(f"version: {__version__}")
    logger.debug(f"boto3 version: {boto3.__version__}")

    sqs_queue_url = os.environ.get('SQS_QUEUE_URL', '')
    if not sqs_queue_url:
        # Name the variable that is actually read so the failure is actionable.
        raise ValueError('no SQS_QUEUE_URL defined')

    response = sqs_client.send_message(QueueUrl=sqs_queue_url,
                                       MessageBody=message_body)
    logger.debug(response)
"AQCUAAAAKgAAAAMAAAAAAAAAAScySGVHlgFmBDVYD6a8ltN6T9sIC2E2c+pzSZMimkamYs7yQqUgz3QBGHVmaWxPE19esi6AhnOy9VAoyLPlY8Mp41xuEOE790EpAnGOpD/dytQLwUiYvRrQ3xc39l4=3OtkLO6CRweuCEt3aL8aLSYaOaBdxzIUP5TXmZcAqZkFrW7v8hrHCmJnfVqyqQnWH+gDmhCG6chzGSt+2TkPmiT7Pzol6bkvzWYjpuytO9drcJMJne58Vd6oLqQ3jUxqd9t01WeTndbHwyV6wE1vp1AuQuSL/3I6PpuQpuoMvJOd82QlQ9rpR8DaY4T4FIH50fyHF6mMARsteDcTzNSuriyKRIFjPhYFNw5MnGPbc5eYvHIxikRzq+fiYCr6bYlXjn5XrG3uz8hYz48EWcHScdNvc3IraqVwf+oXsGO/74x0E+a6StVrP1DmCk0Siof8hCPSehdomEt0+ivXW0mD+GcsfjOOnlDIZ9E2EYrNjl1aCMe7225AgwsqFb48cbh5ekHQMtZhy3VUPhmaygyqMTujGT7kU4JlFRumtPs9D6FW+NkNiE6VjCCNz4Q0/6uH3jtwRkEon4ozbnxkJLQsXujSA52Iig7/0H9vL7ZNe3oeKFIyD0OGQeryetRwkC68xKtbZc626ZLEhXRRSR0Z", 3 | "Payload": { 4 | "manifest": { 5 | "S3Path": "s3://idp-stack-python-sample-schademcdkidpstackpython-pha5ty3i0jbi/uploads/98.blabla" 6 | }, 7 | "mime": "image/jpeg", 8 | "classification": null, 9 | "numberOfPages": 1 10 | }, 11 | "ExecutionId": "arn:aws:states:us-east-1:913165245630:execution:IDPWorkflowPython85B937F9-Pnw2MyHSOXR8:1992869c-cc18-4b3c-91ca-d15fcebdcc14" 12 | } 13 | -------------------------------------------------------------------------------- /lambda/put_on_sqs/events/simple-event.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest": { 3 | "S3Path": "s3://sdx-textract-us-east-1/employeeapp20210510.png" 4 | }, 5 | "mime": "image/png", 6 | "numberOfPages": 1 7 | } 8 | -------------------------------------------------------------------------------- /lambda/put_on_sqs/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for sam-app 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | PutOnSQSFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: Image 17 | Architectures: 18 | - x86_64 19 
| Environment: 20 | Variables: 21 | SQS_QUEUE_URL: https://sqs.us-east-1.amazonaws.com/913165245630/testqueue 22 | LOG_LEVEL: DEBUG 23 | Metadata: 24 | Dockerfile: Dockerfile 25 | DockerContext: . 26 | DockerTag: python3.9-v1 27 | 28 | -------------------------------------------------------------------------------- /lambda/put_on_sqs/test_sam_local.sh: -------------------------------------------------------------------------------- 1 | sam build 2 | sam local invoke -e events/event.json -n env.json 3 | -------------------------------------------------------------------------------- /lambda/put_on_sqs/tests/data/sample_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png", 3 | "textractFeatures": [ 4 | "FORMS", 5 | "TABLES", 6 | "QUERIES" 7 | ], 8 | "queries_config": [{ 9 | "text": "What is the applicant full name?", 10 | "alias": "FULL_NAME", 11 | "pages": "[*]" 12 | }], 13 | "classification": [ 14 | "APPLICATION" 15 | ], 16 | "apiCall":["SYNC"], 17 | "flowFeatures": [] 18 | } 19 | -------------------------------------------------------------------------------- /lambda/put_on_sqs/tests/data/simple_feature_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png", 3 | "textractFeatures": [ 4 | "FORMS", 5 | "TABLES", 6 | "QUERIES" 7 | ], 8 | "queries_config": [{ 9 | "text": "What is the applicant full name?", 10 | "alias": "FULL_NAME", 11 | "pages": "[*]" 12 | }] 13 | } 14 | -------------------------------------------------------------------------------- /lambda/rds_serverless_init/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9-x86_64 2 | 3 | RUN /var/lang/bin/python -m pip install --upgrade pip 4 | 5 | # Copy function code 6 | 
def on_create(event, context, secret_arn, cluster_arn):
    """Create the ``textractcsvimport`` table when the resource is created.

    Runs a ``CREATE TABLE if not exists`` statement against the ``postgres``
    database through the RDS Data API client ``rds_data``.

    Args:
        event: request event (not read here).
        context: Lambda context (not read here).
        secret_arn: ARN passed as ``secretArn`` to the Data API.
        cluster_arn: ARN passed as ``resourceArn`` to the Data API.

    Returns:
        ``{'PhysicalResourceId': 'initPostgresql'}``.
    """
    # Column order mirrors the CSV being imported:
    # page_number, key_name, key_confidence, value_name, value_confidence, key-bounding-box.top, key-bounding-box.height, k-bb.width, k-bb.left, value-bounding-box.top, v-bb.height, v-bb.width, v-bb.left
    create_table_sql = "CREATE TABLE if not exists textractcsvimport (createtime timestamp, classification varchar(512), \
        filename text, \
        page decimal(10), \
        key text, \
        key_confidence varchar(25), \
        value text, \
        value_confidence varchar(25), \
        key_bb_top varchar(25), \
        key_bb_height varchar(25), \
        k_bb_width varchar(25), \
        k_bb_left varchar(25), \
        v_bb_top varchar(25), \
        v_bb_height varchar(25), \
        v_bb_width varchar(25), \
        v_bb_left varchar(25));"

    rds_data.execute_statement(
        secretArn=secret_arn,
        database='postgres',
        resourceArn=cluster_arn,
        sql=create_table_sql,
    )
    return {'PhysicalResourceId': 'initPostgresql'}
def on_update(event, context, secret_arn, cluster_arn):
    """Handle an ``Update`` request by re-running the idempotent table setup.

    The statement issued on create is ``CREATE TABLE if not exists``, so the
    update path delegates to ``on_create`` instead of keeping a second copy of
    the DDL. The previous copy had drifted: it contained ``k_bb.width``, which
    is not a valid column name, and its trailing ``print`` used two ``%s``
    placeholders with one argument, so it always raised ``TypeError``.

    Args:
        event: request event, forwarded to ``on_create``.
        context: Lambda context, forwarded to ``on_create``.
        secret_arn: ARN of the database credentials secret.
        cluster_arn: ARN of the database cluster.

    Returns:
        Whatever ``on_create`` returns (the dict carrying ``PhysicalResourceId``).
    """
    physical_id = 'initPostgresql'
    result = on_create(event=event,
                       context=context,
                       secret_arn=secret_arn,
                       cluster_arn=cluster_arn)
    print("update resource %s" % physical_id)
    return result


def on_delete(event, context):
    """Handle a ``Delete`` request; only prints a message, nothing is dropped."""
    physical_id = 'initPostgresql'
    print("delete resource %s" % physical_id)


def lambda_handler(event, context):
    """Dispatch a request to the create/update/delete handler.

    Environment:
        LOG_LEVEL: logging level for this module's logger (default ``INFO``).
        CLUSTER_ARN: ARN of the database cluster (required).
        SECRET_ARN: ARN of the database credentials secret (required).

    Args:
        event: dict with a ``RequestType`` key (matched case-insensitively
            against ``create``, ``update`` and ``delete``).
        context: Lambda context, forwarded to the handlers.

    Returns:
        The selected handler's return value.

    Raises:
        ValueError: if ``SECRET_ARN`` or ``CLUSTER_ARN`` is missing, or the
            request type is not one of the three supported values.
    """
    log_level = os.environ.get('LOG_LEVEL', 'INFO')
    logger.setLevel(log_level)
    # The event is logged once, as JSON; the duplicate repr log line was dropped.
    logger.info(json.dumps(event))
    logger.debug(f"version: {__version__}")
    logger.debug(f"boto3 version: {boto3.__version__}")

    cluster_arn = os.environ.get('CLUSTER_ARN', '')
    secret_arn = os.environ.get('SECRET_ARN', '')
    logger.info(f'SECRET_ARN: {secret_arn}\nCLUSTER_ARN: {cluster_arn}')
    if not secret_arn:
        raise ValueError(f'no SECRET_ARN defined: {secret_arn}')
    if not cluster_arn:
        raise ValueError(f'no CLUSTER_ARN defined: {cluster_arn}')

    request_type = event['RequestType'].lower()
    if request_type == 'create':
        return on_create(event=event,
                         context=context,
                         cluster_arn=cluster_arn,
                         secret_arn=secret_arn)
    if request_type == 'update':
        return on_update(event=event,
                         context=context,
                         cluster_arn=cluster_arn,
                         secret_arn=secret_arn)
    if request_type == 'delete':
        return on_delete(event=event, context=context)
    # ValueError is still an Exception, so existing broad handlers keep working.
    raise ValueError(f'Invalid request type: {request_type}')
7 | "mime": "image/jpeg", 8 | "classification": null, 9 | "numberOfPages": 1 10 | }, 11 | "ExecutionId": "arn:aws:states:us-east-1:913165245630:execution:IDPWorkflowPython85B937F9-Pnw2MyHSOXR8:1992869c-cc18-4b3c-91ca-d15fcebdcc14" 12 | } 13 | -------------------------------------------------------------------------------- /lambda/rds_serverless_init/events/simple-event.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest": { 3 | "S3Path": "s3://sdx-textract-us-east-1/employeeapp20210510.png" 4 | }, 5 | "mime": "image/png", 6 | "numberOfPages": 1 7 | } 8 | -------------------------------------------------------------------------------- /lambda/rds_serverless_init/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for sam-app 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | PutOnSQSFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: Image 17 | Architectures: 18 | - x86_64 19 | Environment: 20 | Variables: 21 | SQS_QUEUE_URL: https://sqs.us-east-1.amazonaws.com/913165245630/testqueue 22 | LOG_LEVEL: DEBUG 23 | Metadata: 24 | Dockerfile: Dockerfile 25 | DockerContext: . 
26 | DockerTag: python3.9-v1 27 | 28 | -------------------------------------------------------------------------------- /lambda/rds_serverless_init/test_sam_local.sh: -------------------------------------------------------------------------------- 1 | sam build 2 | sam local invoke -e events/event.json -n env.json 3 | -------------------------------------------------------------------------------- /lambda/searchablePDF/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9-x86_64 2 | RUN /var/lang/bin/python -m pip install --upgrade pip 3 | RUN python -m pip install amazon-textract-idp-cdk-manifest marshmallow amazon-textract-response-parser==0.1.44 pymupdf --target "${LAMBDA_TASK_ROOT}" 4 | 5 | # Copy function code 6 | COPY app/* ${LAMBDA_TASK_ROOT}/ 7 | 8 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 9 | CMD [ "main.lambda_handler" ] 10 | -------------------------------------------------------------------------------- /lambda/searchablePDF/app/entry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ -z "${AWS_LAMBDA_RUNTIME_API}" ]; then 3 | exec /usr/bin/aws-lambda-rie /usr/local/bin/python -m awslambdaric $1 4 | else 5 | exec /usr/local/bin/python -m awslambdaric $1 6 | fi 7 | -------------------------------------------------------------------------------- /lambda/searchablePDF/app/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
def split_s3_path_to_bucket_and_key(s3_path: str) -> Tuple[str, str]:
    """Split ``s3://bucket/key`` into ``(bucket, key)``.

    The scheme is matched case-insensitively and only the leading scheme is
    removed, so a key that itself contains ``s3://`` is left intact.

    Raises:
        ValueError: if the path is too short, does not start with ``s3://``,
            or has no bucket or no ``/`` after the bucket.
    """
    scheme = "s3://"
    error = f"s3_path: {s3_path} is no s3_path in the form of s3://bucket/key."
    if len(s3_path) <= 7 or not s3_path.lower().startswith(scheme):
        raise ValueError(error)
    # Slice instead of str.replace(): replace() is case-sensitive (so "S3://"
    # passed the check above but was never stripped) and removes every match.
    s3_bucket, separator, s3_key = s3_path[len(scheme):].partition("/")
    if not s3_bucket or not separator:
        raise ValueError(error)
    return (s3_bucket, s3_key)


def get_file_from_s3(s3_path: str, range=None) -> bytes:
    """Download an S3 object and return its bytes.

    Args:
        s3_path: location in the form ``s3://bucket/key``.
        range: optional value forwarded as the ``Range`` argument of
            ``get_object``; omitted from the call when falsy.
    """
    s3_bucket, s3_key = split_s3_path_to_bucket_and_key(s3_path)
    request = {"Bucket": s3_bucket, "Key": s3_key}
    if range:
        request["Range"] = range
    return s3.get_object(**request).get('Body').read()


def _searchable_output_key(s3_key):
    """Build the output key ``pdf_output/<file stem>_searchable.pdf``.

    Only the last extension is removed, so ``report.v2.pdf`` maps to
    ``report.v2_searchable.pdf`` rather than colliding with ``report.v1.pdf``.
    """
    file_name = s3_key.split('/')[-1]
    return f"pdf_output/{file_name.rsplit('.', 1)[0]}_searchable.pdf"


def _page_words(t_page, page_width, page_height):
    """Return the words of a Textract page as dicts in PDF page coordinates.

    Bounding-box values are multiplied by the page dimensions, i.e. they are
    treated as ratios of the page size. Words without text are skipped.
    """
    words = []
    for line in t_page.lines:
        for word in line.words:
            if not word.text:
                continue
            box = word.geometry.boundingBox
            xmin = round(box.left * page_width)
            ymin = round(box.top * page_height)
            words.append({
                "xmin": xmin,
                "ymin": ymin,
                "xmax": round(xmin + (box.width * page_width)),
                "ymax": round(ymin + (box.height * page_height)),
                "text": word.text,
            })
    return words


def _fit_font_size(text, max_width, start_size=10, min_size=1):
    """Return the largest Courier size <= ``start_size`` whose width fits ``max_width``.

    The size never drops below ``min_size``; previously a zero-width box
    shrank the font all the way to size 0.
    """
    font_size = start_size
    while font_size > min_size and fitz.get_text_length(
            text=text, fontname="Courier", fontsize=font_size) > max_width:
        font_size -= 1
    return font_size


def lambda_handler(event, _):
    """Overlay Textract-recognised words on a PDF and store the result in S3.

    Reads the PDF named by ``manifest.s3Path`` and the Textract JSON named by
    ``textract_result.TextractOutputJsonPath``, writes each recognised word at
    its bounding-box position on the matching page, and uploads the result to
    ``pdf_output/<name>_searchable.pdf`` in the PDF's bucket.

    Returns:
        ``{"message": "Finished Saving to <bucket>/<key>"}``.

    Raises:
        ValueError: if either input path is missing from the event.
    """
    logger.setLevel('INFO')
    logger.info(f"version: {version}")
    logger.info(json.dumps(event))

    manifest = event.get("manifest") or {}
    textract_result = event.get("textract_result") or {}
    pdf_s3_path = manifest.get('s3Path')
    textract_json_path = textract_result.get('TextractOutputJsonPath')
    if not pdf_s3_path or not textract_json_path:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            "event must contain manifest.s3Path and "
            "textract_result.TextractOutputJsonPath")

    # Get the files
    pdf_obj = get_file_from_s3(pdf_s3_path)
    logger.info(f"Get PDF {pdf_s3_path}")

    textract_s3_byte = get_file_from_s3(textract_json_path)
    logger.info(f"Get Textract JSON {textract_json_path}")

    logger.info("Reading PDF")
    pdfdoc = fitz.open("pdf", stream=BytesIO(pdf_obj))
    try:
        logger.info("Reading the JSON")
        textract_json = json.loads(textract_s3_byte)
        temp_doc = TDocumentSchema().load(textract_json)

        logger.info("Loading into TRP and Ordering Blocks by Geo")
        ordered_doc = order_blocks_by_geo(temp_doc)
        trp_doc = trp.Document(TDocumentSchema().dump(ordered_doc))

        logger.info("Parsing the Text and writing to PDF Hidden Layer")
        if pdfdoc.page_count != len(trp_doc.pages):
            logger.warning(
                f"PDF has {pdfdoc.page_count} pages but Textract returned "
                f"{len(trp_doc.pages)}; only the pages present in both are processed")

        font = fitz.Font("Courier")
        # zip() pairs pages positionally and stops at the shorter sequence, so
        # a page-count mismatch no longer raises IndexError.
        for page, t_page in zip(pdfdoc, trp_doc.pages):
            tw = fitz.TextWriter(page.rect)
            for word in _page_words(t_page, page.rect.width, page.rect.height):
                font_size = _fit_font_size(word["text"],
                                           word["xmax"] - word["xmin"])
                tw.append(pos=(word["xmin"], word["ymax"]),
                          text=word["text"],
                          font=font,
                          fontsize=font_size)
            # NOTE(review): render_mode=0 with a green colour draws visible
            # text although the log line says "Hidden Layer" — confirm intent.
            tw.write_text(page, render_mode=0, color=(0, 1, 0))

        s3_bucket, s3_key = split_s3_path_to_bucket_and_key(pdf_s3_path)
        outputKey = _searchable_output_key(s3_key)
        logger.info(f"Saving to {s3_bucket}/{outputKey}")
        bytes_ = BytesIO(pdfdoc.write())
        s3.put_object(Body=bytes_, Bucket=s3_bucket, Key=outputKey)
    finally:
        # Release the document so a reused (warm) container does not keep it open.
        pdfdoc.close()

    return {"message": f"Finished Saving to {s3_bucket}/{outputKey}"}
-------------------------------------------------------------------------------- /lambda/searchablePDF/events/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest": { 3 | "s3Path": "s3://simplesearchpdf-textractsimpleasyncworkflow2d7d5b-zzwnp9hesdtg/uploads/MVPOA_Restrictions_demo1000.pdf" 4 | }, 5 | "mime": "application/pdf", 6 | "classification": null, 7 | "numberOfPages": 99, 8 | "textract_result": { 9 | "TextractTempOutputJsonPath": "s3://simplesearchpdf-textractsimpleasyncworkflow2d7d5b-zzwnp9hesdtg/textract-temp-output/99046b1ea27472554066899f486513822b4f6aa1b4efb86b574192615ec5b6a2", 10 | "TextractOutputJsonPath": "s3://simplesearchpdf-textractsimpleasyncworkflow2d7d5b-zzwnp9hesdtg/textract-output/MVPOA_Restrictions_demo10002023-05-10T13:52:58.624870/MVPOA_Restrictions_demo1000.json" 11 | } 12 | } -------------------------------------------------------------------------------- /lambda/searchablePDF/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for building a python lambda locally 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | Function: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: Image 17 | Architectures: 18 | - x86_64 19 | Metadata: 20 | Dockerfile: Dockerfile 21 | DockerContext: . 
22 | DockerTag: python3.9-v1 23 | 24 | -------------------------------------------------------------------------------- /lambda/searchablePDF/test_sam_local.sh: -------------------------------------------------------------------------------- 1 | sam build 2 | sam local invoke -e events/event.json 3 | -------------------------------------------------------------------------------- /lambda/textract_async/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9-x86_64 2 | 3 | RUN /var/lang/bin/python -m pip install --upgrade pip 4 | RUN python -m pip install pypdf[full] Pillow filetype amazon-textract-caller==0.2.0 amazon-textract-idp-cdk-manifest marshmallow --upgrade --target "${LAMBDA_TASK_ROOT}" 5 | 6 | # Copy function code 7 | COPY app/* ${LAMBDA_TASK_ROOT}/ 8 | 9 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 10 | CMD [ "main.lambda_handler" ] 11 | -------------------------------------------------------------------------------- /lambda/textract_async/app/entry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ -z "${AWS_LAMBDA_RUNTIME_API}" ]; then 3 | exec /usr/bin/aws-lambda-rie /usr/local/bin/python -m awslambdaric $1 4 | else 5 | exec /usr/local/bin/python -m awslambdaric $1 6 | fi 7 | -------------------------------------------------------------------------------- /lambda/textract_async/app/requirements.txt: -------------------------------------------------------------------------------- 1 | amazon-textract-caller 2 | schadem-tidp-manifest 3 | -------------------------------------------------------------------------------- /lambda/textract_async/env.json: -------------------------------------------------------------------------------- 1 | { 2 | "HelloWorldFunction": { 3 | "S3_OUTPUT_BUCKET": "my-stack-dev-documentbucket04c71448-7en8gx904sk5", 4 | "S3_OUTPUT_PREFIX": 
"textract-output", 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /lambda/textract_async/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for sam-app 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | SyncFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: Image 17 | Architectures: 18 | - x86_64 19 | Environment: 20 | Variables: 21 | S3_OUTPUT_PREFIX: textract-output 22 | S3_OUTPUT_BUCKET: my-stack-dev-documentbucket04c71448-7en8gx904sk5 23 | NOTIFICATION_ROLE_ARN: arn:aws:iam::913165245630:role/my-stack-dev-textracttaskTextractAsyncSNSRole9C109-1U9GNG425CDEO 24 | NOTIFICATION_SNS: arn:aws:sns:us-east-1:913165245630:my-stack-dev-textracttaskTextractAsyncSNS3C29E077-16AORMCCGLOEV 25 | TOKEN_STORE_DDB: my-stack-dev-textracttaskTextractTaskTokenTableC181A801-DTM6Z7GOJH28 26 | S3_TEMP_OUTPUT_PREFIX: textract-temp-output 27 | Metadata: 28 | Dockerfile: Dockerfile 29 | DockerContext: . 
# Lambda container image for the Textract async SNS listener function.
FROM public.ecr.aws/lambda/python:3.9-x86_64

RUN /var/lang/bin/python -m pip install --upgrade pip
# The requirement specifier must be quoted: RUN uses the shell form, where an
# unquoted ">=" is parsed as an output redirection. Unquoted, pip would be
# asked for an unconstrained "amazon-textract-caller" and its output would be
# written to a file named "=0.0.23".
RUN python -m pip install "amazon-textract-caller>=0.0.23" --target "${LAMBDA_TASK_ROOT}"

# Copy function code
COPY app/* ${LAMBDA_TASK_ROOT}/

# Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile)
CMD [ "main.lambda_handler" ]
public.ecr.aws/lambda/python:3.9-x86_64 2 | RUN /var/lang/bin/python -m pip install --upgrade pip 3 | COPY app/requirements.txt ${LAMBDA_TASK_ROOT}/ 4 | RUN python -m pip install -r ${LAMBDA_TASK_ROOT}/requirements.txt --target "${LAMBDA_TASK_ROOT}" 5 | 6 | # Copy function code 7 | COPY app/* ${LAMBDA_TASK_ROOT}/ 8 | 9 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 10 | CMD [ "main.handler" ] 11 | -------------------------------------------------------------------------------- /lambda/textract_comprehend_medical/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-textract-idp-cdk-constructs/a681ae7847812529525bb41aa39de5fe105d3aab/lambda/textract_comprehend_medical/app/__init__.py -------------------------------------------------------------------------------- /lambda/textract_comprehend_medical/app/entry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ -z "${AWS_LAMBDA_RUNTIME_API}" ]; then 3 | exec /usr/bin/aws-lambda-rie /usr/local/bin/python -m main $1 4 | else 5 | exec /usr/local/bin/python -m main $1 6 | fi 7 | -------------------------------------------------------------------------------- /lambda/textract_comprehend_medical/app/main.py: -------------------------------------------------------------------------------- 1 | import json 2 | import uuid 3 | import boto3 4 | import os 5 | import traceback 6 | import trp 7 | import logging 8 | from urllib.parse import urlparse 9 | 10 | logger = logging.getLogger('SendToComprehendMedical') 11 | logger.addHandler(logging.StreamHandler()) 12 | logger.setLevel(getattr(logging, os.getenv('LOG_LEVEL', 'INFO'))) 13 | client = boto3.client('s3') 14 | cm_client = boto3.client('comprehendmedical') 15 | cm_job_types = [('ICD10', 'start_icd10_cm_inference_job'), 16 | ('SNOMEDCT', 'start_snomedct_inference_job'), 17 | 
def handler(event, context):
    """Start a Comprehend Medical batch job on the text of a Textract result.

    The job type is read from the COMPREHEND_MEDICAL_JOB_TYPE environment
    variable and mapped to a Comprehend Medical client method through the
    module-level ``cm_job_types`` table. The Textract JSON referenced by
    ``event['textract_result']['TextractOutputJsonPath']`` is downloaded, the
    text of all pages is concatenated and written back to the same bucket
    under ``textract-output/text/<job_type>/<job_name>/<job_name>.txt``, and
    the job is started with that prefix as input and
    ``cm-output/json/<job_type>/<job_name>`` as output.

    Parameters
    ----------
    event: dict, required
        Must contain ``textract_result.TextractOutputJsonPath`` (an s3:// URL).
    context: object
        Lambda context; unused.

    Returns
    -------
    None
        Also returned, without doing any work, when no valid job type is
        configured.

    Raises
    ------
    RuntimeError
        If the event has no ``textract_result`` entry.
    Exception
        Any error raised while reading, writing or starting the job is
        printed and re-raised unchanged.
    """
    job_type = os.getenv('COMPREHEND_MEDICAL_JOB_TYPE')
    # cm_job_types is a list of (job type, client method name) pairs.
    start_job_name = dict(cm_job_types).get(job_type)
    if not start_job_name:
        logger.info('There is no valid COMPREHEND_MEDICAL_JOB_TYPE set.')
        return
    start_job = getattr(cm_client, start_job_name)

    try:
        if not event.get('textract_result'):
            raise RuntimeError('Invalid lambda event.')

        output_json = event['textract_result']['TextractOutputJsonPath']
        location = urlparse(output_json)
        bucket = location.hostname
        object_key = location.path[1:]  # drop the leading '/'
        logger.debug(f'Bucket: {bucket}')
        logger.debug(f'Key: {object_key}')

        resp = client.get_object(Bucket=bucket, Key=object_key)
        blocks = json.loads(resp['Body'].read())
        document = trp.Document(blocks)
        logger.info(f'The document has {len(document.pages)} pages')

        # TODO We can add Bedrock here to send multiple pages to CM based on the context
        job_name = f'job-{uuid.uuid4()}'
        input_prefix = f'textract-output/text/{job_type}/{job_name}'
        # join() avoids re-copying the accumulated text for every page.
        text_content = ''.join(page.text for page in document.pages)
        client.put_object(Bucket=bucket,
                          Key=f'{input_prefix}/{job_name}.txt',
                          Body=text_content.encode('utf-8'))
        start_job(
            InputDataConfig={
                'S3Bucket': bucket,
                'S3Key': input_prefix
            },
            OutputDataConfig={
                'S3Bucket': bucket,
                'S3Key': f'cm-output/json/{job_type}/{job_name}'
            },
            JobName=job_name,
            DataAccessRoleArn=os.getenv('COMPREHEND_MEDICAL_ROLE'),
            LanguageCode='en'
        )
    except Exception:
        traceback.print_exc()
        # Bare raise keeps the original traceback intact.
        raise
/lambda/textract_comprehend_medical/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Lambda function for PDF Mapper for FHIR 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | PdfMapperForFhirFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: Image 17 | Architectures: 18 | - x86_64 19 | Environment: 20 | Variables: 21 | HEALTHLAKE_ENDPOINT: https://healthlake.us-east-2.amazonaws.com/datastore/83c10afe8566667ac2489c8d989b2c14/r4/ 22 | LOG_LEVEL: INFO 23 | Metadata: 24 | Dockerfile: Dockerfile 25 | DockerContext: . 26 | DockerTag: python3.9-v1 27 | 28 | -------------------------------------------------------------------------------- /lambda/textract_comprehend_medical/test_sam_local.sh: -------------------------------------------------------------------------------- 1 | sam build 2 | sam local invoke -n env.json -e events/event.json 3 | -------------------------------------------------------------------------------- /lambda/textract_comprehend_medical/tests/data/sample_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png", 3 | "textractFeatures": [ 4 | "FORMS", 5 | "TABLES", 6 | "QUERIES" 7 | ], 8 | "queriesConfig": [{ 9 | "text": "What is the applicant full name?", 10 | "alias": "FULL_NAME", 11 | "pages": "[*]" 12 | }], 13 | "classification": "APPLICATION", 14 | } 15 | -------------------------------------------------------------------------------- /lambda/textract_comprehend_medical/tests/data/simple_feature_manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3Path": "s3://amazon-textract-public-content/blogs/employeeapp20210510.png", 3 | "textractFeatures": 
def test_serializer_manifest(caplog):
    """Fetch sample_manifest.json from S3 and check its basic shape.

    Reads the object from the 'sdx-textract-us-east-1' bucket, parses it as
    JSON and asserts that it has an s3Path and exactly three textractFeatures.
    """
    response = boto3.client('s3').get_object(Bucket='sdx-textract-us-east-1',
                                             Key='sample_manifest.json')
    manifest = json.loads(response.get('Body').read().decode('utf-8'))

    assert manifest
    assert manifest['s3Path']
    features = manifest['textractFeatures']
    assert features
    assert len(features) == 3
1 | #!/bin/sh 2 | if [ -z "${AWS_LAMBDA_RUNTIME_API}" ]; then 3 | exec /usr/bin/aws-lambda-rie /usr/local/bin/python -m awslambdaric $1 4 | else 5 | exec /usr/local/bin/python -m awslambdaric $1 6 | fi 7 | -------------------------------------------------------------------------------- /lambda/textract_sync/app/requirements.txt: -------------------------------------------------------------------------------- 1 | amazon-textract-caller 2 | -------------------------------------------------------------------------------- /lambda/textract_sync/env.json: -------------------------------------------------------------------------------- 1 | 2 | 3 | "HelloWorldFunction": { 4 | "S3_OUTPUT_PREFIX": "textract-output", 5 | "S3_OUTPUT_BUCKET": "schademcdkstackpaystubst-schademcdkidpstackpython-1fvi0dqoz24lj", 6 | "SQS_QUEUE_URL": "SchademCdkStackPaystubStack-textractsynctaskSyncRequestsBC26E72B-kID9dZUtmCZM" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /lambda/textract_sync/events/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "Records": [ 3 | { 4 | "messageId": "e9f32640-f517-422a-b22b-aa5f9630b545", 5 | "receiptHandle": "AQEBFsALOXYR+wBoaOuipqckuug4nzo9nZu3kQn6/29y3fGqgCZA4qC2rmBQh4Gwk1aMRZw1Ln36WAuua11krVJk965MLskbZX+ssop0PmoOfKIO/amBDVKii1GoONjSDf4iEjskylhl6hsWerMbZVV9EtB7Ns3DwjfSGW/KjkrFXq0xSU3E6kJQyvT4mL/QYyZxMQioLmcFVIEUEXjJAiXGdkCvA07WA2ZxGxiMNg5xGCuQJZweG8Lu/jGVuz8OGP3pra6rI2PsLBYuQyi2wuXjPC08hThV82O/FMPwMK8q/LgoHQ0LqSHPIP4tzC2C6qwc9POPxcVStMKcwXmqFVAWu8mi9Qeuen/OYi7LE7hqsSWBopKmtH+EKOuXufqvzH6oIbHJbTYe8tMA9Wo/k3vMpZdoyy1YTU5NQRxBMP3JT0dFdUSRlLug/teEzVQphqwS78xsVArIflkEaOIVRMnQ2bF/WelPLW8AxEEZii4eI4E=", 6 | "body": "{\"Token\": 
\"AQCcAAAAKgAAAAMAAAAAAAAAAYLAykXaD0L6sxF5tTZdGf4fFXbifBuRyL9GYJQgebAV6p/Z6RWmH3SA+sXA9AnHizaVC/1Hk8WJ+9M0v2+ns1rKhG+VRqc0C+zdxZApsXgiqVXczaDMdAZtR7Tl16WY5sgPKIc=5bN+WqJIdPClk/vy94JveNPfoS2k4dvoTRBknpSWkEIluL1PftteQID4q7FpVhdgtemfWLgZGh+quOA+8ZL3hJVVCYvlsU8/R3jv2XpTCvWTB3+F1okR4v27I/37V8C5MppyGmM4IrHmiYQcUjxAI8oks+RAZNVXaQbzg/qWadyx1KOmXG55iA9FDSx3Z04mNOh3WoVKZXqGmjjqw6q1S/zZXor+KIZO4ZwygwGnR93o4uaxhbG+nzMWyIRtEdqHKVx5SWGDbVysSj1p7h4ITM/pAq2SKj6aNtbjpulkuN/YTaNSk3PqYYLnWKP37+1KZr457zh8kCEwhoSfGwSHb0KXobkag9BnUZHADcvSvm0tePNr1ucdWYJrnEq1El6AfjOagYdEF5fMGVB7Koqy4PtP1wh4IWPaklbdQDZ5LNlpNNfUVRqTiR2pbhb+myW8BGamIm+KSES4W5/U6GxyCk1cAQovCM4BMTFOuv5uvbJMGe9BYEn+Vqq2a/6+yzUUS3CvKoVtqmx8wgdNdyOd\", \"Payload\": {\"manifest\": {\"textractFeatures\": [\"FORMS\", \"QUERIES\"], \"s3Path\": \"s3://schademcdkstackpaystuban-schademcdkidpstackpaystu-bt0j5wq0zftu/uploads/000429.pdf-1.jpg\", \"queriesConfig\": [{\"alias\": \"W2_FORM_YEAR\", \"text\": \"What is the form year ?\"}, {\"alias\": \"W2_FORM_TYPE\", \"text\": \"What is the form type ?\"}, {\"alias\": \"W2_EMPLOYEE_SSN\", \"text\": \"What is the Employee SSN ?\"}, {\"alias\": \"W2_EMPLOYER_NAME\", \"text\": \"What is the Employer Name ?\"}, {\"alias\": \"W2_WAGES_TIPS_OTHER\", \"text\": \"What is wages, tips, other compensation amount ?\"}, {\"alias\": \"W2_FEDERAL_INCOME_TAX\", \"text\": \"What is the Federal Income Tax withheld amount ?\"}, {\"alias\": \"W2_SS_WAGES\", \"text\": \"What is the social security wages amount ?\"}, {\"alias\": \"W2_SS_TAX\", \"text\": \"What is the social security Taxes withheld amount ?\"}, {\"alias\": \"W2_12a_VALUE_TYPE\", \"text\": \"What is the value type in Box 12 a ?\"}, {\"alias\": \"W2_12a_VALUE_AMOUNT\", \"text\": \"What is the value amount in Box 12 a ?\"}, {\"alias\": \"W2_12b_VALUE_TYPE\", \"text\": \"What is the value type in Box 12 b ?\"}, {\"alias\": \"W2_12b_VALUE_AMOUNT\", \"text\": \"What is the value amount in Box 12 b ?\"}, {\"alias\": \"W2_12c_VALUE_TYPE\", \"text\": \"What 
is the value type in Box 12 c ?\"}, {\"alias\": \"W2_13_STATUTORY\", \"text\": \"Is Box 13 Statutory employee selected ?\"}, {\"alias\": \"W2_13_RETIREMENT_PLAN\", \"text\": \"Is Box 13 Retirement plan selected ?\"}, {\"alias\": \"W2_13_THIRD_PARTY_SICK_PAY\", \"text\": \"Is Box 13 Third - party sick pay selected ?\"}]}, \"mime\": \"image/jpeg\", \"classification\": {\"documentType\": \"AWS_W2\"}, \"numberOfPages\": 1, \"Random\": {\"randomNumber\": 20}, \"textract_result\": {\"TextractOutputJsonPath\": \"s3://schademcdkstackpaystuban-schademcdkidpstackpaystu-bt0j5wq0zftu/textract-output/000429.pdf-12022-06-14T01:19:14.763537/000429.pdf-1.json\"}, \"txt_output_location\": {\"TextractOutputCSVPath\": \"s3://schademcdkstackpaystuban-schademcdkidpstackpaystu-bt0j5wq0zftu/txt_output/2022-06-14T01:19:17+00:00/000429.pdf-1.txt\"}}, \"ExecutionId\": \"arn:aws:states:us-east-1:913165245630:execution:PaystubW2WorkflowPythonFCA0DA8F-FBUPIeYaS6Qb:000429pdf-1jpg2022-06-14T011907975165\"}", 7 | "attributes": { 8 | "ApproximateReceiveCount": "1", 9 | "SentTimestamp": "1655169563470", 10 | "SenderId": "AROA5JHHD3S7JITUAYAMG:SchademCdkStackPaystubAnd-textractsynctaskwithconf-hmuPNa9kDDSw", 11 | "ApproximateFirstReceiveTimestamp": "1655169563479" 12 | }, 13 | "messageAttributes": {}, 14 | "md5OfBody": "f65e89960a426fbe9ca091f2690a52b5", 15 | "eventSource": "aws:sqs", 16 | "eventSourceARN": "arn:aws:sqs:us-east-1:913165245630:SchademCdkStackPaystubAndW2Stack-textractsynctaskwithconfigSyncRequ-WeRe6ALWgek7", 17 | "awsRegion": "us-east-1" 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /lambda/textract_sync/events/simple-event.json: -------------------------------------------------------------------------------- 1 | { 2 | "manifest": { 3 | "S3Path": "s3://sdx-textract-us-east-1/employeeapp20210510.png" 4 | }, 5 | "mime": "image/png", 6 | "numberOfPages": 1 7 | } 8 | 
-------------------------------------------------------------------------------- /lambda/textract_sync/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: > 4 | python3.9 5 | 6 | Sample SAM Template for sam-app 7 | 8 | Globals: 9 | Function: 10 | Timeout: 900 11 | 12 | Resources: 13 | SyncFunction: 14 | Type: AWS::Serverless::Function 15 | Properties: 16 | PackageType: Image 17 | Architectures: 18 | - x86_64 19 | Environment: 20 | Variables: 21 | S3_OUTPUT_PREFIX: textract-output 22 | S3_OUTPUT_BUCKET: schademcdkstackpaystubst-schademcdkidpstackpython-1fvi0dqoz24lj 23 | SQS_QUEUE_URL: SchademCdkStackPaystubStack-textractsynctaskSyncRequestsBC26E72B-kID9dZUtmCZM 24 | Metadata: 25 | Dockerfile: Dockerfile 26 | DockerContext: . 27 | DockerTag: python3.9-v1 28 | 29 | -------------------------------------------------------------------------------- /lambda/textract_sync/test_sam_local.sh: -------------------------------------------------------------------------------- 1 | sam build -------------------------------------------------------------------------------- /lambda/workmail_s3/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/lambda/python:3.9-x86_64 2 | 3 | RUN /var/lang/bin/python -m pip install --upgrade pip 4 | 5 | # Copy function code 6 | COPY app/* ${LAMBDA_TASK_ROOT}/ 7 | 8 | # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile) 9 | CMD [ "main.lambda_handler" ] 10 | -------------------------------------------------------------------------------- /lambda/workmail_s3/app/main.py: -------------------------------------------------------------------------------- 1 | from email.message import Message 2 | from botocore.exceptions import ClientError 3 | import boto3 4 | import email 5 | import os 6 | import uuid 
def download_email(message_id):
    """
    This method downloads full email MIME content using GetRawMessageContent API and uses email.parser class
    for parsing it into Python email.message.EmailMessage class.
    Reference:
        https://docs.python.org/3.7/library/email.message.html#email.message.EmailMessage
        https://docs.python.org/3/library/email.parser.html
    Parameters
    ----------
    message_id: string, required
        message_id of the email to download
    Returns
    -------
    email.message.Message
        EmailMessage representation the downloaded email
    """
    response = workmail_message_flow.get_raw_message_content(messageId=message_id)
    email_content = response['messageContent'].read()
    email_generation_policy = policy.SMTP.clone(refold_source='none')
    print("Downloaded email from WorkMail successfully")
    return email.message_from_bytes(email_content, policy=email_generation_policy)


def _iter_pdf_attachments(parsed_email):
    """Yield (file name, decoded bytes) for every PDF attachment of the message."""
    for part in parsed_email.walk():
        if part.is_attachment() and part.get_content_type() == 'application/pdf':
            yield part.get_filename(), part.get_payload(decode=True)


def lambda_handler(event, context):
    """Copy the PDF attachments of a WorkMail message to S3.

    The message identified by ``event['messageId']`` is downloaded once and
    each PDF attachment is written to
    ``s3://<S3_OUTPUT_BUCKET>/<S3_OUTPUT_PREFIX>/<uuid>/<file name>``.

    Parameters
    ----------
    event: dict, required
        WorkMail event; ``envelope.mailFrom.address``, ``subject``,
        ``flowDirection`` and ``messageId`` are read.
    context: object
        Lambda context; unused.

    Returns
    -------
    dict
        The DEFAULT action for all recipients, so the email is delivered
        normally.

    Raises
    ------
    ValueError
        If S3_OUTPUT_BUCKET or S3_OUTPUT_PREFIX is not set.
    botocore.exceptions.ClientError
        Re-raised for every client error except 'MessageFrozen'.
    """
    from_address = event['envelope']['mailFrom']['address']
    subject = event['subject']
    flow_direction = event['flowDirection']
    message_id = event['messageId']
    print(f"Received email with message ID {message_id}, flowDirection {flow_direction}, from {from_address} with Subject {subject}")

    # Validate the configuration before calling any service.
    s3_output_bucket = os.environ.get('S3_OUTPUT_BUCKET')
    s3_output_prefix = os.environ.get('S3_OUTPUT_PREFIX')
    if not s3_output_bucket or not s3_output_prefix:
        raise ValueError(
            f"no s3_output_bucket: {s3_output_bucket} or s3_output_prefix: {s3_output_prefix} defined."
        )

    try:
        parsed_email = download_email(message_id)

        # Can take in multiple pdf attachments
        for index, (file_name, payload) in enumerate(_iter_pdf_attachments(parsed_email)):
            # The file name comes from the sender: keep only its last path
            # component and fall back to a generated name when it is missing.
            safe_name = os.path.basename(file_name or '') or f'attachment-{index}.pdf'
            # A fresh uuid per attachment keeps equally named files apart.
            object_key = f"{s3_output_prefix.rstrip('/')}/{uuid.uuid4()}/{safe_name}"
            s3.put_object(Body=payload, Bucket=s3_output_bucket, Key=object_key)
            print('Finished upload to S3 ', object_key)

    except ClientError as e:
        if e.response['Error']['Code'] == 'MessageFrozen':
            # Redirect emails are not eligible for update, handle it gracefully.
            print(f"Message {message_id} is not eligible for update. This is usually the case for a redirected email")
        else:
            # Send some context about this error to Lambda Logs
            print(e)
            if e.response['Error']['Code'] == 'ResourceNotFoundException':
                print(f"Message {message_id} does not exist. Messages in transit are no longer accessible after 1 day")
            elif e.response['Error']['Code'] == 'InvalidContentLocation':
                print('WorkMail could not access the updated email content. See https://docs.aws.amazon.com/workmail/latest/adminguide/update-with-lambda.html')
            raise

    return {
        'actions': [
            {
                'allRecipients': True,  # For all recipients
                'action': {'type': 'DEFAULT'}  # let the email be sent normally
            }
        ]
    }
| "name": "Martin Schade", 35 | "email": "45048633+schadem@users.noreply.github.com", 36 | "organization": false 37 | }, 38 | "devDependencies": { 39 | "@types/jest": "^29.5.12", 40 | "@types/node": "^18", 41 | "@typescript-eslint/eslint-plugin": "^6", 42 | "@typescript-eslint/parser": "^6", 43 | "aws-cdk-lib": "2.135.0", 44 | "constructs": "10.0.5", 45 | "eslint": "^8", 46 | "eslint-import-resolver-typescript": "^3.6.1", 47 | "eslint-plugin-import": "^2.29.1", 48 | "jest": "^29.7.0", 49 | "jest-junit": "^15", 50 | "jsii": "~5.3.0", 51 | "jsii-diff": "^1.96.0", 52 | "jsii-docgen": "^10.3.26", 53 | "jsii-pacmak": "^1.96.0", 54 | "jsii-rosetta": "~5.3.0", 55 | "projen": "^0.80.17", 56 | "standard-version": "^9", 57 | "ts-jest": "^29.1.2", 58 | "typescript": "^5.4.3" 59 | }, 60 | "peerDependencies": { 61 | "aws-cdk-lib": "^2.135.0", 62 | "constructs": "^10.0.5" 63 | }, 64 | "keywords": [ 65 | "amazon-textract", 66 | "aws-cdk", 67 | "cdk", 68 | "idp", 69 | "schadem", 70 | "textract" 71 | ], 72 | "main": "lib/index.js", 73 | "license": "MIT-0", 74 | "version": "0.0.0", 75 | "jest": { 76 | "testMatch": [ 77 | "/src/**/__tests__/**/*.ts?(x)", 78 | "/(test|src)/**/*(*.)@(spec|test).ts?(x)" 79 | ], 80 | "clearMocks": true, 81 | "collectCoverage": true, 82 | "coverageReporters": [ 83 | "json", 84 | "lcov", 85 | "clover", 86 | "cobertura", 87 | "text" 88 | ], 89 | "coverageDirectory": "coverage", 90 | "coveragePathIgnorePatterns": [ 91 | "/node_modules/" 92 | ], 93 | "testPathIgnorePatterns": [ 94 | "/node_modules/" 95 | ], 96 | "watchPathIgnorePatterns": [ 97 | "/node_modules/" 98 | ], 99 | "reporters": [ 100 | "default", 101 | [ 102 | "jest-junit", 103 | { 104 | "outputDirectory": "test-reports" 105 | } 106 | ] 107 | ], 108 | "transform": { 109 | "^.+\\.[t]sx?$": [ 110 | "ts-jest", 111 | { 112 | "tsconfig": "tsconfig.dev.json" 113 | } 114 | ] 115 | } 116 | }, 117 | "types": "lib/index.d.ts", 118 | "stability": "stable", 119 | "jsii": { 120 | "outdir": "dist", 121 | 
"targets": { 122 | "java": { 123 | "package": "software.amazon.textract.idp", 124 | "maven": { 125 | "groupId": "software.amazon.textract.idp", 126 | "artifactId": "idp-cdk-constructs" 127 | } 128 | }, 129 | "python": { 130 | "distName": "amazon-textract-idp-cdk-constructs", 131 | "module": "amazon_textract_idp_cdk_constructs" 132 | } 133 | }, 134 | "tsc": { 135 | "outDir": "lib", 136 | "rootDir": "src" 137 | } 138 | }, 139 | "//": "~~ Generated by projen. To modify, edit .projenrc.js and run \"npx projen\"." 140 | } 141 | -------------------------------------------------------------------------------- /src/cfnCustomResourceConfiguratorPrefill.ts: -------------------------------------------------------------------------------- 1 | import * as path from 'path'; 2 | import * as cdk from 'aws-cdk-lib'; 3 | //import * as dynamodb from 'aws-cdk-lib/aws-dynamodb'; 4 | import * as iam from 'aws-cdk-lib/aws-iam'; 5 | import * as lambda from 'aws-cdk-lib/aws-lambda'; 6 | import * as customResources from 'aws-cdk-lib/custom-resources'; 7 | import { Construct } from 'constructs'; 8 | 9 | 10 | export interface TextractConfigurationProps { 11 | /** @deprecated User configurationTableName in the future */ 12 | readonly configuration_table?:string; 13 | readonly configurationTableArn:string; 14 | readonly configurationTableName:string; 15 | /** Function used to initialize the DynamoDB table for the Classification Configuration 16 | * The Function has to implement CloudFormation Custom Resource https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/template-custom-resources-lambda.html 17 | */ 18 | readonly configurationInitFunction?: lambda.IFunction; 19 | } 20 | 21 | /** 22 | * CloudFormation Custom Resources importing default configuration values from default_config.csv 23 | * into DynamoDB table. 24 | * Used by the Configuration Construct to configure Textrat features. 25 | * The DynamoDB table can be configured to your requirements. 
/**
 * CloudFormation Custom Resource that runs an init function against the
 * DynamoDB configuration table.
 *
 * When no `configurationInitFunction` is supplied, a Docker image function is
 * built from `../lambda/cfn_custom_configurator_prefill/` and receives the
 * table name as `CONFIGURATION_TABLE`. In both cases the function is granted
 * `dynamodb:PutItem` and `dynamodb:GetItem` on `configurationTableArn` and is
 * invoked through a custom resource provider.
 *
 * The configuration has to match the manifest format.
 */
export class TextractConfiguration extends Construct {
  /** Function backing the custom resource (supplied by the caller or created here). */
  public configurationInitFunction: lambda.IFunction;
  /** The `Response` attribute returned by the custom resource. */
  public response: string;

  constructor(scope: Construct, id: string, props: TextractConfigurationProps) {
    super(scope, id);

    this.configurationInitFunction = props.configurationInitFunction ??
      new lambda.DockerImageFunction(this, id, {
        code: lambda.DockerImageCode.fromImageAsset(path.join(__dirname, '../lambda/cfn_custom_configurator_prefill/')),
        architecture: lambda.Architecture.X86_64,
        memorySize: 128,
        timeout: cdk.Duration.seconds(600),
        environment: {
          LOG_LEVEL: 'DEBUG',
          CONFIGURATION_TABLE: props.configurationTableName,
        },
      });

    this.configurationInitFunction.addToRolePolicy(
      new iam.PolicyStatement({
        actions: ['dynamodb:PutItem', 'dynamodb:GetItem'],
        resources: [props.configurationTableArn],
      }));

    const provider = new customResources.Provider(this, 'Provider', {
      onEventHandler: this.configurationInitFunction,
    });

    // Forward only the plain string settings. Passing `props` as a whole would
    // also hand the optional `configurationInitFunction` construct to the
    // custom resource, and a construct is not a serializable property value.
    const resource = new cdk.CustomResource(this, 'Resource', {
      serviceToken: provider.serviceToken,
      properties: {
        ...(props.configuration_table !== undefined
          ? { configuration_table: props.configuration_table }
          : {}),
        configurationTableArn: props.configurationTableArn,
        configurationTableName: props.configurationTableName,
      },
    });

    this.response = resource.getAtt('Response').toString();
  }
}
/** Properties for {@link RDSAuroraServerless}. */
export interface RDSAuroraServerlessProps {
  /** VPC to install the database into */
  readonly vpc: ec2.IVpc;
}

/**
 * Aurora PostgreSQL cluster with a Serverless v2 writer and the Data API
 * enabled, together with its security groups.
 *
 * Two security groups are created in the given VPC: one for the cluster and
 * one intended for Lambda functions. The cluster group accepts TCP 5432 and
 * 443 from itself and from the Lambda group. After the cluster is created an
 * {@link RdsServerlessInit} resource is added with the cluster ARN and the
 * ARN of the cluster secret.
 */
export class RDSAuroraServerless extends Construct {

  public dbCluster: rds.IServerlessCluster;
  public auroraSecurityGroup: ec2.ISecurityGroup;
  public lambdaSecurityGroup: ec2.ISecurityGroup;

  constructor(scope: Construct, id: string, readonly props: RDSAuroraServerlessProps) {
    super(scope, id);

    this.lambdaSecurityGroup = new ec2.SecurityGroup(this, 'LambdaSG', { allowAllOutbound: true, vpc: props.vpc });
    this.auroraSecurityGroup = new ec2.SecurityGroup(this, 'Aurora', { allowAllOutbound: true, vpc: props.vpc });
    this.auroraSecurityGroup.addIngressRule(this.auroraSecurityGroup, ec2.Port.tcp(5432), 'fromSameSG');
    this.auroraSecurityGroup.addIngressRule(this.auroraSecurityGroup, ec2.Port.tcp(443), 'fromSameSG');
    this.auroraSecurityGroup.addIngressRule(this.lambdaSecurityGroup, ec2.Port.tcp(5432), 'LambdaIngreess');
    this.auroraSecurityGroup.addIngressRule(this.lambdaSecurityGroup, ec2.Port.tcp(443), 'LambdaIngreess');

    // AURORA
    // Keep the concrete cluster in a local so `secret` can be read without
    // casting the interface-typed `dbCluster` field.
    const cluster = new rds.DatabaseCluster(this, id + 'AuroraPSQL', {
      engine: rds.DatabaseClusterEngine.auroraPostgres(
        {
          version: AuroraPostgresEngineVersion.VER_15_5,
        },
      ),
      writer: rds.ClusterInstance.serverlessV2('writer'),
      serverlessV2MinCapacity: 0.5,
      serverlessV2MaxCapacity: 2,
      vpc: props.vpc,
      securityGroups: [this.auroraSecurityGroup],
      enableDataApi: true,
    });
    this.dbCluster = cluster;

    // Fail with a clear message instead of relying on a non-null assertion.
    const clusterSecret = cluster.secret;
    if (clusterSecret === undefined) {
      throw new Error('The Aurora cluster has no secret; RdsServerlessInit requires the secret ARN.');
    }

    const rdsServerlessInit = new RdsServerlessInit(this, 'RdsServerlessInit', {
      dbClusterSecretARN: clusterSecret.secretArn,
      dbClusterARN: cluster.clusterArn,
    });
    rdsServerlessInit.node.addDependency(cluster);
  }
}
onEventHandler: this.rdsServerlessInit, 47 | }); 48 | 49 | const resource = new cdk.CustomResource(this, 'Resource', { 50 | serviceToken: provider.serviceToken, 51 | properties: props, 52 | }); 53 | 54 | this.response = resource.getAtt('Response').toString(); 55 | } 56 | } 57 | 58 | -------------------------------------------------------------------------------- /src/searchablePDF.ts: --------------------------------------------------------------------------------
import * as path from 'path';
import { Duration } from 'aws-cdk-lib';
import * as iam from 'aws-cdk-lib/aws-iam';
import * as lambda from 'aws-cdk-lib/aws-lambda';
import * as sfn from 'aws-cdk-lib/aws-stepfunctions';
import * as tasks from 'aws-cdk-lib/aws-stepfunctions-tasks';
import { Construct } from 'constructs';

export interface SearchablePDFProps {
  /**
   * memory of Lambda function (may need to increase for larger documents)
   * @default 1024
   */
  readonly lambdaMemoryMB?: number;
  /**
   * Lambda function timeout in seconds
   * @default 900
   */
  readonly lambdaTimeout?: number;
  /**
   * Existing function to invoke. When omitted, a Docker-image function is built
   * from `lambda/searchablePDF`.
   */
  readonly searchablePDFFunction?: lambda.IFunction;
  /** Bucket name used to scope the default S3 policy; all resources (`*`) when omitted. */
  readonly s3TextractOutputBucket?: string;
  /** Not read by the construct itself. */
  readonly s3PDFBucket?: string;
  /**
   * prefix for the incoming document. Will be used to create role
   * @default ''
   */
  readonly s3InputPrefix?: string;
  /**
   * List of PolicyStatements to attach to the Lambda function for S3 GET and LIST.
   * When set, these replace the default S3 statement entirely.
   */
  readonly inputPolicyStatements?: iam.PolicyStatement[];
}

/**
 * This construct takes in a JSON with two s3 Paths, s3TextractOutput, s3PDFBucket
 *
 * example s3Path:
 * ```
 * {"s3TextractOutput": "s3://bucketname/prefix/1"}
 * {"s3PDFBucket": "s3://bucketname/prefix/document.pdf"}
 * ```
 *
 * The fragment consists of a single `LambdaInvoke` task whose output is the
 * function's `$.Payload`.
 */
export class SearchablePDF extends sfn.StateMachineFragment {
  public readonly startState: sfn.State;
  public readonly endStates: sfn.INextable[];
  public readonly searchablePDFFunction: lambda.IFunction;

  constructor(parent: Construct, id: string, props: SearchablePDFProps) {
    super(parent, id);

    const lambdaMemoryMB = props.lambdaMemoryMB ?? 1024;
    const lambdaTimeout = props.lambdaTimeout ?? 900;
    const s3InputPrefix = props.s3InputPrefix ?? '';

    // Honor a caller-supplied function; memory/timeout settings only apply to
    // the function created here.
    this.searchablePDFFunction = props.searchablePDFFunction ?? new lambda.DockerImageFunction(
      this,
      'SearchablePDF',
      {
        code: lambda.DockerImageCode.fromImageAsset(
          path.join(__dirname, '../lambda/searchablePDF/'),
        ),
        architecture: lambda.Architecture.X86_64,
        memorySize: lambdaMemoryMB,
        timeout: Duration.seconds(lambdaTimeout),
      },
    );

    if (props.inputPolicyStatements === undefined) {
      // ARNs are not file-system paths: path.posix keeps '/' separators on every
      // platform while producing the same string path.join did on Linux/macOS.
      const resources = props.s3TextractOutputBucket === undefined
        ? ['*']
        : [
          path.posix.join(`arn:aws:s3:::${props.s3TextractOutputBucket}`, s3InputPrefix, '/'),
          path.posix.join(`arn:aws:s3:::${props.s3TextractOutputBucket}`, s3InputPrefix, '/*'),
        ];
      // NOTE(review): s3:ListBucket is evaluated against the bucket ARN, so with the
      // object-level resources above it presumably never matches — confirm whether a
      // separate bucket-level statement is wanted.
      this.searchablePDFFunction.addToRolePolicy(
        new iam.PolicyStatement({
          actions: ['s3:GetObject', 's3:ListBucket', 's3:PutObject'],
          resources,
        }),
      );
    } else {
      for (const statement of props.inputPolicyStatements) {
        this.searchablePDFFunction.addToRolePolicy(statement);
      }
    }

    const searchablePDFLambdaInvoke = new tasks.LambdaInvoke(this, id, {
      lambdaFunction: this.searchablePDFFunction,
      outputPath: '$.Payload',
    });
    this.startState = searchablePDFLambdaInvoke;
    this.endStates = [searchablePDFLambdaInvoke];
  }
}
-------------------------------------------------------------------------------- /src/textractClassificationConfigurator.ts: -------------------------------------------------------------------------------- 1 | import * as path from 'path'; 2 | import { Duration, RemovalPolicy } from 'aws-cdk-lib'; 3 | import * as dynamodb from 'aws-cdk-lib/aws-dynamodb'; 4 | import * as iam from 'aws-cdk-lib/aws-iam'; 5 | import * as lambda from 'aws-cdk-lib/aws-lambda'; 6 | import * as sfn from 'aws-cdk-lib/aws-stepfunctions'; 7 | import * as tasks from 'aws-cdk-lib/aws-stepfunctions-tasks'; 8 | import { Construct } from 'constructs'; 9 | import { TextractConfiguration } from './cfnCustomResourceConfiguratorPrefill'; 10 | 11 | export interface TextractClassificationConfiguratorProps { 12 | /** memory of Lambda function (may need to increase for larger documents) */ 13 | readonly lambdaMemoryMB?:number; 14 | readonly lambdaTimeout?:number; 15 | readonly lambdaLogLevel?:string; 16 | readonly configurationTable?:dynamodb.ITable; 17 | } 18 | 19 | /** 20 | * Looks for a matching DOCYMENT_TYPE in the configurationTableName 21 | * and sets the CONFIG value (when found) to the context, so subsequent calls to 22 | * Textract use those values.
23 | * 24 | * This is an entry from the default config 25 | * AWS_PAYSTUBS,"{""queriesConfig"": [{""alias"": ""PAYSTUB_PERIOD_START_DATE"", ""text"": ""What is the Pay Period Start Date?""}, {""alias"": ""PAYSTUB_PERIOD_END_DATE"", ""text"": ""What is the Pay Period End Date?""}, {""alias"": ""PAYSTUB_PERIOD_PAY_DATE"", ""text"": ""What is the Pay Date?""}, {""alias"": ""PAYSTUB_PERIOD_EMPLOYEE_NAME"", ""text"": ""What is the Employee Name?""}, {""alias"": ""PAYSTUB_PERIOD_COMPANY_NAME"", ""text"": ""What is the company Name?""}, {""alias"": ""PAYSTUB_PERIOD_CURRENT_GROSS_PAY"", ""text"": ""What is the Current Gross Pay?""}, {""alias"": ""PAYSTUB_PERIOD_YTD_GROSS_PAY"", ""text"": ""What is the YTD Gross Pay?""}, {""alias"": ""PAYSTUB_PERIOD_REGULAR_HOURLY_RATE"", ""text"": ""What is the regular hourly rate?""}, {""alias"": ""PAYSTUB_PERIOD_HOLIDAY_RATE"", ""text"": ""What is the holiday rate?""}], ""textractFeatures"": [""QUERIES""]}" 26 | * 27 | * So, if the "classification"."documentType" in the Step Function Input is AWS_PAYSTUBS 28 | * then it will set the queriesConfig in the manifest for the subsequent Textract Calls in the Step Function flow 29 | * 30 | * Input: "classification"."documentType" 31 | * Output: config set to manifest 32 | * 33 | * Example (Python) 34 | * ``` 35 | configurator_task = tcdk.TextractClassificationConfigurator( 36 | self, f"{workflow_name}-Configurator", 37 | ) 38 | 39 | * ``` 40 | */ 41 | export class TextractClassificationConfigurator extends sfn.StateMachineFragment { 42 | public readonly startState: sfn.State; 43 | public readonly endStates: sfn.INextable[]; 44 | public configuratorFunction:lambda.IFunction; 45 | public configurationTable:dynamodb.ITable; 46 | public configurationTableName:string; 47 | 48 | constructor(parent: Construct, id: string, props: TextractClassificationConfiguratorProps) { 49 | super(parent, id); 50 | 51 | var lambdaMemoryMB= props.lambdaMemoryMB === undefined ? 
1024 : props.lambdaMemoryMB; 52 | var lambdaTimeout= props.lambdaTimeout === undefined ? 900 : props.lambdaTimeout; 53 | var lambdaLogLevel= props.lambdaLogLevel === undefined ? 'DEBUG' : props.lambdaLogLevel; 54 | 55 | if (props.configurationTable === undefined) { 56 | this.configurationTable = new dynamodb.Table(this, 'TextractConfigurationTable', { 57 | partitionKey: { 58 | name: 'DOCUMENT_TYPE', 59 | type: dynamodb.AttributeType.STRING, 60 | }, 61 | removalPolicy: RemovalPolicy.DESTROY, 62 | billingMode: dynamodb.BillingMode.PAY_PER_REQUEST, 63 | }); 64 | this.configurationTableName=this.configurationTable.tableName; 65 | const configurationInitFunction = new TextractConfiguration(this, 'DocTypeConfig', { 66 | configurationTableName: this.configurationTable.tableName, 67 | configurationTableArn: this.configurationTable.tableArn, 68 | }); 69 | configurationInitFunction.node.addDependency(this.configurationTable); 70 | } else { 71 | this.configurationTable=props.configurationTable; 72 | this.configurationTableName=props.configurationTable.tableName; 73 | } 74 | 75 | this.configuratorFunction = new lambda.DockerImageFunction(this, 'ClassificationConfigurator', { 76 | code: lambda.DockerImageCode.fromImageAsset(path.join(__dirname, '../lambda/configurator/')), 77 | memorySize: lambdaMemoryMB, 78 | architecture: lambda.Architecture.X86_64, 79 | timeout: Duration.seconds(lambdaTimeout), 80 | environment: { 81 | CONFIGURATION_TABLE: this.configurationTable.tableName, 82 | LOG_LEVEL: lambdaLogLevel, 83 | }, 84 | }); 85 | this.configuratorFunction.addToRolePolicy(new iam.PolicyStatement({ 86 | actions: ['dynamodb:PutItem', 'dynamodb:GetItem'], 87 | resources: [this.configurationTable.tableArn], 88 | })); 89 | 90 | const configuratorLambdaInvoke = new tasks.LambdaInvoke(this, id, { 91 | lambdaFunction: this.configuratorFunction, 92 | outputPath: '$.Payload', 93 | }); 94 | this.startState=configuratorLambdaInvoke; 95 | this.endStates=[configuratorLambdaInvoke]; 96 | } 97 
| } 98 | -------------------------------------------------------------------------------- /src/textractPdfMapperForFhir.ts: --------------------------------------------------------------------------------
import * as path from 'path';
import { Duration } from 'aws-cdk-lib';
import * as iam from 'aws-cdk-lib/aws-iam';
import * as lambda from 'aws-cdk-lib/aws-lambda';
import * as sfn from 'aws-cdk-lib/aws-stepfunctions';
import * as tasks from 'aws-cdk-lib/aws-stepfunctions-tasks';
import { Construct } from 'constructs';

export interface TextractPdfMapperForFhirProps {
  /**
   * memory of Lambda function (may need to increase for larger documents)
   * @default 1024
   */
  readonly lambdaMemoryMB?: number;
  /**
   * Lambda function timeout in seconds
   * @default 900
   */
  readonly lambdaTimeout?: number;
  /**
   * Existing function to invoke. When omitted, a Docker-image function is built
   * from `lambda/pdf_mapper_for_fhir`.
   */
  readonly pdfMapperForFhirFunction?: lambda.IFunction;
  /**
   * Value of the LOG_LEVEL environment variable of the created function
   * @default 'INFO'
   */
  readonly lambdaLogLevel?: string;
  /**
   * Value of the HEALTHLAKE_ENDPOINT environment variable of the created function
   * @default ''
   */
  readonly healthlakeEndpoint?: string;
  /** Bucket name used to scope the default S3 policy; all resources (`*`) when omitted. */
  readonly s3InputBucket?: string;
  /**
   * prefix for the incoming document. Will be used to create role
   * @default 'uploads'
   */
  readonly s3InputPrefix?: string;
  /**
   * List of PolicyStatements to attach to the Lambda function for S3 GET and LIST.
   * When set, none of the default statements (S3 and HealthLake) are added.
   */
  readonly inputPolicyStatements?: iam.PolicyStatement[];
}

/**
 * Step Functions fragment made of a single `LambdaInvoke` task whose output is
 * the function's `$.Payload`.
 *
 * Unless `pdfMapperForFhirFunction` is supplied, the task invokes a Docker-image
 * Lambda function built from `lambda/pdf_mapper_for_fhir`, configured with the
 * `HEALTHLAKE_ENDPOINT` and `LOG_LEVEL` environment variables.
 *
 * Unless `inputPolicyStatements` is supplied, the function's role is granted
 * `s3:GetObject` / `s3:ListBucket` (scoped to `s3InputBucket` and `s3InputPrefix`
 * when a bucket is given) and `healthlake:CreateResource` on all resources.
 *
 * NOTE(review): the previous description and Python example were those of
 * `TextractPOCDecider`; the expected task input is not visible from this
 * construct and should be documented from the Lambda handler.
 */
export class TextractPdfMapperForFhir extends sfn.StateMachineFragment {
  public readonly startState: sfn.State;
  public readonly endStates: sfn.INextable[];
  public readonly pdfMapperForFhirFunction: lambda.IFunction;

  constructor(parent: Construct, id: string, props: TextractPdfMapperForFhirProps) {
    super(parent, id);

    const lambdaMemoryMB = props.lambdaMemoryMB ?? 1024;
    const lambdaTimeout = props.lambdaTimeout ?? 900;
    const lambdaLogLevel = props.lambdaLogLevel ?? 'INFO';
    const healthlakeEndpoint = props.healthlakeEndpoint ?? '';
    const s3InputPrefix = props.s3InputPrefix ?? 'uploads';

    // Honor a caller-supplied function; memory, timeout and environment settings
    // only apply to the function created here.
    this.pdfMapperForFhirFunction = props.pdfMapperForFhirFunction ?? new lambda.DockerImageFunction(
      this,
      'TextractPdfMapperForFhir',
      {
        code: lambda.DockerImageCode.fromImageAsset(
          path.join(__dirname, '../lambda/pdf_mapper_for_fhir/'),
        ),
        architecture: lambda.Architecture.X86_64,
        memorySize: lambdaMemoryMB,
        timeout: Duration.seconds(lambdaTimeout),
        environment: {
          HEALTHLAKE_ENDPOINT: healthlakeEndpoint,
          LOG_LEVEL: lambdaLogLevel,
        },
      },
    );

    if (props.inputPolicyStatements === undefined) {
      // ARNs are not file-system paths: path.posix keeps '/' separators on every
      // platform while producing the same string path.join did on Linux/macOS.
      const s3Resources = props.s3InputBucket === undefined
        ? ['*']
        : [
          path.posix.join(`arn:aws:s3:::${props.s3InputBucket}`, s3InputPrefix, '/'),
          path.posix.join(`arn:aws:s3:::${props.s3InputBucket}`, s3InputPrefix, '/*'),
        ];
      this.pdfMapperForFhirFunction.addToRolePolicy(
        new iam.PolicyStatement({
          actions: ['s3:GetObject', 's3:ListBucket'],
          resources: s3Resources,
        }),
      );
      this.pdfMapperForFhirFunction.addToRolePolicy(
        new iam.PolicyStatement({
          actions: ['healthlake:CreateResource'],
          resources: ['*'],
        }),
      );
    } else {
      for (const statement of props.inputPolicyStatements) {
        this.pdfMapperForFhirFunction.addToRolePolicy(statement);
      }
    }

    const pdfMapperForFhirLambdaInvoke = new tasks.LambdaInvoke(this, id, {
      lambdaFunction: this.pdfMapperForFhirFunction,
      outputPath: '$.Payload',
    });
    this.startState = pdfMapperForFhirLambdaInvoke;
    this.endStates = [pdfMapperForFhirLambdaInvoke];
  }
}
-------------------------------------------------------------------------------- /src/workmailS3IngestionPoint.ts: -------------------------------------------------------------------------------- 1 | import * as path from 'path'; 2 | import { Duration } from 'aws-cdk-lib'; 3 | import { IRole, ManagedPolicy, PolicyDocument, PolicyStatement, Role, ServicePrincipal } from 'aws-cdk-lib/aws-iam'; 4 | import { CfnPermission, DockerImageFunction, Function } from 'aws-cdk-lib/aws-lambda'; 5 | import * as lambda from 'aws-cdk-lib/aws-lambda'; 6 | import { Construct } from 'constructs'; 7 | 8 | export interface WorkmailS3IngestionPointProps { 9 | /** Bucket name to output data to */ 10 | readonly s3OutputBucket:string; 11 | /** The prefix to use to output files to */ 12 | readonly s3OutputPrefix:string; 13 | /** Region where WorkMailail instance exists */ 14 | readonly workmailRegion: string; 15 | /** Account number for WorkMail instance */ 16 | readonly workmailAccountNumber: string; 17 | /** Lambda function memory configuration (may need to increase for larger documents) */ 18 | readonly lambdaMemoryMB?: number; 19 | /** Lambda function timeout (may need to increase for larger documents) */ 20 | readonly lambdaTimeout?: number; 21 | } 22
/**
 * Provisions a Docker-image Lambda function (built from `lambda/workmail_s3`)
 * that Amazon WorkMail may invoke, together with its execution role.
 *
 * The role allows `workmailmessageflow:GetRawMessageContent` on the messages of
 * the given WorkMail account/region and S3 object access on the output bucket.
 * The function receives the target location through the `S3_OUTPUT_BUCKET` and
 * `S3_OUTPUT_PREFIX` environment variables.
 *
 * Defaults: 10240 MB memory and a 900 second timeout.
 */
export class WorkmailS3IngestionPoint extends Construct {
  private workmailAccountNumber: string;
  private workmailRegion: string;
  private s3OutputPrefix: string;
  private s3OutputBucket: string;

  /**
   * @param scope parent construct
   * @param id construct id
   * @param props see {@link WorkmailS3IngestionPointProps}
   * @throws Error when any of the four required string props is empty or missing
   */
  constructor(scope: Construct, id: string, readonly props: WorkmailS3IngestionPointProps) {
    super(scope, id);
    const { workmailRegion, workmailAccountNumber, s3OutputBucket, s3OutputPrefix } = props;
    if (!workmailAccountNumber || !workmailRegion || !s3OutputBucket || !s3OutputPrefix) {
      // Name every field the check covers, not just the WorkMail ones.
      throw new Error('workmailAccountNumber, workmailRegion, s3OutputBucket and s3OutputPrefix are required');
    }
    this.workmailAccountNumber = workmailAccountNumber;
    this.workmailRegion = workmailRegion;
    this.s3OutputBucket = s3OutputBucket;
    this.s3OutputPrefix = s3OutputPrefix;

    const workMailArn = `arn:aws:workmailmessageflow:${this.workmailRegion}:${this.workmailAccountNumber}:message/*/*/*`;

    const lambdaMemoryMB = props.lambdaMemoryMB ?? 10240;
    const lambdaTimeout = props.lambdaTimeout ?? 900;

    const workMailGetMsgPolicy = new PolicyDocument({
      statements: [
        new PolicyStatement({
          actions: ['workmailmessageflow:GetRawMessageContent'],
          resources: [workMailArn],
        }),
        new PolicyStatement({
          // ARNs are not file-system paths: path.posix keeps '/' separators on every
          // platform while producing the same string path.join did on Linux/macOS.
          resources: [
            path.posix.join(`arn:aws:s3:::${this.s3OutputBucket}`, '/*'),
            path.posix.join(`arn:aws:s3:::${this.s3OutputBucket}`, this.s3OutputPrefix, '/*'),
          ],
          actions: [
            // Was 's3:Object', which is not an S3 action and therefore granted nothing.
            // NOTE(review): 's3:GetObject' is the presumed intent — drop it if the
            // handler only writes.
            's3:GetObject',
            's3:PutObject',
            's3:PutObjectAcl',
          ],
        }),
      ],
    });

    // The role is handed to a Lambda function, so the Lambda service must be the
    // trusted principal; 'amazonaws.com' alone is not a service principal.
    const workmailServicesRole: IRole = new Role(this, 'WorkmailServicesRole', {
      assumedBy: new ServicePrincipal('lambda.amazonaws.com'),
      inlinePolicies: {
        WorkMailGetMsgPolicy: workMailGetMsgPolicy,
      },
      managedPolicies: [
        ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSLambdaBasicExecutionRole'),
      ],
    });

    /**
     * Lambda to download files from workMail and insert them into S3
     */
    const downloadPdfToS3Lambda: Function = new DockerImageFunction(
      this,
      'downloadPdfToS3Lambda',
      {
        code: lambda.DockerImageCode.fromImageAsset(path.join(__dirname, '../lambda/workmail_s3/')),
        memorySize: lambdaMemoryMB,
        architecture: lambda.Architecture.X86_64,
        timeout: Duration.seconds(lambdaTimeout),
        environment: {
          S3_OUTPUT_BUCKET: this.s3OutputBucket,
          S3_OUTPUT_PREFIX: this.s3OutputPrefix,
        },
        role: workmailServicesRole,
      },
    );

    // Use the required workmailRegion prop: the 'region' context key is usually
    // unset, which rendered the principal as 'workmail.undefined.amazonaws.com'.
    new CfnPermission(this, 'workmailPermission', {
      action: 'lambda:InvokeFunction',
      functionName: downloadPdfToS3Lambda.functionName,
      principal: `workmail.${this.workmailRegion}.amazonaws.com`,
    });

  }
}
-------------------------------------------------------------------------------- /test/comprehendClassification.test.ts: -------------------------------------------------------------------------------- 1 | import { Stack } from
'aws-cdk-lib'; 2 | import { Template } from 'aws-cdk-lib/assertions'; 3 | import * as sfn from 'aws-cdk-lib/aws-stepfunctions'; 4 | import { ComprehendGenericSyncSfnTask } from '../src'; 5 | 6 | let stack: Stack; 7 | beforeEach(() => { 8 | stack = new Stack(); 9 | }); 10 | 11 | describe('ClassificationTest', () => { 12 | test('ClassificationTest', () => { 13 | new ComprehendGenericSyncSfnTask(stack, 'idp-classification', { 14 | integrationPattern: sfn.IntegrationPattern.REQUEST_RESPONSE, 15 | comprehendClassifierArn: 'somearn', 16 | }); 17 | expect(Template.fromStack(stack).toJSON()).toMatchSnapshot(); 18 | const template = Template.fromStack(stack); 19 | template.resourceCountIs('AWS::Lambda::Function', 1); 20 | }); 21 | }); 22 | -------------------------------------------------------------------------------- /test/rdsAuroraServerless.test.ts: -------------------------------------------------------------------------------- 1 | import { Stack } from 'aws-cdk-lib'; 2 | import { Template } from 'aws-cdk-lib/assertions'; 3 | import * as ec2 from 'aws-cdk-lib/aws-ec2'; 4 | import { RDSAuroraServerless } from '../src'; 5 | 6 | let stack: Stack; 7 | beforeEach(() => { 8 | stack = new Stack(); 9 | }); 10 | 11 | describe('AuroraServerless', () => { 12 | test('AuroraServerless', () => { 13 | new RDSAuroraServerless(stack, 'csvtoAurora', { 14 | vpc: new ec2.Vpc(stack, 'someid', { 15 | ipAddresses: ec2.IpAddresses.cidr('10.0.0.0/16'), 16 | maxAzs: 2, 17 | subnetConfiguration: [{ 18 | cidrMask: 26, 19 | name: 'isolatedSubnet', 20 | subnetType: ec2.SubnetType.PUBLIC, 21 | }, { 22 | cidrMask: 26, 23 | name: 'privateWithNat', 24 | subnetType: ec2.SubnetType.PRIVATE_WITH_EGRESS, 25 | }, { 26 | cidrMask: 26, 27 | name: 'private', 28 | subnetType: ec2.SubnetType.PRIVATE_ISOLATED, 29 | }], 30 | natGateways: 1, 31 | }), 32 | }); 33 | expect(Template.fromStack(stack).toJSON()).toMatchSnapshot(); 34 | const template = Template.fromStack(stack); 35 | 
template.resourceCountIs('AWS::RDS::DBCluster', 1); 36 | template.resourceCountIs('AWS::EC2::SecurityGroup', 2); 37 | }); 38 | }); 39 | -------------------------------------------------------------------------------- /test/rdsCSVToAurora.test.ts: -------------------------------------------------------------------------------- 1 | import { Stack } from 'aws-cdk-lib'; 2 | import { Template } from 'aws-cdk-lib/assertions'; 3 | import * as ec2 from 'aws-cdk-lib/aws-ec2'; 4 | import * as sfn from 'aws-cdk-lib/aws-stepfunctions'; 5 | import { CSVToAuroraTask } from '../src'; 6 | 7 | let stack: Stack; 8 | beforeEach(() => { 9 | stack = new Stack(); 10 | }); 11 | 12 | describe('ClassificationTest', () => { 13 | test('ClassificationTest', () => { 14 | new CSVToAuroraTask(stack, 'csvtoAurora', { 15 | integrationPattern: sfn.IntegrationPattern.REQUEST_RESPONSE, 16 | vpc: new ec2.Vpc(stack, 'someid', { 17 | ipAddresses: ec2.IpAddresses.cidr('10.0.0.0/16'), 18 | maxAzs: 2, 19 | subnetConfiguration: [{ 20 | cidrMask: 26, 21 | name: 'isolatedSubnet', 22 | subnetType: ec2.SubnetType.PUBLIC, 23 | }, { 24 | cidrMask: 26, 25 | name: 'privateWithNat', 26 | subnetType: ec2.SubnetType.PRIVATE_WITH_EGRESS, 27 | }, { 28 | cidrMask: 26, 29 | name: 'private', 30 | subnetType: ec2.SubnetType.PRIVATE_ISOLATED, 31 | }], 32 | natGateways: 1, 33 | }), 34 | }); 35 | expect(Template.fromStack(stack).toJSON()).toMatchSnapshot(); 36 | const template = Template.fromStack(stack); 37 | template.resourceCountIs('AWS::Lambda::Function', 3); 38 | }); 39 | }); 40 | -------------------------------------------------------------------------------- /test/spacyClassification.test.ts: -------------------------------------------------------------------------------- 1 | import { Stack } from 'aws-cdk-lib'; 2 | import { Template } from 'aws-cdk-lib/assertions'; 3 | import * as sfn from 'aws-cdk-lib/aws-stepfunctions'; 4 | import { SpacySfnTask } from '../src'; 5 | 6 | let stack: Stack; 7 | beforeEach(() => { 8 | 
stack = new Stack(); 9 | }); 10 | 11 | describe('ClassificationTest', () => { 12 | test('ClassificationTest', () => { 13 | new SpacySfnTask(stack, 'idp-spacy-classification', { 14 | integrationPattern: sfn.IntegrationPattern.REQUEST_RESPONSE, 15 | }); 16 | expect(Template.fromStack(stack).toJSON()).toMatchSnapshot(); 17 | const template = Template.fromStack(stack); 18 | template.resourceCountIs('AWS::Lambda::Function', 2); 19 | }); 20 | }); 21 | -------------------------------------------------------------------------------- /test/test_csv_generator.py: -------------------------------------------------------------------------------- 1 | import trp.trp2 as t2 2 | from textractprettyprinter.t_pretty_print import convert_queries_to_list_trp2 3 | import os 4 | import json 5 | 6 | 7 | def test_queries_generate_list(caplog): 8 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 9 | input_filename = os.path.join(SCRIPT_DIR, '../Paystub_1_reMars.json') 10 | 11 | with open(input_filename) as input_doc: 12 | trp2_doc: t2.TDocument = t2.TDocumentSchema().load( 13 | json.load(input_doc)) #type: ignore 14 | queries_value_list = convert_queries_to_list_trp2( 15 | trp2_doc=trp2_doc) #type: ignore 16 | 17 | for i in queries_value_list: 18 | print(i) 19 | -------------------------------------------------------------------------------- /test/textractA2I.test.ts: -------------------------------------------------------------------------------- 1 | import { Stack } from 'aws-cdk-lib'; 2 | import { Template } from 'aws-cdk-lib/assertions'; 3 | import * as sfn from 'aws-cdk-lib/aws-stepfunctions'; 4 | import { TextractA2ISfnTask } from '../src/textractA2I'; 5 | 6 | let stack: Stack; 7 | beforeEach(() => { 8 | stack = new Stack(); 9 | }); 10 | 11 | describe('A2ITest', () => { 12 | test('A2ITest', () => { 13 | new TextractA2ISfnTask(stack, 'idp-a2i', { 14 | integrationPattern: sfn.IntegrationPattern.REQUEST_RESPONSE, 15 | a2iFlowDefinitionARN: 'somearn', 16 | 17 | }); 18 | 
expect(Template.fromStack(stack).toJSON()).toMatchSnapshot(); 19 | const template = Template.fromStack(stack); 20 | template.resourceCountIs('AWS::Lambda::Function', 2); 21 | }); 22 | }); 23 | -------------------------------------------------------------------------------- /test/textractAsync.test.ts: -------------------------------------------------------------------------------- 1 | import { Stack } from 'aws-cdk-lib'; 2 | import { Template } from 'aws-cdk-lib/assertions'; 3 | import * as sfn from 'aws-cdk-lib/aws-stepfunctions'; 4 | import { TextractGenericAsyncSfnTask } from '../src'; 5 | 6 | let stack: Stack; 7 | beforeEach(() => { 8 | stack = new Stack(); 9 | }); 10 | 11 | describe('TextractAsyncTest', () => { 12 | test('TextractAsyncTest', () => { 13 | new TextractGenericAsyncSfnTask(stack, 'async', { 14 | integrationPattern: sfn.IntegrationPattern.REQUEST_RESPONSE, 15 | s3OutputBucket: 'somebucket', 16 | s3TempOutputPrefix: 'sometempoutputprefix', 17 | }); 18 | expect(Template.fromStack(stack).toJSON()).toMatchSnapshot(); 19 | const template = Template.fromStack(stack); 20 | template.resourceCountIs('AWS::Lambda::Function', 2); 21 | }); 22 | }); 23 | -------------------------------------------------------------------------------- /test/textractClassificationConfigurator.test.ts: -------------------------------------------------------------------------------- 1 | import { Stack } from 'aws-cdk-lib'; 2 | import { Template } from 'aws-cdk-lib/assertions'; 3 | import { TextractClassificationConfigurator } from '../src'; 4 | 5 | let stack: Stack; 6 | beforeEach(() => { 7 | stack = new Stack(); 8 | }); 9 | 10 | describe('ClassificationTest', () => { 11 | test('ClassificationTest', () => { 12 | new TextractClassificationConfigurator(stack, 'textractConfiguration', { 13 | }); 14 | expect(Template.fromStack(stack).toJSON()).toMatchSnapshot(); 15 | const template = Template.fromStack(stack); 16 | template.resourceCountIs('AWS::Lambda::Function', 3); 17 | }); 18 | }); 
19 | -------------------------------------------------------------------------------- /test/textractDecider.test.ts: -------------------------------------------------------------------------------- 1 | import { Stack } from 'aws-cdk-lib'; 2 | import { Template } from 'aws-cdk-lib/assertions'; 3 | import { TextractPOCDecider } from '../src'; 4 | 5 | let stack: Stack; 6 | beforeEach(() => { 7 | stack = new Stack(); 8 | }); 9 | 10 | describe('ClassificationTest', () => { 11 | test('ClassificationTest', () => { 12 | new TextractPOCDecider(stack, 'idp-decider', { 13 | }); 14 | expect(Template.fromStack(stack).toJSON()).toMatchSnapshot(); 15 | const template = Template.fromStack(stack); 16 | template.resourceCountIs('AWS::Lambda::Function', 1); 17 | }); 18 | }); 19 | -------------------------------------------------------------------------------- /test/textractGenerateCsv.test.ts: -------------------------------------------------------------------------------- 1 | import { Stack } from 'aws-cdk-lib'; 2 | import { Template } from 'aws-cdk-lib/assertions'; 3 | import { TextractGenerateCSV } from '../src'; 4 | 5 | let stack: Stack; 6 | beforeEach(() => { 7 | stack = new Stack(); 8 | }); 9 | 10 | describe('GenerateCSVTest', () => { 11 | test('GenerateCSVTest', () => { 12 | new TextractGenerateCSV(stack, 'idp-generate-csv', { 13 | csvS3OutputBucket: 'somebucket', 14 | csvS3OutputPrefix: 'someprefix', 15 | }); 16 | expect(Template.fromStack(stack).toJSON()).toMatchSnapshot(); 17 | const template = Template.fromStack(stack); 18 | template.resourceCountIs('AWS::Lambda::Function', 1); 19 | }); 20 | }); 21 | -------------------------------------------------------------------------------- /test/textractOutputConfigToJSON.test.ts: -------------------------------------------------------------------------------- 1 | import { Stack } from 'aws-cdk-lib'; 2 | import { Template } from 'aws-cdk-lib/assertions'; 3 | import { TextractAsyncToJSON } from '../src'; 4 | 5 | let stack: Stack; 6 | 
beforeEach(() => { 7 | stack = new Stack(); 8 | }); 9 | 10 | describe('ClassificationTest', () => { 11 | test('ClassificationTest', () => { 12 | new TextractAsyncToJSON(stack, 'idp-classification', { 13 | s3OutputBucket: 'somebucket', 14 | s3OutputPrefix: 'someprefix', 15 | }); 16 | expect(Template.fromStack(stack).toJSON()).toMatchSnapshot(); 17 | const template = Template.fromStack(stack); 18 | template.resourceCountIs('AWS::Lambda::Function', 1); 19 | }); 20 | }); 21 | -------------------------------------------------------------------------------- /test/textractPdfMapperForFhir.test.ts: -------------------------------------------------------------------------------- 1 | import { Stack } from 'aws-cdk-lib'; 2 | import { Template } from 'aws-cdk-lib/assertions'; 3 | import { TextractPdfMapperForFhir } from '../src'; 4 | 5 | let stack: Stack; 6 | beforeEach(() => { 7 | stack = new Stack(); 8 | }); 9 | 10 | describe('ClassificationTest', () => { 11 | test('ClassificationTest', () => { 12 | new TextractPdfMapperForFhir(stack, 'idp-decider', { 13 | }); 14 | expect(Template.fromStack(stack).toJSON()).toMatchSnapshot(); 15 | const template = Template.fromStack(stack); 16 | template.resourceCountIs('AWS::Lambda::Function', 1); 17 | }); 18 | }); 19 | -------------------------------------------------------------------------------- /test/textractSync.test.ts: -------------------------------------------------------------------------------- 1 | import { Stack } from 'aws-cdk-lib'; 2 | import { Template } from 'aws-cdk-lib/assertions'; 3 | import * as sfn from 'aws-cdk-lib/aws-stepfunctions'; 4 | import { TextractGenericSyncSfnTask } from '../src'; 5 | 6 | let stack: Stack; 7 | beforeEach(() => { 8 | stack = new Stack(); 9 | }); 10 | 11 | describe('TextractSyncTest', () => { 12 | test('TextractSyncTest', () => { 13 | new TextractGenericSyncSfnTask(stack, 'idp-textract-sync-test', { 14 | integrationPattern: sfn.IntegrationPattern.REQUEST_RESPONSE, 15 | s3OutputBucket: 
'somebucket', 16 | s3OutputPrefix: 'someprefix', 17 | }); 18 | expect(Template.fromStack(stack).toJSON()).toMatchSnapshot(); 19 | const template = Template.fromStack(stack); 20 | template.resourceCountIs('AWS::Lambda::Function', 1); 21 | }); 22 | }); 23 | -------------------------------------------------------------------------------- /test/workmailS3IngestionPoint.test.ts: -------------------------------------------------------------------------------- 1 | import { Stack } from 'aws-cdk-lib'; 2 | import { Template } from 'aws-cdk-lib/assertions'; 3 | import { WorkmailS3IngestionPoint } from '../src'; 4 | 5 | let stack: Stack; 6 | beforeEach(() => { 7 | stack = new Stack(); 8 | }); 9 | 10 | describe('IntegrationWorkmailTest', () => { 11 | test('Workmail Test', () => { 12 | new WorkmailS3IngestionPoint(stack, 'idp-classification', { 13 | s3OutputPrefix: 'sometempoutputprefix', 14 | workmailAccountNumber: 'someWorkmailAccountNumber', 15 | workmailRegion: 'us-east-1', 16 | s3OutputBucket: 'somebucket', 17 | }); 18 | expect(Template.fromStack(stack).toJSON()).toMatchSnapshot(); 19 | const template = Template.fromStack(stack); 20 | template.resourceCountIs('AWS::Lambda::Function', 1); 21 | }); 22 | }); 23 | -------------------------------------------------------------------------------- /tsconfig.dev.json: -------------------------------------------------------------------------------- 1 | // ~~ Generated by projen. To modify, edit .projenrc.js and run "npx projen". 
2 | { 3 | "compilerOptions": { 4 | "alwaysStrict": true, 5 | "declaration": true, 6 | "esModuleInterop": true, 7 | "experimentalDecorators": true, 8 | "inlineSourceMap": true, 9 | "inlineSources": true, 10 | "lib": [ 11 | "es2019" 12 | ], 13 | "module": "CommonJS", 14 | "noEmitOnError": false, 15 | "noFallthroughCasesInSwitch": true, 16 | "noImplicitAny": true, 17 | "noImplicitReturns": true, 18 | "noImplicitThis": true, 19 | "noUnusedLocals": true, 20 | "noUnusedParameters": true, 21 | "resolveJsonModule": true, 22 | "strict": true, 23 | "strictNullChecks": true, 24 | "strictPropertyInitialization": true, 25 | "stripInternal": true, 26 | "target": "ES2019" 27 | }, 28 | "include": [ 29 | "src/**/*.ts", 30 | "test/**/*.ts", 31 | ".projenrc.js" 32 | ], 33 | "exclude": [ 34 | "node_modules" 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /update_dependencies_local.sh: -------------------------------------------------------------------------------- 1 | # Textract CDK Constructs 2 | python -m pip install --force-reinstall /Users/schadem/code/github/aws-samples/amazon-textract-idp-cdk-constructs/dist/python/amazon_textract_idp_cdk_constructs-0.0.0-py3-none-any.whl 3 | 4 | --------------------------------------------------------------------------------