├── .github ├── FUNDING.yml └── dependabot.yml ├── .gitignore ├── .prettierrc ├── .vscode ├── extensions.json └── settings.json ├── LICENSE ├── README.md ├── cdk.json ├── index.ts ├── jest.config.js ├── package.json ├── src ├── __tests__ │ └── test_stack.ts ├── download-pdf-to-s3 │ └── lambda.ts ├── scrape-pdfs-from-website │ ├── lambda.ts │ ├── package.json │ └── yarn.lock ├── send-pdf-to-textract │ └── lambda.ts ├── send-textract-result-to-dynamo │ └── lambda.ts └── stack.ts ├── tsconfig.json └── yarn.lock /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | github: aeksco 3 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "npm" 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | allow: 13 | - dependency-name: "*" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | src/**/*.js 2 | index.js 3 | node_modules/ 4 | cdk.out 5 | data/ -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "printWidth": 80, 3 | "tabWidth": 4 4 | } 5 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | // See https://go.microsoft.com/fwlink/?LinkId=827846 to learn about workspace recommendations. 3 | // Extension identifier format: ${publisher}.${name}. Example: vscode.csharp 4 | 5 | // List of extensions which should be recommended for users of this workspace. 6 | "recommendations": ["esbenp.prettier-vscode"], 7 | // List of extensions recommended by VS Code that should not be recommended for users of this workspace. 
8 | "unwantedRecommendations": ["ms-vscode.vscode-typescript-next"] 9 | } 10 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "editor.defaultFormatter": "esbenp.prettier-vscode", 4 | "files.exclude": { 5 | "index.js": true, 6 | "src/**/*.js": true, 7 | "**/node_modules/**": true 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020-2022 Alexander Schwartzberg 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # aws-pdf-textract-pipeline [![Mentioned in Awesome CDK](https://awesome.re/mentioned-badge.svg)](https://github.com/kolomied/awesome-cdk) 2 | 3 | :mag: Data pipeline for crawling PDFs from the Web and transforming their contents into structured data using [AWS Textract](https://aws.amazon.com/textract/). Built with AWS CDK + TypeScript. 4 | 5 | This is an example data pipeline that illustrates one possible approach for large-scale serverless PDF processing - it should serve as a good foundation to modify for your own purposes. 6 | 7 | ![Example Extension Popup](https://i.imgur.com/3F89JQK.png "Example Extension Popup") 8 | 9 | 10 | 11 | **Getting Started** 12 | 13 | Run the following commands to install dependencies, build the CDK stack, and deploy the CDK Stack to AWS. 14 | 15 | ``` 16 | yarn install 17 | yarn build 18 | cdk bootstrap 19 | cdk deploy 20 | ``` 21 | 22 | ### Overview 23 | 24 | The following is an overview of each process performed by this CDK stack. 25 | 26 | 1. **Scrape PDF download URLs from a website** 27 | 28 | Scraping data from the [COGCC](https://cogcc.state.co.us/) website. 29 | 30 | 2. **Store PDF download URL in DynamoDB** 31 | 32 | ![Example Extension Popup](https://i.imgur.com/bmFJGDW.png "Example Extension Popup") 33 | 34 | 3. **Download the PDF to S3** 35 | 36 | A lambda fires off when a new PDF download URL has been created in DynamoDB. 37 | 38 | 4. **Process the PDF with AWS Textract** 39 | 40 | Another lambda fires off when a PDF has been downloaded to the S3 bucket. 41 | 42 | 5. **Process the AWS Textract results** 43 | 44 | When an SNS event is detected from AWS Textract, a lambda is fired off to process the result. 45 | 46 | 6. 
**Save the processed Textract result to DynamoDB.** 47 | 48 | After the full result is pruned down to the desired data structure, we save the data in DynamoDB. 49 | ![Example Extension Popup](https://i.imgur.com/HkTtLmi.png "Example Extension Popup") 50 | 51 | ### Scripts 52 | 53 | - `yarn install` - installs dependencies 54 | - `yarn build` - builds the production-ready CDK Stack 55 | - `yarn test` - runs Jest 56 | - `cdk bootstrap` - bootstraps AWS CloudFormation for your CDK deploy 57 | - `cdk deploy` - deploys the CDK stack to AWS 58 | 59 | **Notes** 60 | 61 | - **Warning** - the `AnalyzeDocument` process from AWS Textract costs \$50 per 1,000 PDF pages. Be careful when deploying this CDK stack as you could unintentionally rack up an expensive AWS bill quickly if you're not paying attention. 62 | 63 | - If a PDF download URL has already been added to the `pdfUrlsTable` DynamoDB table, the pipeline will not re-execute for the PDF. 64 | 65 | - Includes tests with Jest. 66 | 67 | - Recommended to use `Visual Studio Code` with the `Format on Save` setting turned on. 
68 | 69 | **Built with** 70 | 71 | - [TypeScript](https://www.typescriptlang.org/) 72 | - [Jest](https://jestjs.io) 73 | - [Puppeteer](https://pptr.dev) 74 | - [AWS CDK](https://aws.amazon.com/cdk/) 75 | - [AWS Lambda](https://aws.amazon.com/lambda/) 76 | - [AWS SNS](https://aws.amazon.com/sns/) 77 | - [AWS DynamoDB](https://aws.amazon.com/dynamodb/) 78 | - [AWS S3](https://aws.amazon.com/s3/) 79 | 80 | **Additional Resources** 81 | 82 | - [CDK API Reference](https://docs.aws.amazon.com/cdk/api/latest/docs/aws-construct-library.html) 83 | - [Puppeteer](https://github.com/puppeteer/puppeteer) 84 | - [Puppeteer Lambda](https://github.com/alixaxel/chrome-aws-lambda) 85 | - [CDK TypeScript Reference](https://docs.aws.amazon.com/cdk/api/latest/typescript/api/index.html) 86 | - [CDK Assertion Package](https://github.com/aws/aws-cdk/tree/master/packages/%40aws-cdk/assert) 87 | - [Textract Pricing Chart](https://aws.amazon.com/textract/pricing/) 88 | - [awesome-cdk repo](https://github.com/eladb/awesome-cdk) 89 | 90 | **License** 91 | 92 | Open source under the MIT License. 
93 | 94 | Built with :heart: by [aeksco](https://twitter.com/aeksco) 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "node index" 3 | } 4 | -------------------------------------------------------------------------------- /index.ts: -------------------------------------------------------------------------------- 1 | import { App } from "aws-cdk-lib"; 2 | import { PdfTextractPipeline } from "./src/stack"; 3 | 4 | // // // // 5 | 6 | // Defines new CDK App 7 | const app = new App(); 8 | 9 | // Instantiates the PdfTextractPipeline 10 | new PdfTextractPipeline(app, "PdfTextractPipeline"); 11 | app.synth(); 12 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | roots: ["./src"], 3 | testMatch: ["**/*.test.ts", "**/__tests__/*.ts"], 4 | transform: { 5 | "^.+\\.tsx?$": "ts-jest" 6 | }, 7 | modulePathIgnorePatterns: ["^.+\\.d.ts?$"] 8 | }; 9 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "aws-pdf-textract-pipeline", 3 | "version": "2.0.0", 4 | "description": "Scheduled pipeline for AWS Textract + Lambda", 5 | "private": true, 6 | "scripts": { 7 | "build": "tsc", 8 | "test": "jest", 9 | "watch": "tsc -w", 10 | "cdk": "cdk", 11 | "prettify": "prettier -w ./src" 12 | }, 13 | "license": "MIT", 14 | "devDependencies": { 15 | "@aws-cdk/assert": "^2.68.0", 16 | "@types/jest": "^26.0.10", 17 | "@types/node": "^20.10.4", 18 | "constructs": "^10.1.300", 19 | "jest": "^26.4.2", 20 | "prettier": "^3.0.3", 21 | "ts-jest": "^26.2.0", 22 | "typescript": "^4.9.5" 23 | }, 24 | "dependencies": { 25 | 
"@types/puppeteer": "^7.0.4", 26 | "aws-cdk-lib": "^2.133.0", 27 | "puppeteer-lambda": "^1.1.3" 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/__tests__/test_stack.ts: -------------------------------------------------------------------------------- 1 | import { expect as expectCDK, countResources } from "@aws-cdk/assert"; 2 | import * as cdk from "aws-cdk-lib/core"; 3 | import { PdfTextractPipeline } from "../stack"; 4 | 5 | // // // // 6 | 7 | describe("PdfTextractPipeline", () => { 8 | test("loads", () => { 9 | const app = new cdk.App(); 10 | 11 | // Configures CDK stack 12 | const stack: cdk.Stack = new PdfTextractPipeline( 13 | app, 14 | "PdfTextractPipeline" 15 | ); 16 | 17 | // Checks stack resource count 18 | expectCDK(stack).to(countResources("AWS::DynamoDB::Table", 2)); 19 | expectCDK(stack).to(countResources("AWS::Events::Rule", 1)); 20 | expectCDK(stack).to(countResources("AWS::IAM::Policy", 6)); 21 | expectCDK(stack).to(countResources("AWS::IAM::Role", 6)); 22 | expectCDK(stack).to( 23 | countResources("AWS::Lambda::EventSourceMapping", 1) 24 | ); 25 | expectCDK(stack).to(countResources("AWS::Lambda::Function", 5)); 26 | expectCDK(stack).to(countResources("AWS::Lambda::Permission", 3)); 27 | expectCDK(stack).to(countResources("AWS::SNS::Subscription", 1)); 28 | expectCDK(stack).to(countResources("AWS::SNS::Topic", 1)); 29 | expectCDK(stack).to(countResources("Custom::S3BucketNotifications", 1)); 30 | }); 31 | }); 32 | -------------------------------------------------------------------------------- /src/download-pdf-to-s3/lambda.ts: -------------------------------------------------------------------------------- 1 | // event data: 2 | // { 3 | // "Records": [ 4 | // { 5 | // "eventID": "4cff18fbf6c5a7fb9db7008add5af874", 6 | // "eventName": "INSERT", 7 | // "eventVersion": "1.1", 8 | // "eventSource": "aws:dynamodb", 9 | // "awsRegion": "us-east-1", 10 | // "dynamodb": { 11 | // 
"ApproximateCreationDateTime": 1582405136, 12 | // "Keys": { 13 | // "itemId": { 14 | // "S": "5055310" 15 | // } 16 | // }, 17 | // "NewImage": { 18 | // "date": { 19 | // "S": "02/03/2020" 20 | // }, 21 | // "itemId": { 22 | // "S": "5055310" 23 | // }, 24 | // "documentType": { 25 | // "S": "WELL ABANDONMENT REPORT (INTENT)" 26 | // }, 27 | // "downloadUrl": { 28 | // "S": "http://ogccweblink.state.co.us/DownloadDocumentPDF.aspx?DocumentId=5055310" 29 | // } 30 | // }, 31 | // "SequenceNumber": "89300000000054786958328", 32 | // "SizeBytes": 169, 33 | // "StreamViewType": "NEW_IMAGE" 34 | // }, 35 | // "eventSourceARN": "arn:aws:dynamodb:us-east-1:839811712080:table/cogcc-pdf-urls/stream/2020-02-22T20:53:55.247" 36 | // } 37 | // ] 38 | // } 39 | 40 | import * as http from "http"; 41 | import * as fs from "fs"; 42 | import * as AWS from "aws-sdk"; 43 | const s3obj = new AWS.S3(); 44 | const S3_BUCKET_NAME = process.env.S3_BUCKET_NAME || ""; 45 | 46 | // // // // 47 | 48 | // Downloads a file from a URL and writes it to `./tmp/filename 49 | function downloadFile(url: string, dest: string): Promise { 50 | return new Promise((resolve) => { 51 | const file = fs.createWriteStream(dest); 52 | console.log("created file write stream: " + dest); 53 | 54 | // Fetches URL using HTTP 55 | http.get(url, (response) => { 56 | // Logs downloaded file message 57 | console.log("downloaded file: " + url); 58 | 59 | // Pipes response to file 60 | response.pipe(file); 61 | 62 | // Defines callback for stream "finish" event 63 | file.on("finish", function () { 64 | // Logs wrote-to-file message 65 | console.log("wrote to file: " + dest); 66 | 67 | // Closes file stream and resolves promise 68 | file.close(); 69 | resolve(); 70 | }); 71 | }); 72 | }); 73 | } 74 | 75 | // // // // 76 | 77 | export const handler = async (event: any = {}): Promise => { 78 | // Logs start message + S3_BUCKET_NAME 79 | console.log("download-file --> START"); 80 | console.log(`writing to S3 bucket: 
${S3_BUCKET_NAME}`); 81 | 82 | // Debug event input 83 | console.log(JSON.stringify(event, null, 4)); 84 | 85 | // Pulls newItem from event 86 | const newItem = event["Records"][0]["dynamodb"]["NewImage"]; 87 | if (!newItem) { 88 | return; 89 | } 90 | 91 | // Pulls downloadUrl from newItem 92 | const downloadUrl = newItem["downloadUrl"]["S"]; 93 | const documentId = newItem["itemId"]["S"]; 94 | if (!downloadUrl) { 95 | return; 96 | } 97 | 98 | // Defines filename - used to save locally to lambda (in /tmp) AND in S3 bucket 99 | const filename = documentId + ".pdf"; 100 | const filepath = "/tmp/" + filename; 101 | 102 | // Logs downloadUrl 103 | console.log(`downloadUrl: ${downloadUrl}`); 104 | console.log(`documentId: ${documentId}`); 105 | console.log(`filepath: ${filepath}`); 106 | 107 | // Downloads file to /tmp 108 | await downloadFile(downloadUrl, filepath); 109 | 110 | // Saves new file to S3 111 | s3obj 112 | .upload({ 113 | Bucket: S3_BUCKET_NAME, 114 | Key: documentId + ".pdf", 115 | Body: fs.readFileSync(filepath), 116 | }) 117 | .send((err, data) => { 118 | console.log(err, data); 119 | // Logs error 120 | if (err) { 121 | console.log(`download-file --> ERROR`); 122 | console.log(err); 123 | return; 124 | } 125 | console.log(`download-file --> SUCCESS --> ${filename}`); 126 | }); 127 | }; 128 | -------------------------------------------------------------------------------- /src/scrape-pdfs-from-website/lambda.ts: -------------------------------------------------------------------------------- 1 | import * as chromium from "chrome-aws-lambda"; 2 | import * as AWS from "aws-sdk"; 3 | const db = new AWS.DynamoDB.DocumentClient(); 4 | const TABLE_NAME = process.env.TABLE_NAME || ""; 5 | const PRIMARY_KEY = process.env.PRIMARY_KEY || ""; 6 | 7 | // // // // 8 | 9 | /** 10 | * buildFetchUrl 11 | * Builds a url to the page with all the PDF download URLs 12 | */ 13 | function buildFetchUrl(): string { 14 | // The URL from which the PDF download URLs are being 
// Search parameters of the COGCC document listing that is scraped. They are
// used for both the listing URL and the stored DynamoDB items so the two
// cannot drift apart.
const DOCUMENT_TYPE = "WELL ABANDONMENT REPORT (INTENT)";
const DOCUMENT_DATE = "02/03/2020";

// Only the first few items are inserted into DynamoDB to curb unintended AWS
// spend associated with running Textract against PDFs.
const MAX_ITEMS_PER_RUN = 5;

/**
 * buildFetchUrl
 * Builds a url to the page with all the PDF download URLs
 *
 * Called without arguments it returns exactly the URL that used to be
 * hard-coded here.
 *
 * @param docName - document type to search for
 * @param docDate - document date to search for, as MM/DD/YYYY
 * @returns the listing URL for the given search parameters
 */
function buildFetchUrl(
    docName: string = DOCUMENT_TYPE,
    docDate: string = DOCUMENT_DATE
): string {
    // "/" is legal inside a query string; it is left unescaped so the default
    // URL stays byte-identical to the previously hard-coded one
    const encode = (value: string): string =>
        encodeURIComponent(value).replace(/%2F/g, "/");

    return (
        "http://ogccweblink.state.co.us/Results.aspx" +
        `?DocName=${encode(docName)}&DocDate=${encode(docDate)}`
    );
}

// // // //

/**
 * handler
 * Lambda entry point: opens the listing page in headless Chromium, collects
 * every link containing "DownloadDocumentPDF" and stores the first
 * MAX_ITEMS_PER_RUN of them in the DynamoDB table `TABLE_NAME`.
 *
 * NOTE(review): completion is still signalled through the legacy
 * `context.succeed` / `context.fail` callbacks, as before - confirm whether
 * the deployed runtime still supports them before relying on this.
 *
 * @param event - invocation payload; only logged
 * @param context - Lambda context, used for `succeed` / `fail`
 */
export const handler = async (
    event: any = {},
    context: any = {}
): Promise<any> => {
    // Log start message
    console.log("scrape-pdfs-from-website -> start");
    console.log(event);

    let browser = null;

    try {
        // Defines browser
        browser = await chromium.puppeteer.launch({
            args: chromium.args,
            defaultViewport: chromium.defaultViewport,
            executablePath: await chromium.executablePath,
            headless: chromium.headless,
        });

        // Defines page
        const page = await browser.newPage();

        // Gets fetchUrl for puppeteer
        // This is the page with all the PDF download URLs
        const fetchUrl: string = buildFetchUrl();

        // Navigate to page, wait until dom content is loaded
        await page.goto(fetchUrl, {
            waitUntil: "domcontentloaded",
        });

        // Gets ALL urls (the callback runs inside the browser page)
        const allHrefs: unknown = await page.$$eval(
            "a",
            (anchors: Element[]) =>
                anchors.map((a: Element) => (a as HTMLAnchorElement).href)
        );

        // Gets Download URLS - values that are not strings are dropped
        // instead of crashing on `.includes`
        const downloadUrls: string[] = (
            Array.isArray(allHrefs) ? allHrefs : []
        ).filter(
            (href): href is string =>
                typeof href === "string" &&
                href.includes("DownloadDocumentPDF")
        );

        // Logs downloadUrls
        console.log("downloadUrls");
        console.log(downloadUrls);

        // Insert downloadURLs into DynamoDB
        await Promise.all(
            downloadUrls
                .slice(0, MAX_ITEMS_PER_RUN)
                .map((downloadUrl: string) => {
                    // Pulls documentId from downloadUrl
                    const documentId: string = String(
                        downloadUrl.split("DocumentId=").pop()
                    );

                    // Inserts the record into the DynamoDB table
                    return db
                        .put({
                            TableName: TABLE_NAME,
                            Item: {
                                [PRIMARY_KEY]: documentId,
                                documentType: DOCUMENT_TYPE,
                                date: DOCUMENT_DATE,
                                downloadUrl: downloadUrl,
                            },
                        })
                        .promise();
                })
        );
    } catch (error) {
        return context.fail(error);
    } finally {
        // Close the puppeteer browser
        if (browser !== null) {
            await browser.close();
        }
    }

    // Logs "shutdown" statement
    console.log("scrape-pdfs-from-website -> shutdown");

    // The handler has no result payload; it always succeeded with `null`
    return context.succeed(null);
};
2 | # yarn lockfile v1 3 | 4 | 5 | "@types/mime-types@^2.1.0": 6 | version "2.1.0" 7 | resolved "https://registry.yarnpkg.com/@types/mime-types/-/mime-types-2.1.0.tgz#9ca52cda363f699c69466c2a6ccdaad913ea7a73" 8 | integrity sha1-nKUs2jY/aZxpRmwqbM2q2RPqenM= 9 | 10 | agent-base@5: 11 | version "5.1.1" 12 | resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-5.1.1.tgz#e8fb3f242959db44d63be665db7a8e739537a32c" 13 | integrity sha512-TMeqbNl2fMW0nMjTEPOwe3J/PRFP4vqeoNuQMG0HlMrtm5QxKqdvAkZ1pRBQ/ulIyDD5Yq0nJ7YbdD8ey0TO3g== 14 | 15 | async-limiter@~1.0.0: 16 | version "1.0.1" 17 | resolved "https://registry.yarnpkg.com/async-limiter/-/async-limiter-1.0.1.tgz#dd379e94f0db8310b08291f9d64c3209766617fd" 18 | integrity sha512-csOlWGAcRFJaI6m+F2WKdnMKr4HhdhFVBk0H/QbJFMCr+uO2kwohwXQPxw/9OCxp05r5ghVBFSyioixx3gfkNQ== 19 | 20 | balanced-match@^1.0.0: 21 | version "1.0.2" 22 | resolved "https://registry.yarnpkg.com/balanced-match/-/balanced-match-1.0.2.tgz#e83e3a7e3f300b34cb9d87f615fa0cbf357690ee" 23 | integrity sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw== 24 | 25 | bl@^3.0.0: 26 | version "3.0.1" 27 | resolved "https://registry.yarnpkg.com/bl/-/bl-3.0.1.tgz#1cbb439299609e419b5a74d7fce2f8b37d8e5c6f" 28 | integrity sha512-jrCW5ZhfQ/Vt07WX1Ngs+yn9BDqPL/gw28S7s9H6QK/gupnizNzJAss5akW20ISgOrbLTlXOOCTJeNUQqruAWQ== 29 | dependencies: 30 | readable-stream "^3.0.1" 31 | 32 | brace-expansion@^1.1.7: 33 | version "1.1.11" 34 | resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd" 35 | integrity sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA== 36 | dependencies: 37 | balanced-match "^1.0.0" 38 | concat-map "0.0.1" 39 | 40 | buffer-from@^1.0.0: 41 | version "1.1.1" 42 | resolved "https://registry.yarnpkg.com/buffer-from/-/buffer-from-1.1.1.tgz#32713bc028f75c02fdb710d7c7bcec1f2c6070ef" 43 | integrity 
sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A== 44 | 45 | chownr@^1.1.1: 46 | version "1.1.4" 47 | resolved "https://registry.yarnpkg.com/chownr/-/chownr-1.1.4.tgz#6fc9d7b42d32a583596337666e7d08084da2cc6b" 48 | integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg== 49 | 50 | chrome-aws-lambda@^2.1.1: 51 | version "2.1.1" 52 | resolved "https://registry.yarnpkg.com/chrome-aws-lambda/-/chrome-aws-lambda-2.1.1.tgz#2aeb0c97fb67e908d06dc8d92d92c7d4fb58467c" 53 | integrity sha512-Wer2QuygxsCov5bM2+8CLa6qYpNsc5AxYTlgTne00aFoxFP491LGJRxOQtGnYtsJP6UG4pB0SfrwTyPnLys1Lw== 54 | dependencies: 55 | lambdafs "^1.3.0" 56 | 57 | concat-map@0.0.1: 58 | version "0.0.1" 59 | resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b" 60 | integrity sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg== 61 | 62 | concat-stream@1.6.2: 63 | version "1.6.2" 64 | resolved "https://registry.yarnpkg.com/concat-stream/-/concat-stream-1.6.2.tgz#904bdf194cd3122fc675c77fc4ac3d4ff0fd1a34" 65 | integrity sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw== 66 | dependencies: 67 | buffer-from "^1.0.0" 68 | inherits "^2.0.3" 69 | readable-stream "^2.2.2" 70 | typedarray "^0.0.6" 71 | 72 | core-util-is@~1.0.0: 73 | version "1.0.2" 74 | resolved "https://registry.yarnpkg.com/core-util-is/-/core-util-is-1.0.2.tgz#b5fd54220aa2bc5ab57aab7140c940754503c1a7" 75 | integrity sha1-tf1UIgqivFq1eqtxQMlAdUUDwac= 76 | 77 | debug@2.6.9: 78 | version "2.6.9" 79 | resolved "https://registry.yarnpkg.com/debug/-/debug-2.6.9.tgz#5d128515df134ff327e90a4c93f4e077a536341f" 80 | integrity sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA== 81 | dependencies: 82 | ms "2.0.0" 83 | 84 | debug@4, debug@^4.1.0: 85 | version "4.1.1" 86 | resolved 
"https://registry.yarnpkg.com/debug/-/debug-4.1.1.tgz#3b72260255109c6b589cee050f1d516139664791" 87 | integrity sha512-pYAIzeRo8J6KPEaJ0VWOh5Pzkbw/RetuzehGM7QRRX5he4fPHx2rdKMB256ehJCkX+XRQm16eZLqLNS8RSZXZw== 88 | dependencies: 89 | ms "^2.1.1" 90 | 91 | end-of-stream@^1.1.0, end-of-stream@^1.4.1: 92 | version "1.4.4" 93 | resolved "https://registry.yarnpkg.com/end-of-stream/-/end-of-stream-1.4.4.tgz#5ae64a5f45057baf3626ec14da0ca5e4b2431eb0" 94 | integrity sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q== 95 | dependencies: 96 | once "^1.4.0" 97 | 98 | extract-zip@^1.6.6: 99 | version "1.6.7" 100 | resolved "https://registry.yarnpkg.com/extract-zip/-/extract-zip-1.6.7.tgz#a840b4b8af6403264c8db57f4f1a74333ef81fe9" 101 | integrity sha1-qEC0uK9kAyZMjbV/Txp0Mz74H+k= 102 | dependencies: 103 | concat-stream "1.6.2" 104 | debug "2.6.9" 105 | mkdirp "0.5.1" 106 | yauzl "2.4.1" 107 | 108 | fd-slicer@~1.0.1: 109 | version "1.0.1" 110 | resolved "https://registry.yarnpkg.com/fd-slicer/-/fd-slicer-1.0.1.tgz#8b5bcbd9ec327c5041bf9ab023fd6750f1177e65" 111 | integrity sha1-i1vL2ewyfFBBv5qwI/1nUPEXfmU= 112 | dependencies: 113 | pend "~1.2.0" 114 | 115 | fs-constants@^1.0.0: 116 | version "1.0.0" 117 | resolved "https://registry.yarnpkg.com/fs-constants/-/fs-constants-1.0.0.tgz#6be0de9be998ce16af8afc24497b9ee9b7ccd9ad" 118 | integrity sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow== 119 | 120 | fs.realpath@^1.0.0: 121 | version "1.0.0" 122 | resolved "https://registry.yarnpkg.com/fs.realpath/-/fs.realpath-1.0.0.tgz#1504ad2523158caa40db4a2787cb01411994ea4f" 123 | integrity sha1-FQStJSMVjKpA20onh8sBQRmU6k8= 124 | 125 | glob@^7.1.3: 126 | version "7.1.6" 127 | resolved "https://registry.yarnpkg.com/glob/-/glob-7.1.6.tgz#141f33b81a7c2492e125594307480c46679278a6" 128 | integrity sha512-LwaxwyZ72Lk7vZINtNNrywX0ZuLyStrdDtabefZKAY5ZGJhVtgdznluResxNmPitE0SAO+O26sWTHeKSI2wMBA== 129 | dependencies: 130 | 
fs.realpath "^1.0.0" 131 | inflight "^1.0.4" 132 | inherits "2" 133 | minimatch "^3.0.4" 134 | once "^1.3.0" 135 | path-is-absolute "^1.0.0" 136 | 137 | https-proxy-agent@^4.0.0: 138 | version "4.0.0" 139 | resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-4.0.0.tgz#702b71fb5520a132a66de1f67541d9e62154d82b" 140 | integrity sha512-zoDhWrkR3of1l9QAL8/scJZyLu8j/gBkcwcaQOZh7Gyh/+uJQzGVETdgT30akuwkpL8HTRfssqI3BZuV18teDg== 141 | dependencies: 142 | agent-base "5" 143 | debug "4" 144 | 145 | inflight@^1.0.4: 146 | version "1.0.6" 147 | resolved "https://registry.yarnpkg.com/inflight/-/inflight-1.0.6.tgz#49bd6331d7d02d0c09bc910a1075ba8165b56df9" 148 | integrity sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk= 149 | dependencies: 150 | once "^1.3.0" 151 | wrappy "1" 152 | 153 | inherits@2, inherits@^2.0.3, inherits@~2.0.3: 154 | version "2.0.4" 155 | resolved "https://registry.yarnpkg.com/inherits/-/inherits-2.0.4.tgz#0fa2c64f932917c3433a0ded55363aae37416b7c" 156 | integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ== 157 | 158 | isarray@~1.0.0: 159 | version "1.0.0" 160 | resolved "https://registry.yarnpkg.com/isarray/-/isarray-1.0.0.tgz#bb935d48582cba168c06834957a54a3e07124f11" 161 | integrity sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE= 162 | 163 | lambdafs@^1.3.0: 164 | version "1.3.0" 165 | resolved "https://registry.yarnpkg.com/lambdafs/-/lambdafs-1.3.0.tgz#7e369cedc9a09623bb365fa99a1113c2ab2fc7ae" 166 | integrity sha512-HqRPmEgtkTW4sCYDUjTEuTGkjCHuLvtZU8iM8GkhD7SpjW4AJJbBk86YU4K43sWGuW5Vmzp1lVCx4ab/kJsuBw== 167 | dependencies: 168 | tar-fs "^2.0.0" 169 | 170 | mime-db@1.43.0: 171 | version "1.43.0" 172 | resolved "https://registry.yarnpkg.com/mime-db/-/mime-db-1.43.0.tgz#0a12e0502650e473d735535050e7c8f4eb4fae58" 173 | integrity sha512-+5dsGEEovYbT8UY9yD7eE4XTc4UwJ1jBYlgaQQF38ENsKR3wj/8q8RFZrF9WIZpB2V1ArTVFUva8sAul1NzRzQ== 174 | 175 | mime-types@^2.1.25: 176 | version "2.1.26" 177 | resolved 
"https://registry.yarnpkg.com/mime-types/-/mime-types-2.1.26.tgz#9c921fc09b7e149a65dfdc0da4d20997200b0a06" 178 | integrity sha512-01paPWYgLrkqAyrlDorC1uDwl2p3qZT7yl806vW7DvDoxwXi46jsjFbg+WdwotBIk6/MbEhO/dh5aZ5sNj/dWQ== 179 | dependencies: 180 | mime-db "1.43.0" 181 | 182 | mime@^2.0.3: 183 | version "2.4.4" 184 | resolved "https://registry.yarnpkg.com/mime/-/mime-2.4.4.tgz#bd7b91135fc6b01cde3e9bae33d659b63d8857e5" 185 | integrity sha512-LRxmNwziLPT828z+4YkNzloCFC2YM4wrB99k+AV5ZbEyfGNWfG8SO1FUXLmLDBSo89NrJZ4DIWeLjy1CHGhMGA== 186 | 187 | minimatch@^3.0.4: 188 | version "3.1.2" 189 | resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b" 190 | integrity sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw== 191 | dependencies: 192 | brace-expansion "^1.1.7" 193 | 194 | minimist@0.0.8: 195 | version "0.0.8" 196 | resolved "https://registry.yarnpkg.com/minimist/-/minimist-0.0.8.tgz#857fcabfc3397d2625b8228262e86aa7a011b05d" 197 | integrity sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0= 198 | 199 | mkdirp@0.5.1, mkdirp@^0.5.1: 200 | version "0.5.1" 201 | resolved "https://registry.yarnpkg.com/mkdirp/-/mkdirp-0.5.1.tgz#30057438eac6cf7f8c4767f38648d6697d75c903" 202 | integrity sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM= 203 | dependencies: 204 | minimist "0.0.8" 205 | 206 | ms@2.0.0: 207 | version "2.0.0" 208 | resolved "https://registry.yarnpkg.com/ms/-/ms-2.0.0.tgz#5608aeadfc00be6c2901df5f9861788de0d597c8" 209 | integrity sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g= 210 | 211 | ms@^2.1.1: 212 | version "2.1.2" 213 | resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009" 214 | integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w== 215 | 216 | once@^1.3.0, once@^1.3.1, once@^1.4.0: 217 | version "1.4.0" 218 | resolved "https://registry.yarnpkg.com/once/-/once-1.4.0.tgz#583b1aa775961d4b113ac17d9c50baef9dd76bd1" 219 | 
integrity sha1-WDsap3WWHUsROsF9nFC6753Xa9E= 220 | dependencies: 221 | wrappy "1" 222 | 223 | path-is-absolute@^1.0.0: 224 | version "1.0.1" 225 | resolved "https://registry.yarnpkg.com/path-is-absolute/-/path-is-absolute-1.0.1.tgz#174b9268735534ffbc7ace6bf53a5a9e1b5c5f5f" 226 | integrity sha1-F0uSaHNVNP+8es5r9TpanhtcX18= 227 | 228 | pend@~1.2.0: 229 | version "1.2.0" 230 | resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50" 231 | integrity sha1-elfrVQpng/kRUzH89GY9XI4AelA= 232 | 233 | process-nextick-args@~2.0.0: 234 | version "2.0.1" 235 | resolved "https://registry.yarnpkg.com/process-nextick-args/-/process-nextick-args-2.0.1.tgz#7820d9b16120cc55ca9ae7792680ae7dba6d7fe2" 236 | integrity sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag== 237 | 238 | progress@^2.0.1: 239 | version "2.0.3" 240 | resolved "https://registry.yarnpkg.com/progress/-/progress-2.0.3.tgz#7e8cf8d8f5b8f239c1bc68beb4eb78567d572ef8" 241 | integrity sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA== 242 | 243 | proxy-from-env@^1.0.0: 244 | version "1.0.0" 245 | resolved "https://registry.yarnpkg.com/proxy-from-env/-/proxy-from-env-1.0.0.tgz#33c50398f70ea7eb96d21f7b817630a55791c7ee" 246 | integrity sha1-M8UDmPcOp+uW0h97gXYwpVeRx+4= 247 | 248 | pump@^3.0.0: 249 | version "3.0.0" 250 | resolved "https://registry.yarnpkg.com/pump/-/pump-3.0.0.tgz#b4a2116815bde2f4e1ea602354e8c75565107a64" 251 | integrity sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww== 252 | dependencies: 253 | end-of-stream "^1.1.0" 254 | once "^1.3.1" 255 | 256 | puppeteer-core@^2.1.1: 257 | version "2.1.1" 258 | resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-2.1.1.tgz#e9b3fbc1237b4f66e25999832229e9db3e0b90ed" 259 | integrity sha512-n13AWriBMPYxnpbb6bnaY5YoY6rGj8vPLrz6CZF3o0qJNEwlcfJVxBzYZ0NJsQ21UbdJoijPCDrM++SUVEz7+w== 260 
| dependencies: 261 | "@types/mime-types" "^2.1.0" 262 | debug "^4.1.0" 263 | extract-zip "^1.6.6" 264 | https-proxy-agent "^4.0.0" 265 | mime "^2.0.3" 266 | mime-types "^2.1.25" 267 | progress "^2.0.1" 268 | proxy-from-env "^1.0.0" 269 | rimraf "^2.6.1" 270 | ws "^6.1.0" 271 | 272 | readable-stream@^2.2.2: 273 | version "2.3.7" 274 | resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-2.3.7.tgz#1eca1cf711aef814c04f62252a36a62f6cb23b57" 275 | integrity sha512-Ebho8K4jIbHAxnuxi7o42OrZgF/ZTNcsZj6nRKyUmkhLFq8CHItp/fy6hQZuZmP/n3yZ9VBUbp4zz/mX8hmYPw== 276 | dependencies: 277 | core-util-is "~1.0.0" 278 | inherits "~2.0.3" 279 | isarray "~1.0.0" 280 | process-nextick-args "~2.0.0" 281 | safe-buffer "~5.1.1" 282 | string_decoder "~1.1.1" 283 | util-deprecate "~1.0.1" 284 | 285 | readable-stream@^3.0.1, readable-stream@^3.1.1: 286 | version "3.6.0" 287 | resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-3.6.0.tgz#337bbda3adc0706bd3e024426a286d4b4b2c9198" 288 | integrity sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA== 289 | dependencies: 290 | inherits "^2.0.3" 291 | string_decoder "^1.1.1" 292 | util-deprecate "^1.0.1" 293 | 294 | rimraf@^2.6.1: 295 | version "2.7.1" 296 | resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-2.7.1.tgz#35797f13a7fdadc566142c29d4f07ccad483e3ec" 297 | integrity sha512-uWjbaKIK3T1OSVptzX7Nl6PvQ3qAGtKEtVRjRuazjfL3Bx5eI409VZSqgND+4UNnmzLVdPj9FqFJNPqBZFve4w== 298 | dependencies: 299 | glob "^7.1.3" 300 | 301 | safe-buffer@~5.1.0, safe-buffer@~5.1.1: 302 | version "5.1.2" 303 | resolved "https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.1.2.tgz#991ec69d296e0313747d59bdfd2b745c35f8828d" 304 | integrity sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g== 305 | 306 | safe-buffer@~5.2.0: 307 | version "5.2.1" 308 | resolved 
"https://registry.yarnpkg.com/safe-buffer/-/safe-buffer-5.2.1.tgz#1eaf9fa9bdb1fdd4ec75f58f9cdb4e6b7827eec6" 309 | integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ== 310 | 311 | string_decoder@^1.1.1: 312 | version "1.3.0" 313 | resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.3.0.tgz#42f114594a46cf1a8e30b0a84f56c78c3edac21e" 314 | integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA== 315 | dependencies: 316 | safe-buffer "~5.2.0" 317 | 318 | string_decoder@~1.1.1: 319 | version "1.1.1" 320 | resolved "https://registry.yarnpkg.com/string_decoder/-/string_decoder-1.1.1.tgz#9cf1611ba62685d7030ae9e4ba34149c3af03fc8" 321 | integrity sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg== 322 | dependencies: 323 | safe-buffer "~5.1.0" 324 | 325 | tar-fs@^2.0.0: 326 | version "2.0.0" 327 | resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-2.0.0.tgz#677700fc0c8b337a78bee3623fdc235f21d7afad" 328 | integrity sha512-vaY0obB6Om/fso8a8vakQBzwholQ7v5+uy+tF3Ozvxv1KNezmVQAiWtcNmMHFSFPqL3dJA8ha6gdtFbfX9mcxA== 329 | dependencies: 330 | chownr "^1.1.1" 331 | mkdirp "^0.5.1" 332 | pump "^3.0.0" 333 | tar-stream "^2.0.0" 334 | 335 | tar-stream@^2.0.0: 336 | version "2.1.0" 337 | resolved "https://registry.yarnpkg.com/tar-stream/-/tar-stream-2.1.0.tgz#d1aaa3661f05b38b5acc9b7020efdca5179a2cc3" 338 | integrity sha512-+DAn4Nb4+gz6WZigRzKEZl1QuJVOLtAwwF+WUxy1fJ6X63CaGaUAxJRD2KEn1OMfcbCjySTYpNC6WmfQoIEOdw== 339 | dependencies: 340 | bl "^3.0.0" 341 | end-of-stream "^1.4.1" 342 | fs-constants "^1.0.0" 343 | inherits "^2.0.3" 344 | readable-stream "^3.1.1" 345 | 346 | typedarray@^0.0.6: 347 | version "0.0.6" 348 | resolved "https://registry.yarnpkg.com/typedarray/-/typedarray-0.0.6.tgz#867ac74e3864187b1d3d47d996a78ec5c8830777" 349 | integrity sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c= 350 | 351 | util-deprecate@^1.0.1, 
util-deprecate@~1.0.1: 352 | version "1.0.2" 353 | resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf" 354 | integrity sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8= 355 | 356 | wrappy@1: 357 | version "1.0.2" 358 | resolved "https://registry.yarnpkg.com/wrappy/-/wrappy-1.0.2.tgz#b5243d8f3ec1aa35f1364605bc0d1036e30ab69f" 359 | integrity sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8= 360 | 361 | ws@^6.1.0: 362 | version "6.2.2" 363 | resolved "https://registry.yarnpkg.com/ws/-/ws-6.2.2.tgz#dd5cdbd57a9979916097652d78f1cc5faea0c32e" 364 | integrity sha512-zmhltoSR8u1cnDsD43TX59mzoMZsLKqUweyYBAIvTngR3shc0W6aOZylZmq/7hqyVxPdi+5Ud2QInblgyE72fw== 365 | dependencies: 366 | async-limiter "~1.0.0" 367 | 368 | yauzl@2.4.1: 369 | version "2.4.1" 370 | resolved "https://registry.yarnpkg.com/yauzl/-/yauzl-2.4.1.tgz#9528f442dab1b2284e58b4379bb194e22e0c4005" 371 | integrity sha1-lSj0QtqxsihOWLQ3m7GU4i4MQAU= 372 | dependencies: 373 | fd-slicer "~1.0.1" 374 | -------------------------------------------------------------------------------- /src/send-pdf-to-textract/lambda.ts: -------------------------------------------------------------------------------- 1 | import * as AWS from "aws-sdk"; 2 | const textract = new AWS.Textract({ region: process.env.CDK_DEFAULT_REGION }); 3 | const SNS_TOPIC_ARN = process.env.SNS_TOPIC_ARN || ""; 4 | const SNS_ROLE_ARN = process.env.SNS_ROLE_ARN || ""; 5 | const S3_BUCKET_NAME = process.env.S3_BUCKET_NAME || ""; 6 | 7 | /** 8 | * handler 9 | * Starts an asynchronous Textract FORMS analysis for the PDF named in an S3 event. 10 | * Passes SNS_TOPIC_ARN / SNS_ROLE_ARN as the job's NotificationChannel. 11 | * @param event - S3 event; only the first record's object key is read 12 | * @throws rethrows any error from Textract.startDocumentAnalysis 13 | */ 14 | // TODO - replace `any` with correct event type 15 | export const handler = async (event: any = {}): Promise<void> => { 16 | // Logs starting message + event 17 | console.log("send-pdf-to-textract -> start"); 18 | console.log(JSON.stringify(event, null, 4)); 19 | 20 | // Pulls the object key from the event; optional chaining lets a malformed 21 | // event reach the short-circuit below instead of throwing a TypeError 22 | const rawKey = event?.Records?.[0]?.s3?.object?.key; 23 | 24 | // Short-circuit if filename isn't defined 25 | if (!rawKey) { 26 | console.log("ERROR - no filename found in S3 event"); 27 | 
return; 28 | } 29 | 30 | // S3 event notifications URL-encode the object key (a space arrives as "+"), 31 | // so decode it before passing the name to Textract 32 | const filename = decodeURIComponent(rawKey.replace(/\+/g, " ")); 33 | 34 | // Logs filename 35 | console.log("filename: " + filename); 36 | 37 | // Defines params for Textract API call 38 | const params: AWS.Textract.StartDocumentAnalysisRequest = { 39 | DocumentLocation: { 40 | S3Object: { 41 | Bucket: S3_BUCKET_NAME, 42 | Name: filename, 43 | }, 44 | }, 45 | FeatureTypes: ["FORMS"], 46 | NotificationChannel: { 47 | RoleArn: SNS_ROLE_ARN, 48 | SNSTopicArn: SNS_TOPIC_ARN, 49 | }, 50 | }; 51 | 52 | // Log startDocumentAnalysis param 53 | console.log("startDocumentAnalysis params"); 54 | console.log(params); 55 | 56 | // Invoke Textract.startDocumentAnalysis 57 | try { 58 | const data = await textract.startDocumentAnalysis(params).promise(); 59 | 60 | // Logs success state 61 | console.log("startDocumentAnalysis - data"); 62 | console.log(data); 63 | } catch (err) { 64 | // Logs error state, then rethrows so a failed call is not reported as success 65 | console.log("startDocumentAnalysis - err"); 66 | console.log(err); 67 | throw err; 68 | } 69 | 70 | // Logs shutdown message 71 | console.log("send-pdf-to-textract -> shutdown"); 72 | return; 73 | }; 74 | -------------------------------------------------------------------------------- /src/send-textract-result-to-dynamo/lambda.ts: -------------------------------------------------------------------------------- 1 | import * as AWS from "aws-sdk"; 2 | const db = new AWS.DynamoDB.DocumentClient(); 3 | const textract = new AWS.Textract({ region: process.env.CDK_DEFAULT_REGION }); 4 | const TABLE_NAME = process.env.TABLE_NAME || ""; 5 | const PRIMARY_KEY = process.env.PRIMARY_KEY || ""; 6 | 7 | // // // // 8 | // find_value_block: returns the value block that the key block's VALUE relationship points at (last id wins), or "" if there is none 9 | // DOC: https://docs.aws.amazon.com/textract/latest/dg/examples-extract-kvp.html 10 | // DOC: https://docs.aws.amazon.com/textract/latest/dg/examples-export-table-csv.html 11 | function find_value_block(key_block: any, value_map: any) { 12 | let value_block = ""; 13 | key_block["Relationships"].forEach((relationship: any) => { 14 | if (relationship["Type"] == "VALUE") 
{ 15 | relationship["Ids"].forEach((value_id: any) => { 16 | value_block = value_map[value_id]; 17 | }); 18 | } 19 | }); 20 | return value_block; 21 | } 22 | 23 | // // // // 24 | // get_text: joins the Text of a block's CHILD WORD blocks (each followed by a space); a SELECTED SELECTION_ELEMENT child contributes "X " 25 | function get_text(result: any, blocks_map: any) { 26 | let text = ""; 27 | // `result` is "" or undefined when no value block was resolved for a key 28 | if (result && result["Relationships"]) { 29 | result["Relationships"].forEach((relationship: any) => { 30 | if (relationship["Type"] === "CHILD") { 31 | relationship["Ids"].forEach((child_id: any) => { 32 | const word = blocks_map[child_id]; 33 | // Skip ids that are absent from blocks_map instead of throwing 34 | if (!word) return; 35 | if (word["BlockType"] === "WORD") { 36 | text += word["Text"] + " "; 37 | } else if ( 38 | word["BlockType"] === "SELECTION_ELEMENT" && word["SelectionStatus"] === "SELECTED" 39 | ) { 40 | text += "X "; 41 | } 42 | }); 43 | } 44 | }); 45 | } 46 | return text; 47 | } 48 | 49 | // // // // 50 | // getKvMap: indexes resp["Blocks"] by Id and returns [key_map, value_map, block_map]; a KEY_VALUE_SET block goes to key_map when its EntityTypes includes "KEY", otherwise to value_map 51 | function getKvMap(resp: any) { 52 | // get key and value maps 53 | const key_map: any = {}; 54 | const value_map: any = {}; 55 | const block_map: any = {}; 56 | 57 | resp["Blocks"].forEach((block: any) => { 58 | const block_id = block["Id"]; 59 | block_map[block_id] = block; 60 | if (block["BlockType"] === "KEY_VALUE_SET") { 61 | if (block["EntityTypes"].includes("KEY")) { 62 | key_map[block_id] = block; 63 | } else { 64 | value_map[block_id] = block; 65 | } 66 | } 67 | }); 68 | 69 | return [key_map, value_map, block_map]; 70 | } 71 | 72 | // // // // 73 | // getKvRelationship: builds { key text: value text } for every key block; a repeated key text overwrites the earlier entry 74 | function getKvRelationship(keyMap: any, valueMap: any, blockMap: any) { 75 | const kvs: any = {}; 76 | // Resolve each key block to its value block, then both to text 77 | Object.keys(keyMap).forEach((blockId) => { 78 | const keyBlock = keyMap[blockId]; 79 | const value_block = find_value_block(keyBlock, valueMap); 80 | // value_block is "" when the key block has no VALUE relationship 81 | 82 | // Gets Key + Value 83 | const key = get_text(keyBlock, blockMap); 84 | const val = get_text(value_block, blockMap); 85 | kvs[key] = val; 86 | }); 87 | 88 | return kvs; 89 | } 90 | 91 | // // // // 92 | 93 | /** 94 | * handler 95 | * Trims down result from Textract and sends to 
DynamoDB 96 | * @param event - AWS SNS event 97 | * Example `event` parameter: 98 | * { 99 | * "Records": [ 100 | * { 101 | * "EventSource": "aws:sns", 102 | * "EventVersion": "1.0", 103 | * "EventSubscriptionArn": "arn:aws:sns:us-east-1:839811712080:LambdaCronExample-MyTopic86869434-GNU4OYHJJK2B:2835b150-7b7c-4701-b345-1a26aa997ba0", 104 | * "Sns": { 105 | * "Type": "Notification", 106 | * "MessageId": "a0fcdb52-33c5-5e75-a29d-8d9f16c6efa0", 107 | * "TopicArn": "arn:aws:sns:us-east-1:839811712080:LambdaCronExample-MyTopic86869434-GNU4OYHJJK2B", 108 | * "Subject": null, 109 | * "Message": "{\"JobId\":\"8ace6713ef0f85fbd88294d4f50b5063ad08052f93da760e98da55668f3e1148\",\"Status\":\"SUCCEEDED\",\"API\":\"StartDocumentTextDetection\",\"Timestamp\":1582506691353,\"DocumentLocation\":{\"S3ObjectName\":\"5055255.pdf\",\"S3Bucket\":\"lambdacronexample-cogccpdfdownloadsbucket93b40e01-1kn95iu6zt174\"}}", 110 | * "Timestamp": "2020-02-24T01:11:31.395Z", 111 | * "SignatureVersion": "1", 112 | * "Signature": "drLfHmCEegFSc4oLYO/5y8ouKkHQLEsDo2l9tFFFtUGTUcbnIhFHYvQfbTND9BxE8a18kZ+nDBHuLlNhF67oVW0B2I8oy3svlYc6oeRUcgg6wF8TqlPpBwsG+UCnP81OIjtcb0VutqeYonlg8EDuXYK/pPumDsQ1NIkKjfwncdLPJLsgiuZZOkkRnvui5qftLSRkXtI1EXdwhIIXNyU3jK0MhEWZ/69K2mpZRSkb1jy2nkfQi1zlhktF4AfQpq4bMVxaBTq36Hb4FXXpzcPO2CLN2XchAIszd4vDAiEy9oSKJIW0IxqY5bazk70/lCva+AaMHoUAUazHHXamOZC4nw==", 113 | * "SigningCertUrl": "https://sns.us-east-1.amazonaws.com/SimpleNotificationService-a86cb10b4e1f29c941702d737128f7b6.pem", 114 | * "UnsubscribeUrl": "https://sns.us-east-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-east-1:839811712080:LambdaCronExample-MyTopic86869434-GNU4OYHJJK2B:2835b150-7b7c-4701-b345-1a26aa997ba0", 115 | * "MessageAttributes": {} 116 | * } 117 | * } 118 | * ] 119 | * } 120 | */ 121 | export const handler = async (event: any = {}): Promise<void> => { 122 | // Logs starting message 123 | console.log("send-textract-result-to-dynamo - start"); 124 | console.log(JSON.stringify(event, null, 4)); 
125 | 126 | // Defines variable to store JobId from Textract.analyzeDocument 127 | let JobId = ""; 128 | 129 | // Attempt to parse the JobId from the `event` param 130 | try { 131 | JobId = event["Records"][0]["Sns"]["Message"]; 132 | console.log("parsed jobid struct from event"); 133 | console.log(JobId); 134 | console.log(JSON.parse(JobId)); 135 | const jobIDStruct = JSON.parse(JobId); 136 | JobId = jobIDStruct["JobId"]; 137 | } catch (e) { 138 | // Logs error message from the failed parse 139 | console.log("Error parsing JobId from SNS message"); 140 | console.log(e); 141 | return; 142 | } 143 | 144 | // Log JobID 145 | console.log("JobId"); 146 | console.log(JobId); 147 | 148 | // Collects the blocks from every page of the Textract result; 149 | // getDocumentAnalysis returns NextToken while more blocks remain 150 | const blocks: any[] = []; 151 | let nextToken: string | undefined = undefined; 152 | try { 153 | do { 154 | // Defines params for Textract.getDocumentAnalysis 155 | const getDocumentAnalysisParams: AWS.Textract.Types.GetDocumentAnalysisRequest = 156 | nextToken ? { JobId, NextToken: nextToken } : { JobId }; 157 | 158 | // Logs getDocumentAnalysis params 159 | console.log("getDocumentAnalysisParams"); 160 | console.log(getDocumentAnalysisParams); 161 | 162 | // Fires off textract.getDocumentAnalysis 163 | const data: AWS.Textract.Types.GetDocumentAnalysisResponse = await textract 164 | .getDocumentAnalysis(getDocumentAnalysisParams) 165 | .promise(); 166 | 167 | // Logs successful response 168 | console.log("Textract - getDocumentAnalysis data"); 169 | console.log(data); 170 | 171 | blocks.push(...(data.Blocks || [])); 172 | nextToken = data.NextToken; 173 | } while (nextToken); 174 | } catch (err) { 175 | // Logs error response, then rethrows so the invocation is reported as failed 176 | console.log("Textract - getDocumentAnalysis error"); 177 | console.log(err); 178 | throw err; 179 | } 180 | 181 | // Gets KV mapping 182 | const [keyMap, valueMap, blockMap] = getKvMap({ Blocks: blocks }); 183 | 184 | // Get Key Value relationship 185 | const kvPairs = getKvRelationship(keyMap, valueMap, blockMap); 186 | 
187 | // Logs form key-value pairs from Textract response 188 | console.log("Got KV pairs"); 189 | 190 | // Sanitize KV pairs 191 | const sanitizedKvPairs: { [key: string]: string } = {}; 192 | 193 | // Iterate over each key in kvPairs 194 | Object.keys(kvPairs).forEach((key: string) => { 195 | // Sanitizes the key from kv pairs 196 | // DynamoDB key cannot contain any whitespace 197 | const sanitizedKey: string = key 198 | .toLowerCase() 199 | .trim() 200 | .replace(/\s/g, "_") 201 | .replace(/:/g, ""); // regex form strips every colon, not only the first 202 | 203 | // Pulls value from kvPairs, trims whitespace 204 | const value: string = kvPairs[key].trim(); 205 | 206 | // Assigns the trimmed value to sanitizedKey 207 | if (value !== "") { 208 | sanitizedKvPairs[sanitizedKey] = value; 209 | } 210 | }); 211 | 212 | // Logs sanitized key-value pairs 213 | console.log("SanitizedKvPairs"); 214 | console.log(sanitizedKvPairs); 215 | 216 | // Defines the item we're inserting into the database 217 | const item: any = { 218 | [PRIMARY_KEY]: JobId, 219 | data: sanitizedKvPairs, 220 | }; 221 | 222 | // Defines the params for db.put 223 | const putItemInput: AWS.DynamoDB.DocumentClient.PutItemInput = { 224 | TableName: TABLE_NAME, 225 | Item: item, 226 | }; 227 | 228 | // Logs DynamoDB params 229 | console.log("putItemInput"); 230 | console.log(putItemInput); 231 | 232 | // Inserts the record into the DynamoDB table 233 | await db.put(putItemInput).promise(); 234 | 235 | // Logs shutdown message 236 | console.log("send-textract-result-to-dynamo - shutdown"); 237 | }; 238 | -------------------------------------------------------------------------------- /src/stack.ts: -------------------------------------------------------------------------------- 1 | import * as s3 from "aws-cdk-lib/aws-s3"; 2 | import * as events from "aws-cdk-lib/aws-events"; 3 | import * as dynamodb from "aws-cdk-lib/aws-dynamodb"; 4 | import * as targets from "aws-cdk-lib/aws-events-targets"; 5 | import * as lambda from "aws-cdk-lib/aws-lambda"; 6 | import * as iam from "aws-cdk-lib/aws-iam"; 7 | import * as sns from "aws-cdk-lib/aws-sns"; 8 | import { Duration, RemovalPolicy, Stack } from "aws-cdk-lib"; 9 | import { 10 | DynamoEventSource, 11 | S3EventSource, 12 | SnsEventSource, 13 | } from 
"aws-cdk-lib/aws-lambda-event-sources"; 14 | import { Construct } from "constructs"; 15 | 16 | // // // // 17 | 18 | export class PdfTextractPipeline extends Stack { 19 | constructor(scope: Construct, id: string) { 20 | super(scope, id); 21 | 22 | // Provisions SNS topic for Textract asynchronous AnalyzeDocument process 23 | const snsTopic = new sns.Topic(this, "TextractTopic"); 24 | 25 | // Provisions IAM Role for Textract Service 26 | const textractServiceRole = new iam.Role(this, "TextractServiceRole", { 27 | assumedBy: new iam.ServicePrincipal("textract.amazonaws.com"), 28 | }); 29 | 30 | // Provisions PolicyStatement for textractServiceRole 31 | // NOTE - addActions and addResources should have more fine-grained policy settings 32 | const policyStatement = new iam.PolicyStatement(); 33 | policyStatement.addActions("*"); 34 | policyStatement.addResources("*"); 35 | textractServiceRole.addToPolicy(policyStatement); 36 | 37 | // Grant the textractServiceRole the ability to publish to snsTopic 38 | snsTopic.grantPublish(textractServiceRole); 39 | 40 | // Provisions S3 bucket for downloaded PDFs 41 | // Doc: https://docs.aws.amazon.com/cdk/api/latest/docs/aws-s3-readme.html#logging-configuration 42 | const downloadsBucket: s3.Bucket = new s3.Bucket( 43 | this, 44 | "cogcc_pdf_downloads_bucket" 45 | ); 46 | 47 | // // // // 48 | // Provisions DynamoDB tables 49 | // The default removal policy is RETAIN, which means that cdk destroy will not attempt to delete 50 | // the new table, and it will remain in your account until manually deleted. 
By setting the policy to 51 | // DESTROY, cdk destroy will delete the table (even if it has data in it) 52 | 53 | // Defines pdfUrlsTable for PDF Download URLs 54 | const pdfUrlsTable = new dynamodb.Table(this, "cogcc-pdf-urls", { 55 | partitionKey: { 56 | name: "itemId", 57 | type: dynamodb.AttributeType.STRING, 58 | }, 59 | stream: dynamodb.StreamViewType.NEW_IMAGE, 60 | tableName: "cogcc-pdf-urls", 61 | removalPolicy: RemovalPolicy.DESTROY, // NOTE - This removalPolicy is NOT recommended for production code 62 | }); 63 | 64 | // Defines DyanmoDB table for parsed PDF data 65 | const parsedPdfDataTable = new dynamodb.Table(this, "cogcc-pdf-data", { 66 | partitionKey: { 67 | name: "itemId", 68 | type: dynamodb.AttributeType.STRING, 69 | }, 70 | stream: dynamodb.StreamViewType.NEW_IMAGE, 71 | tableName: "cogcc-pdf-data", 72 | removalPolicy: RemovalPolicy.DESTROY, // NOTE - This removalPolicy is NOT recommended for production code 73 | }); 74 | 75 | // // // // 76 | // Provisions send-pdf-to-textract lambda 77 | 78 | // sendPdfToTextract Lambda 79 | const sendPdfToTextract = new lambda.Function( 80 | this, 81 | "sendPdfToTextractFunction", 82 | { 83 | code: new lambda.AssetCode("src/send-pdf-to-textract"), 84 | handler: "lambda.handler", 85 | runtime: lambda.Runtime.NODEJS_12_X, 86 | environment: { 87 | TABLE_NAME: parsedPdfDataTable.tableName, 88 | PRIMARY_KEY: "itemId", 89 | S3_BUCKET_NAME: downloadsBucket.bucketName, 90 | SNS_TOPIC_ARN: snsTopic.topicArn, 91 | SNS_ROLE_ARN: textractServiceRole.roleArn, 92 | }, 93 | } 94 | ); 95 | 96 | // Configure event source so the `sendPdfToTextract` is run each time a file is downloaded to S3 97 | // Doc: https://docs.aws.amazon.com/cdk/api/latest/docs/aws-lambda-event-sources-readme.html#s3 98 | sendPdfToTextract.addEventSource( 99 | new S3EventSource(downloadsBucket, { 100 | events: [s3.EventType.OBJECT_CREATED], 101 | }) 102 | ); 103 | 104 | // Add "textract:*" actions to sendPdfToTextract lambda 105 | 
sendPdfToTextract.addToRolePolicy( 106 | new iam.PolicyStatement({ 107 | actions: ["textract:*"], 108 | resources: ["*"], 109 | }) 110 | ); 111 | 112 | // Adds permissions for the sendPdfToTextract read/write from parsedPdfDataTable + downloadsBucket 113 | snsTopic.grantPublish(sendPdfToTextract); 114 | parsedPdfDataTable.grantReadWriteData(sendPdfToTextract); 115 | downloadsBucket.grantReadWrite(sendPdfToTextract); 116 | 117 | // // // // 118 | // Provisions send-textract-result-to-dynamo lambda 119 | 120 | // sendTextractResultToDynamo Lambda 121 | const sendTextractResultToDynamo = new lambda.Function( 122 | this, 123 | "sendTextractResultToDynamo", 124 | { 125 | code: new lambda.AssetCode( 126 | "src/send-textract-result-to-dynamo" 127 | ), 128 | handler: "lambda.handler", 129 | runtime: lambda.Runtime.NODEJS_12_X, 130 | timeout: Duration.seconds(300), 131 | environment: { 132 | TABLE_NAME: parsedPdfDataTable.tableName, 133 | PRIMARY_KEY: "itemId", 134 | SNS_TOPIC_ARN: snsTopic.topicArn, 135 | SNS_ROLE_ARN: textractServiceRole.roleArn, 136 | S3_BUCKET_NAME: downloadsBucket.bucketName, 137 | }, 138 | } 139 | ); 140 | 141 | // Adds permissions for the sendTextractResultToDynamo read/write from parsedPdfDataTable + downloadsBucket 142 | parsedPdfDataTable.grantReadWriteData(sendTextractResultToDynamo); 143 | downloadsBucket.grantReadWrite(sendTextractResultToDynamo); 144 | sendTextractResultToDynamo.addEventSource(new SnsEventSource(snsTopic)); 145 | sendTextractResultToDynamo.addToRolePolicy(policyStatement); 146 | 147 | // // // // 148 | // Provisions download-pdf-to-s3 lambda 149 | 150 | // Lambda to download files and insert them into S3 151 | const downloadPdfToS3Lambda = new lambda.Function( 152 | this, 153 | "downloadPdfToS3Lambda", 154 | { 155 | code: new lambda.AssetCode("src/download-pdf-to-s3"), 156 | handler: "lambda.handler", 157 | runtime: lambda.Runtime.NODEJS_12_X, 158 | environment: { 159 | TABLE_NAME: pdfUrlsTable.tableName, 160 | S3_BUCKET_NAME: 
downloadsBucket.bucketName, 161 | PRIMARY_KEY: "itemId", 162 | }, 163 | } 164 | ); 165 | 166 | // Adds permissions for the lambdaFn to read/write from pdfUrlsTable + downloadsBucket 167 | pdfUrlsTable.grantReadWriteData(downloadPdfToS3Lambda); 168 | downloadsBucket.grantReadWrite(downloadPdfToS3Lambda); 169 | 170 | // Add DynamoDB stream event source to downloadPdfToS3Lambda 171 | // Invoked once-per-document 172 | downloadPdfToS3Lambda.addEventSource( 173 | new DynamoEventSource(pdfUrlsTable, { 174 | startingPosition: lambda.StartingPosition.TRIM_HORIZON, 175 | batchSize: 1, 176 | }) 177 | ); 178 | 179 | // // // // 180 | // Provisions scrape-pdfs-from-website lambda 181 | // NOTE - we bump the memory to 1024mb here to accommodate the memory requirements for Puppeteer 182 | 183 | // DownloadURL Crawler Lambda 184 | const scrapePdfsFromWebsiteLambda = new lambda.Function( 185 | this, 186 | "scrapePdfsFromWebsiteLambda", 187 | { 188 | code: new lambda.AssetCode("src/scrape-pdfs-from-website"), 189 | handler: "lambda.handler", 190 | runtime: lambda.Runtime.NODEJS_12_X, 191 | timeout: Duration.seconds(300), 192 | memorySize: 1024, 193 | environment: { 194 | TABLE_NAME: pdfUrlsTable.tableName, 195 | PRIMARY_KEY: "itemId", 196 | }, 197 | } 198 | ); 199 | 200 | // Adds permissions for the scrapePdfsFromWebsiteLambda to read/write from pdfUrlsTable 201 | pdfUrlsTable.grantReadWriteData(scrapePdfsFromWebsiteLambda); 202 | 203 | // Run `scrape-pdfs-from-website` every 12 hours 204 | // See https://docs.aws.amazon.com/lambda/latest/dg/tutorial-scheduled-events-schedule-expressions.html 205 | const rule = new events.Rule(this, "Rule", { 206 | schedule: events.Schedule.expression("rate(720 minutes)"), 207 | }); 208 | 209 | // Adds scrapePdfsFromWebsiteLambda as target for scheduled rule 210 | rule.addTarget(new targets.LambdaFunction(scrapePdfsFromWebsiteLambda)); 211 | } 212 | } 213 | -------------------------------------------------------------------------------- 
/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2018", 4 | "module": "commonjs", 5 | "lib": ["es2016", "dom", "es2017.object", "es2017.string"], 6 | "strict": true, 7 | "noImplicitAny": true, 8 | "strictNullChecks": true, 9 | "noImplicitThis": true, 10 | "alwaysStrict": true, 11 | "noUnusedLocals": true, 12 | "noUnusedParameters": true, 13 | "noImplicitReturns": true, 14 | "noFallthroughCasesInSwitch": false, 15 | "inlineSourceMap": true, 16 | "inlineSources": true, 17 | "experimentalDecorators": true, 18 | "strictPropertyInitialization": false 19 | } 20 | } 21 | --------------------------------------------------------------------------------