├── resources ├── s3.yml └── vpc.yml ├── tsconfig.json ├── src ├── lib │ ├── awsSecret.ts │ ├── logger.ts │ └── queryFilter.ts ├── @types │ └── awslambda │ │ └── index.d.ts └── functions │ ├── queryS3Express.ts │ ├── query.ts │ └── streamingQuery.ts ├── LICENSE ├── package.json ├── .gitignore ├── README.md └── serverless.yml /resources/s3.yml: -------------------------------------------------------------------------------- 1 | Resources: 2 | S3OneZoneExpressBucket: 3 | Type: AWS::S3Express::DirectoryBucket 4 | Properties: 5 | # See https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-s3express-directorybucket.html#cfn-s3express-directorybucket-bucketname 6 | BucketName: '${self:custom.s3.bucketName}' 7 | DataRedundancy: SingleAvailabilityZone 8 | LocationName: ${self:custom.s3.availabilityZoneId} 9 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "declaration": false, 4 | "target": "es2021", 5 | "strict": true, 6 | "preserveConstEnums": true, 7 | "noEmit": true, 8 | "sourceMap": false, 9 | "module":"es2022", 10 | "moduleResolution":"node", 11 | "esModuleInterop": true, 12 | "skipLibCheck": true, 13 | "forceConsistentCasingInFileNames": true, 14 | "isolatedModules": true, 15 | }, 16 | "exclude": ["node_modules"], 17 | "include": ["**/*/*.ts"] 18 | } 19 | -------------------------------------------------------------------------------- /src/lib/awsSecret.ts: -------------------------------------------------------------------------------- 1 | const { 2 | AWS_S3_ONE_ZONE_EXPRESS_ENDPOINT, 3 | AWS_REGION, 4 | AWS_ACCESS_KEY_ID, 5 | AWS_SECRET_ACCESS_KEY, 6 | AWS_SESSION_TOKEN, 7 | } = process.env; 8 | 9 | // Get the credentials via AWS SDK credential chain, in the case of AWS Lambda from the environment variables 10 | // See 
https://duckdb.org/docs/extensions/httpfs/s3api#configuration-and-authentication 11 | export const getAWSSecretQuery = (): string => { 12 | return `CREATE SECRET aws (TYPE S3, KEY_ID '${AWS_ACCESS_KEY_ID}', SECRET '${AWS_SECRET_ACCESS_KEY}', SESSION_TOKEN '${AWS_SESSION_TOKEN}', REGION '${AWS_REGION}', ENDPOINT '${AWS_S3_ONE_ZONE_EXPRESS_ENDPOINT}')`; 13 | } 14 | -------------------------------------------------------------------------------- /src/@types/awslambda/index.d.ts: -------------------------------------------------------------------------------- 1 | import { APIGatewayProxyEventV2, Context, Handler } from "aws-lambda"; 2 | import { Writable } from 'stream'; 3 | 4 | type Headers = { 5 | [header: string]: string | number; 6 | } 7 | 8 | type Metadata = { 9 | statusCode: number; 10 | headers: Headers; 11 | } 12 | 13 | global{ 14 | declare namespace awslambda { 15 | export namespace HttpResponseStream { 16 | function from(writable: Writable, metadata: Metadata): Writable; 17 | } 18 | 19 | export type StreamifyHandler = (event: APIGatewayProxyEventV2, responseStream: Writable, context: Context) => Promise; 20 | 21 | export function streamifyResponse(handler: StreamifyHandler) : Handler; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/lib/logger.ts: -------------------------------------------------------------------------------- 1 | import bunyan, { LogLevelString } from 'bunyan'; 2 | 3 | type LoggerOptions = { 4 | level?: LogLevelString, 5 | name?: string 6 | } 7 | 8 | export default class Logger { 9 | private level: LogLevelString; 10 | private name: string; 11 | private loggerInstance: bunyan | undefined; 12 | 13 | // See https://github.com/DefinitelyTyped/DefinitelyTyped/blob/master/types/bunyan/index.d.ts#L196 14 | constructor (options: LoggerOptions | undefined) { 15 | this.level = options?.level || process.env.LOG_LEVEL as LogLevelString || 'info' as LogLevelString; 16 | this.name = options?.name || 
`duckdb-lambda-logger`; 17 | } 18 | 19 | public getInstance() { 20 | if (!this.loggerInstance) { 21 | this.loggerInstance = bunyan.createLogger({ 22 | name: this.name, 23 | level: this.level, 24 | }); 25 | } 26 | return this.loggerInstance; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/lib/queryFilter.ts: -------------------------------------------------------------------------------- 1 | // Replaces remote queries that touch the DuckDB configuration (duckdb_settings, INSTALL, LOAD, SET/RESET, PRAGMA) with a harmless error SELECT. 2 | // Internal queries (isRemoteQuery = false) pass through unchanged; an empty or undefined query yields ''. 3 | export const filterQuery = (query: string | undefined, isRemoteQuery: boolean = true): string => { 4 | if (!query || !isRemoteQuery) { 5 | return query || ''; 6 | } 7 | const normalized = query.trim().toLowerCase(); 8 | if (normalized.includes('duckdb_settings')) { 9 | return `select 'Function is disabled' as error`; 10 | } else if (normalized.startsWith('install')) { 11 | return `select 'Extension installation disabled' as error`; 12 | } else if (normalized.startsWith('load')) { 13 | return `select 'Extension loading is disabled' as error`; 14 | } else if (/\b(?:re)?set\b/.test(normalized)) { 15 | // Match SET / RESET as whole words only, so identifiers such as "offset" or "dataset" are not rejected 16 | return `select 'Using SET is disabled' as error`; 17 | } else if (normalized.includes('pragma')) { 18 | return `select 'Using PRAGMA is disabled' as error`; 19 | } else { 20 | return query; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Tobi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following 
conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "serverless-duckdb", 3 | "version": "0.1.0", 4 | "description": "Running DuckDB on AWS Lambda", 5 | "scripts": { 6 | "cfn-lint": "cfn-lint .serverless/cloudformation-template-update-stack.json", 7 | "package": "sls package", 8 | "qa": "npm run package && npm run cfn-lint", 9 | "deploy": "sls deploy" 10 | }, 11 | "repository": { 12 | "type": "git", 13 | "url": "git@github.com:tobilg/serverless-duckdb.git" 14 | }, 15 | "author": "TobiLG ", 16 | "license": "MIT", 17 | "bugs": { 18 | "url": "https://github.com/tobilg/serverless-duckdb/issues" 19 | }, 20 | "homepage": "https://github.com/tobilg/serverless-duckdb#readme", 21 | "devDependencies": { 22 | "@types/aws-lambda": "^8.10.147", 23 | "@types/bunyan": "^1.8.11", 24 | "serverless": "^3.37.0", 25 | "serverless-esbuild": "^1.54.6", 26 | "serverless-iam-roles-per-function": "next", 27 | "serverless-prune-plugin": "^2.1.0", 28 | "typescript": "^5.7.3" 29 | }, 30 | "dependencies": { 31 | "aws-embedded-metrics": "^4.2.0", 32 | "aws-lambda": "^1.0.7", 33 | "aws-sdk": "^2.1692.0", 34 | "bunyan": "^1.8.15", 35 | "duckdb": "^1.1.3" 36 | } 37 | } 38 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 9 | # Diagnostic reports (https://nodejs.org/api/report.html) 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | *.pid.lock 17 | 18 | # Directory for instrumented libs generated by jscoverage/JSCover 19 | lib-cov 20 | 21 | # Coverage directory used by tools like istanbul 22 | coverage 23 | *.lcov 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (https://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # TypeScript v1 declaration files 45 | typings/ 46 | 47 | # TypeScript cache 48 | *.tsbuildinfo 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Microbundle cache 57 | .rpt2_cache/ 58 | .rts2_cache_cjs/ 59 | .rts2_cache_es/ 60 | .rts2_cache_umd/ 61 | 62 | # Optional REPL history 63 | .node_repl_history 64 | 65 | # Output of 'npm pack' 66 | *.tgz 67 | 68 | # Yarn Integrity file 69 | .yarn-integrity 70 | 71 | # dotenv environment variables file 72 | .env 73 | .env.test 74 | 75 | # parcel-bundler cache (https://parceljs.org/) 76 | .cache 77 | 78 | # Next.js build output 79 | .next 80 | 81 | # Nuxt.js build / generate output 82 | .nuxt 83 | dist 84 | 85 | # Gatsby files 86 | .cache/ 87 | # Comment in the public line in if your project uses Gatsby and *not* Next.js 88 | # 
https://nextjs.org/blog/next-9-1#public-directory-support 89 | # public 90 | 91 | # vuepress build output 92 | .vuepress/dist 93 | 94 | # Serverless directories 95 | .serverless/ 96 | 97 | # FuseBox cache 98 | .fusebox/ 99 | 100 | # DynamoDB Local files 101 | .dynamodb/ 102 | 103 | # TernJS port file 104 | .tern-port 105 | 106 | .webpack 107 | -------------------------------------------------------------------------------- /resources/vpc.yml: -------------------------------------------------------------------------------- 1 | # See: 2 | # * https://www.infoq.com/articles/aws-vpc-cloudformation/ 3 | # * https://www.infoq.com/articles/aws-vpc-cloudformation-part2/ 4 | # * https://templates.cloudonaut.io/en/stable/vpc/ 5 | 6 | Resources: 7 | 8 | VPC: 9 | Type: AWS::EC2::VPC 10 | Properties: 11 | CidrBlock: 10.0.0.0/20 12 | EnableDnsSupport: True 13 | EnableDnsHostnames: True 14 | InstanceTenancy: default 15 | 16 | InternetGateway: 17 | Type: AWS::EC2::InternetGateway 18 | 19 | GatewayAttachment: 20 | Type: AWS::EC2::VPCGatewayAttachment 21 | Properties: 22 | VpcId: !Ref VPC 23 | InternetGatewayId: !Ref InternetGateway 24 | 25 | PrivateASubnet: 26 | Type: AWS::EC2::Subnet 27 | Properties: 28 | AvailabilityZoneId: ${self:custom.s3.availabilityZoneId} 29 | CidrBlock: 10.0.0.0/23 30 | VpcId: !Ref VPC 31 | Tags: 32 | - Key: Name 33 | Value: 'Private Subnet A' 34 | - Key: Reach 35 | Value: private 36 | 37 | PrivateARouteTable: 38 | Type: AWS::EC2::RouteTable 39 | Properties: 40 | VpcId: !Ref VPC 41 | Tags: 42 | - Key: Name 43 | Value: 'Private A' 44 | 45 | PrivateASubnetRouteTableAssociation: 46 | Type: AWS::EC2::SubnetRouteTableAssociation 47 | Properties: 48 | RouteTableId: !Ref PrivateARouteTable 49 | SubnetId: !Ref PrivateASubnet 50 | 51 | NetworkAclPrivate: 52 | Type: 'AWS::EC2::NetworkAcl' 53 | Properties: 54 | VpcId: !Ref VPC 55 | Tags: 56 | - Key: Name 57 | Value: Private 58 | 59 | PrivateASubnetNetworkAclAssociation: 60 | Type: AWS::EC2::SubnetNetworkAclAssociation 
61 | Properties: 62 | NetworkAclId: !Ref NetworkAclPrivate 63 | SubnetId: !Ref PrivateASubnet 64 | 65 | NetworkAclEntryInPrivateAllowAll: 66 | Type: 'AWS::EC2::NetworkAclEntry' 67 | Properties: 68 | NetworkAclId: !Ref NetworkAclPrivate 69 | RuleNumber: 99 70 | Protocol: -1 71 | RuleAction: allow 72 | Egress: false 73 | CidrBlock: '0.0.0.0/0' 74 | 75 | NetworkAclEntryOutPrivateAllowAll: 76 | Type: 'AWS::EC2::NetworkAclEntry' 77 | Properties: 78 | NetworkAclId: !Ref NetworkAclPrivate 79 | RuleNumber: 99 80 | Protocol: -1 81 | RuleAction: allow 82 | Egress: true 83 | CidrBlock: '0.0.0.0/0' 84 | 85 | # See https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-ec2-vpcendpoint.html 86 | S3VPCEndpoint: 87 | Type: AWS::EC2::VPCEndpoint 88 | Properties: 89 | ServiceName: 'com.amazonaws.${self:provider.region}.s3express' 90 | RouteTableIds: 91 | - !Ref PrivateARouteTable 92 | VpcEndpointType: Gateway 93 | VpcId: !Ref VPC 94 | 95 | VpcEndpointSecurityGroup: 96 | Type: 'AWS::EC2::SecurityGroup' 97 | Properties: 98 | VpcId: !Ref VPC 99 | GroupDescription: 'Security group for VPC Endpoints' 100 | SecurityGroupIngress: 101 | - IpProtocol: tcp 102 | FromPort: 443 103 | ToPort: 443 104 | SourceSecurityGroupId: !GetAtt VpcEndpointLambdaSecurityGroup.GroupId 105 | 106 | VpcEndpointLambdaSecurityGroup: 107 | Type: 'AWS::EC2::SecurityGroup' 108 | Properties: 109 | VpcId: !Ref VPC 110 | GroupDescription: 'Security group for VPC Endpoint Lambda' 111 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # serverless-duckdb 2 | An example of how to run DuckDB on AWS Lambda & API Gateway. 
This will eventually deploy three Lambda functions: 3 | 4 | * An **API Gateway endpoint** to which DuckDB queries can be issued via a POST request, which is authenticated by an API Key 5 | * An **API Gateway endpoint** to which DuckDB queries can be issued via a POST request, providing access to S3 Express One Zone, and authenticated by an API Key 6 | * A **Function URL Lambda** that supports streaming the query results as an Apache Arrow IPC stream, which uses **NO** authentication by default (you can add `AWS_IAM` auth manually if you wish) 7 | 8 | Only the first and the third function are deployed by default. To deploy the S3 Express One Zone function as well, you need to uncomment the respective sections in the [serverless.yml](serverless.yml) file. 9 | 10 | ## Requirements 11 | You'll need a current v3 installation of the [Serverless Framework](https://serverless.com) on the machine you're planning to deploy the application from. 12 | 13 | Also, you'll have to set up your AWS credentials according to the [Serverless docs](https://www.serverless.com/framework/docs/providers/aws/guide/credentials/). 14 | 15 | ## Configuration 16 | DuckDB is automatically configured to use the [HTTPFS extension](https://duckdb.org/docs/extensions/httpfs), and uses the AWS credentials that are given to your Lambda function by its execution role. This means you can potentially query data that is available via HTTP(S) or in AWS S3 buckets. 17 | 18 | If you also want to query data (e.g. Parquet files) that resides in one or more S3 buckets, you'll have to uncomment and adjust the `iamRoleStatements` part of the function configuration in the [serverless.yml](serverless.yml#L45) file. Just replace the `YOUR-S3-INPUT-BUCKET-NAME` (and, if you want to write results, `YOUR-S3-OUTPUT-BUCKET-NAME`) placeholders with your actual S3 bucket names. 
19 | 20 | ## Deployment 21 | After you have cloned this repository to your local machine and cd'ed into its directory, the application can be deployed like this (don't forget an `npm i` to install the dependencies!): 22 | 23 | ```bash 24 | $ sls deploy 25 | ``` 26 | 27 | This will deploy the stack to the default AWS region `us-east-1`. In case you want to deploy the stack to a different region, you can specify a `--region` argument: 28 | 29 | ```bash 30 | $ sls deploy --region eu-central-1 31 | ``` 32 | 33 | The deployment should take 2-3 minutes. Once the deployment is finished, you should find some output in your console that indicates the API Gateway endpoint URL and the API Key: 34 | 35 | ```yaml 36 | api keys: 37 | DuckDBApiKey: REDACTED 38 | endpoints: 39 | POST - https://REDACTED.execute-api.us-east-1.amazonaws.com/prd/v1/query 40 | streamingQuery: https://REDACTED.lambda-url.us-east-1.on.aws/ 41 | ``` 42 | 43 | ## Usage 44 | 45 | ### API Gateway endpoint 46 | You can now query your DuckDB endpoint via HTTP requests (don't forget to replace `REDACTED` with your real URL and API Key), e.g. 47 | 48 | ```bash 49 | curl -L -XPOST 'https://REDACTED.execute-api.us-east-1.amazonaws.com/prd/v1/query' \ 50 | --header 'x-api-key: REDACTED' \ 51 | --header 'Content-Type: application/json' \ 52 | --data-raw '{ 53 | "query": "SELECT avg(c_acctbal) FROM '\''https://shell.duckdb.org/data/tpch/0_01/parquet/customer.parquet'\'';" 54 | }' 55 | ``` 56 | 57 | ### API Gateway endpoint with S3 Express One Zone 58 | You can now query your DuckDB endpoint via HTTP requests (don't forget to replace `REDACTED` with your real URL and API Key), e.g. 
59 | 60 | ```bash 61 | curl -L -XPOST 'https://REDACTED.execute-api.us-east-1.amazonaws.com/prd/v1/queryS3Express' \ 62 | --header 'x-api-key: REDACTED' \ 63 | --header 'Content-Type: application/json' \ 64 | --data-raw '{ 65 | "query": "SELECT avg(c_acctbal) FROM '\''https://shell.duckdb.org/data/tpch/0_01/parquet/customer.parquet'\'';" 66 | }' 67 | ``` 68 | 69 | ### Function URL Lambda 70 | You can query the streaming Lambda by issuing the following command (don't forget to specify an `--output` path; this is where the Apache Arrow file will be stored): 71 | 72 | ```bash 73 | curl -L -XPOST 'https://REDACTED.lambda-url.us-east-1.on.aws/' \ 74 | --header 'Content-Type: application/json' \ 75 | --data-raw 'SELECT 1' \ 76 | --output /tmp/result.arrow 77 | ``` -------------------------------------------------------------------------------- /src/functions/queryS3Express.ts: -------------------------------------------------------------------------------- 1 | import { APIGatewayEvent, Context } from 'aws-lambda'; 2 | import DuckDB from 'duckdb'; 3 | import { metricScope, Unit } from 'aws-embedded-metrics'; 4 | import Logger from '../lib/logger'; 5 | import { filterQuery } from '../lib/queryFilter'; 6 | import { getAWSSecretQuery } from '../lib/awsSecret'; 7 | 8 | // Patch BigInt 9 | (BigInt.prototype as any).toJSON = function() { 10 | return this.toString() 11 | } 12 | 13 | // Instantiate logger 14 | const logger = new Logger({ 15 | name: 'duckdb-sync-logger', 16 | }).getInstance(); 17 | 18 | // Instantiate DuckDB 19 | const duckDB = new DuckDB.Database(':memory:', { allow_unsigned_extensions: 'true' }); 20 | 21 | // Create connection 22 | const connection = duckDB.connect(); 23 | 24 | // Store initialization 25 | let isInitialized = false; 26 | 27 | // Promisify query method 28 | const query = (query: string, isRemoteQuery: boolean = true) => { 29 | return new Promise((resolve, reject) => { 30 | connection.all(filterQuery(query, isRemoteQuery), (err, res) => { 31 | if 
(err) reject(err); 32 | resolve(res); 33 | }) 34 | }) 35 | } 36 | 37 | // SIGTERM Handler 38 | process.on('SIGTERM', async () => { 39 | logger.debug('[runtime] SIGTERM received'); 40 | logger.debug('[runtime] cleaning up'); 41 | 42 | // Add your cleanup code here! 43 | 44 | logger.debug('[runtime] exiting'); 45 | process.exit(0) 46 | }); 47 | 48 | // eslint-disable-next-line import/prefer-default-export 49 | export const handler = metricScope(metrics => async (event: APIGatewayEvent, context: Context) => { 50 | // Setup logger 51 | const requestLogger = logger.child({ requestId: context.awsRequestId }); 52 | requestLogger.debug({ event, context }); 53 | 54 | // Setup metrics 55 | metrics.putDimensions({ Service: 'QueryService' }); 56 | metrics.setProperty('RequestId', context.awsRequestId); 57 | 58 | try { 59 | if (!event.body) { 60 | throw new Error('No body present!'); // Throw an Error (not a bare string) so the catch block can return err.message to the caller 61 | } else { 62 | // Parse event body with query 63 | const body = JSON.parse(event.body); 64 | 65 | if (!body.hasOwnProperty('query')) { 66 | throw new Error('Missing query property in request body!'); 67 | } 68 | 69 | // Check if DuckDB has been initialized 70 | if (!isInitialized) { 71 | const initialSetupStartTimestamp = new Date().getTime(); 72 | 73 | // Load home directory 74 | await query(`SET home_directory='/tmp';`, false); 75 | 76 | // Install and load local extensions 77 | await query(`INSTALL '/opt/nodejs/node_modules/duckdb/extensions/aws.duckdb_extension';`, false); 78 | await query(`LOAD '/opt/nodejs/node_modules/duckdb/extensions/aws.duckdb_extension';`, false); 79 | await query(`INSTALL '/opt/nodejs/node_modules/duckdb/extensions/httpfs.duckdb_extension';`, false); 80 | await query(`LOAD '/opt/nodejs/node_modules/duckdb/extensions/httpfs.duckdb_extension';`, false); 81 | await query(`INSTALL '/opt/nodejs/node_modules/duckdb/extensions/arrow.duckdb_extension';`, false); 82 | await query(`LOAD '/opt/nodejs/node_modules/duckdb/extensions/arrow.duckdb_extension';`, false); 83 | 84 | // Whether or not the 
global http metadata is used to cache HTTP metadata, see https://github.com/duckdb/duckdb/pull/5405 85 | await query(`SET enable_http_metadata_cache=true;`, false); 86 | // Whether or not object cache is used to cache e.g. Parquet metadata 87 | await query(`SET enable_object_cache=true;`, false); 88 | // Disable local filesystem 89 | await query(`SET disabled_filesystems = 'LocalFileSystem';`, false); 90 | // Enable lock configuration 91 | await query(`SET lock_configuration = true;`, false); 92 | 93 | requestLogger.debug({ message: 'Initial setup done!' }); 94 | metrics.putMetric('InitialSetupDuration', (new Date().getTime() - initialSetupStartTimestamp), Unit.Milliseconds); 95 | 96 | const awsSetupStartTimestamp = new Date().getTime(); 97 | 98 | // Set AWS credentials, endpoint and region 99 | await query(getAWSSecretQuery(), false); 100 | 101 | requestLogger.debug({ message: 'AWS setup done!' }); 102 | metrics.putMetric('AWSSetupDuration', (new Date().getTime() - awsSetupStartTimestamp), Unit.Milliseconds); 103 | 104 | // Store initialization 105 | isInitialized = true; 106 | } 107 | 108 | // Track query start timestamp 109 | const queryStartTimestamp = new Date().getTime(); 110 | 111 | // Run query 112 | const queryResult = await query(body.query); 113 | requestLogger.debug({ queryResult }); 114 | 115 | metrics.putMetric('QueryDuration', (new Date().getTime() - queryStartTimestamp), Unit.Milliseconds); 116 | 117 | return { 118 | statusCode: 200, 119 | body: JSON.stringify(queryResult), 120 | } 121 | } 122 | } catch (err: any) { 123 | requestLogger.error(err); 124 | return { 125 | statusCode: 400, 126 | body: JSON.stringify({ 127 | error: (err.message ? 
err.message : 'Unknown error encountered'), 128 | }), 129 | } 130 | } 131 | }) 132 | -------------------------------------------------------------------------------- /src/functions/query.ts: -------------------------------------------------------------------------------- 1 | import { APIGatewayEvent, Context } from 'aws-lambda'; 2 | import DuckDB from 'duckdb'; 3 | import { metricScope, Unit } from 'aws-embedded-metrics'; 4 | import Logger from '../lib/logger'; 5 | import { filterQuery } from '../lib/queryFilter'; 6 | 7 | // Patch BigInt 8 | (BigInt.prototype as any).toJSON = function() { 9 | return this.toString() 10 | } 11 | 12 | // Instantiate logger 13 | const logger = new Logger({ 14 | name: 'duckdb-sync-logger', 15 | }).getInstance(); 16 | 17 | // Instantiate DuckDB 18 | const duckDB = new DuckDB.Database(':memory:', { allow_unsigned_extensions: 'true' }); 19 | 20 | // Create connection 21 | const connection = duckDB.connect(); 22 | 23 | // Store initialization 24 | let isInitialized = false; 25 | 26 | // Promisify query method 27 | const query = (query: string, isRemoteQuery: boolean = true) => { 28 | return new Promise((resolve, reject) => { 29 | connection.all(filterQuery(query, isRemoteQuery), (err, res) => { 30 | if (err) reject(err); 31 | resolve(res); 32 | }) 33 | }) 34 | } 35 | 36 | // SIGTERM Handler 37 | process.on('SIGTERM', async () => { 38 | logger.debug('[runtime] SIGTERM received'); 39 | logger.debug('[runtime] cleaning up'); 40 | 41 | // Add your cleanup code here! 
42 | 43 | logger.debug('[runtime] exiting'); 44 | process.exit(0) 45 | }); 46 | 47 | // eslint-disable-next-line import/prefer-default-export 48 | export const handler = metricScope(metrics => async (event: APIGatewayEvent, context: Context) => { 49 | // Setup logger 50 | const requestLogger = logger.child({ requestId: context.awsRequestId }); 51 | requestLogger.debug({ event, context }); 52 | 53 | // Setup metrics 54 | metrics.putDimensions({ Service: 'QueryService' }); 55 | metrics.setProperty('RequestId', context.awsRequestId); 56 | 57 | try { 58 | if (!event.body) { 59 | throw new Error('No body present!'); // Throw an Error (not a bare string) so the catch block can return err.message to the caller 60 | } else { 61 | // Parse event body with query 62 | const body = JSON.parse(event.body); 63 | 64 | if (!body.hasOwnProperty('query')) { 65 | throw new Error('Missing query property in request body!'); 66 | } 67 | 68 | // Check if DuckDB has been initialized 69 | if (!isInitialized) { 70 | const initialSetupStartTimestamp = new Date().getTime(); 71 | 72 | // Load home directory 73 | await query(`SET home_directory='/tmp';`, false); 74 | 75 | // Install and load httpfs extension 76 | await query(`INSTALL httpfs;`, false); 77 | await query(`LOAD httpfs;`, false); 78 | 79 | // Enable loading of Lambda extensions from https://extensions.quacking.cloud (see website for list of extensions) 80 | await query(`SET custom_extension_repository = 'https://extensions.quacking.cloud';`, false); 81 | // Whether or not the global http metadata is used to cache HTTP metadata, see https://github.com/duckdb/duckdb/pull/5405 82 | await query(`SET enable_http_metadata_cache=true;`, false); 83 | // Whether or not object cache is used to cache e.g. Parquet metadata 84 | await query(`SET enable_object_cache=true;`, false); 85 | // Disable local filesystem 86 | await query(`SET disabled_filesystems = 'LocalFileSystem';`, false); 87 | // Enable lock configuration 88 | await query(`SET lock_configuration = true;`, false); 89 | 90 | requestLogger.debug({ message: 'Initial setup done!' 
}); 91 | metrics.putMetric('InitialSetupDuration', (new Date().getTime() - initialSetupStartTimestamp), Unit.Milliseconds); 92 | 93 | const awsSetupStartTimestamp = new Date().getTime(); 94 | 95 | // Set AWS credentials 96 | // See https://docs.aws.amazon.com/lambda/latest/dg/configuration-envvars.html#configuration-envvars-runtime 97 | // await query(`SET s3_region='${process.env.AWS_REGION}';`, false); 98 | // await query(`SET s3_access_key_id='${process.env.AWS_ACCESS_KEY_ID}';`, false); 99 | // await query(`SET s3_secret_access_key='${process.env.AWS_SECRET_ACCESS_KEY}';`, false); 100 | // await query(`SET s3_session_token='${process.env.AWS_SESSION_TOKEN}';`, false); 101 | 102 | requestLogger.debug({ message: 'AWS setup done!' }); 103 | metrics.putMetric('AWSSetupDuration', (new Date().getTime() - awsSetupStartTimestamp), Unit.Milliseconds); 104 | 105 | // Store initialization 106 | isInitialized = true; 107 | } 108 | 109 | // Track query start timestamp 110 | const queryStartTimestamp = new Date().getTime(); 111 | 112 | // Run query 113 | const queryResult = await query(body.query); 114 | requestLogger.debug({ queryResult }); 115 | 116 | metrics.putMetric('QueryDuration', (new Date().getTime() - queryStartTimestamp), Unit.Milliseconds); 117 | 118 | return { 119 | statusCode: 200, 120 | body: JSON.stringify(queryResult), 121 | } 122 | } 123 | } catch (err: any) { 124 | requestLogger.error(err); 125 | return { 126 | statusCode: 400, 127 | body: JSON.stringify({ 128 | error: (err.message ? 
err.message : 'Unknown error encountered'), 129 | }), 130 | } 131 | } 132 | }) 133 | -------------------------------------------------------------------------------- /serverless.yml: -------------------------------------------------------------------------------- 1 | service: serverless-duckdb 2 | 3 | frameworkVersion: '3' 4 | 5 | plugins: 6 | - serverless-iam-roles-per-function 7 | - serverless-prune-plugin 8 | - serverless-esbuild 9 | 10 | custom: 11 | 12 | # API details 13 | api: 14 | version: 'v1' 15 | 16 | # Availability Zone info 17 | # See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-express-networking.html#s3-express-endpoints 18 | # us-east-1 / az4 19 | # Uncomment if you want to use S3 Express One Zone 20 | # s3: 21 | # availabilityZoneId: 'use1-az4' 22 | # bucketName: 'serverless-duckdb--${self:custom.s3.availabilityZoneId}--x-s3' 23 | 24 | # esbuild plugin 25 | esbuild: 26 | bundle: true 27 | minify: false 28 | exclude: 29 | - 'duckdb' 30 | - 'aws-lambda' 31 | - 'dtrace-provider' 32 | 33 | # Prune plugin 34 | prune: 35 | automatic: true 36 | number: 3 37 | 38 | provider: 39 | name: aws 40 | runtime: nodejs20.x 41 | region: ${opt:region, 'us-east-1'} 42 | stage: 'prd' 43 | logRetentionInDays: 7 44 | apiGateway: 45 | apiKeys: 46 | - DuckDBApiKey 47 | environment: 48 | AWS_NODEJS_CONNECTION_REUSE_ENABLED: '1' # Enable HTTP keep-alive connections for the AWS SDK 49 | # See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-express-networking.html#s3-express-endpoints 50 | # Uncomment if you want to use S3 Express One Zone 51 | # AWS_S3_ONE_ZONE_EXPRESS_ENDPOINT: 's3express-${self:custom.s3.availabilityZoneId}.${self:provider.region}.amazonaws.com' 52 | STAGE: '${self:provider.stage}' 53 | LOG_LEVEL: 'debug' 54 | 55 | functions: 56 | 57 | # Streaming Lambda function: Will return results as Apache Arrow IPC stream 58 | streamingQuery: 59 | handler: src/functions/streamingQuery.handler 60 | memorySize: 2048 61 | timeout: 30 62 | url: 63 | 
invokeMode: RESPONSE_STREAM 64 | layers: 65 | # Latest x86 layer 66 | - 'arn:aws:lambda:${self:provider.region}:041475135427:layer:duckdb-nodejs-x86:18' 67 | 68 | # Request-Response Lambda function: Will return results as JSON en bloc 69 | query: 70 | handler: src/functions/query.handler 71 | memorySize: 2048 72 | timeout: 30 73 | # Enable this for arm64 support 74 | # architecture: arm64 75 | # Enable this for custom IAM roles for S3 access 76 | # iamRoleStatements: 77 | # # Read from input bucket 78 | # - Effect: Allow 79 | # Action: 80 | # - s3:GetObject 81 | # Resource: 'arn:aws:s3:::YOUR-S3-INPUT-BUCKET-NAME/*' 82 | # - Effect: Allow 83 | # Action: 84 | # - s3:ListBucket 85 | # Resource: 'arn:aws:s3:::YOUR-S3-INPUT-BUCKET-NAME' 86 | # # If you want to write to another output bucket, use the statements below 87 | # # (or use the same bucket name as the input bucket if you want to write to it as well) 88 | # - Effect: Allow 89 | # Action: 90 | # - s3:ListBucket 91 | # - s3:ListBucketMultipartUploads 92 | # Resource: 'arn:aws:s3:::YOUR-S3-OUTPUT-BUCKET-NAME' 93 | # - Effect: Allow 94 | # Action: 95 | # - s3:GetObject 96 | # - s3:PutObject 97 | # - s3:AbortMultipartUpload 98 | # - s3:ListMultipartUploadParts 99 | # Resource: 'arn:aws:s3:::YOUR-S3-OUTPUT-BUCKET-NAME/*' 100 | layers: 101 | # Latest x86_64 layer 102 | - 'arn:aws:lambda:${self:provider.region}:041475135427:layer:duckdb-nodejs-x86:18' 103 | # Latest arm64 layer 104 | # - 'arn:aws:lambda:${self:provider.region}:041475135427:layer:duckdb-nodejs-arm64:16' 105 | events: 106 | - http: 107 | path: ${self:custom.api.version}/query 108 | method: post 109 | cors: true 110 | private: true 111 | 112 | # Uncomment if you want to use S3 Express One Zone 113 | # queryS3Express: 114 | # handler: src/functions/queryS3Express.handler 115 | # # Max memory and thus vCPUs 116 | # memorySize: 10240 117 | # # Max timeout for API Gateway REST APIs 118 | # timeout: 30 119 | # # Enable this for custom IAM roles for S3 access 
120 | # iamRoleStatements: 121 | # # Read from S3 Express One Zone bucket 122 | # # See https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-express-security-iam.html#s3-express-security-iam-authorization 123 | # - Effect: Allow 124 | # Action: 125 | # - s3express:CreateSession 126 | # Resource: !Join [':', [ 'arn:aws:s3express', '${self:provider.region}', !Ref 'AWS::AccountId', 'bucket/${self:custom.s3.bucketName}' ]] 127 | # layers: 128 | # # Latest x86_64 extensions layer (includes aws, arrow, httpfs) 129 | # - 'arn:aws:lambda:${self:provider.region}:041475135427:layer:duckdb-extensions-nodejs-x86:6' 130 | # vpc: 131 | # securityGroupIds: 132 | # - !GetAtt VpcEndpointLambdaSecurityGroup.GroupId 133 | # subnetIds: 134 | # - !GetAtt PrivateASubnet.SubnetId 135 | # events: 136 | # - http: 137 | # path: ${self:custom.api.version}/queryS3Express 138 | # method: post 139 | # cors: true 140 | # private: true 141 | 142 | resources: 143 | # Uncomment if you want to use S3 Express One Zone 144 | # # VPC 145 | # - ${file(resources/vpc.yml)} 146 | # # S3 One Zone Express 147 | # - ${file(resources/s3.yml)} 148 | 149 | package: 150 | individually: true 151 | -------------------------------------------------------------------------------- /src/functions/streamingQuery.ts: -------------------------------------------------------------------------------- 1 | import { APIGatewayProxyEventV2, Context } from 'aws-lambda'; 2 | import { Writable, pipeline } from 'stream'; 3 | import { promisify } from 'util'; 4 | import DuckDB from 'duckdb'; 5 | import Logger from '../lib/logger'; 6 | import { Metadata } from '../@types/awslambda'; 7 | import { filterQuery } from '../lib/queryFilter'; 8 | 9 | // Create pipeline stream 10 | const Pipeline = promisify(pipeline); 11 | 12 | // Instantiate logger 13 | const logger = new Logger({ 14 | name: 'duckdb-streaming-logger', 15 | }).getInstance(); 16 | 17 | // Instantiate DuckDB 18 | const duckDB = new DuckDB.Database(':memory:', { 
allow_unsigned_extensions: 'true' });

// Create a single connection that is reused across invocations of this module instance
const connection = duckDB.connect();

// Tracks whether the one-time DuckDB setup has completed for this module instance
let isInitialized = false;

/**
 * Promise wrapper around `connection.all`.
 *
 * @param query SQL text; it is passed through `filterQuery` before execution.
 * @param isRemoteQuery Forwarded unchanged to `filterQuery`. Internal setup
 *   statements pass `false`.
 * @returns A promise resolving with whatever `connection.all` hands to its callback.
 */
const query = (query: string, isRemoteQuery: boolean = true) => {
  return new Promise((resolve, reject) => {
    connection.all(filterQuery(query, isRemoteQuery), (err, res) => {
      if (err) {
        // Return so that resolve() is not called after a rejection
        reject(err);
        return;
      }
      resolve(res);
    });
  });
};

/**
 * Runs the one-time DuckDB setup statements (home directory, extensions, caches, lockdown).
 *
 * Statement order matters: `lock_configuration` is set last, because no further
 * `SET` statements are expected to succeed afterwards.
 *
 * @param log Logger used for progress messages.
 */
const initializeDuckDB = async (log: typeof logger): Promise<void> => {
  // Load home directory
  await query(`SET home_directory='/tmp';`, false);

  // Install and load httpfs extension
  await query(`INSTALL httpfs;`, false);
  await query(`LOAD httpfs;`, false);

  // Enable loading of Lambda extensions from https://extensions.quacking.cloud (see website for list of extensions)
  await query(`SET custom_extension_repository = 'https://extensions.quacking.cloud';`, false);

  // Install the Apache Arrow extension
  await query(`INSTALL arrow;`, false);
  await query(`LOAD arrow;`, false);

  // Whether or not the global http metadata is used to cache HTTP metadata, see https://github.com/duckdb/duckdb/pull/5405
  await query(`SET enable_http_metadata_cache=true;`, false);
  // Whether or not object cache is used to cache e.g. Parquet metadata
  await query(`SET enable_object_cache=true;`, false);
  // Disable local filesystem
  await query(`SET disabled_filesystems = 'LocalFileSystem';`, false);
  // Enable lock configuration
  await query(`SET lock_configuration = true;`, false);

  log.debug({ message: 'Initial setup done!' });

  // Set AWS credentials
  // See https://docs.aws.amazon.com/lambda/latest/dg/configuration-envvars.html#configuration-envvars-runtime
  // await query(`SET s3_region='${process.env.AWS_REGION}';`, false);
  // await query(`SET s3_access_key_id='${process.env.AWS_ACCESS_KEY_ID}';`, false);
  // await query(`SET s3_secret_access_key='${process.env.AWS_SECRET_ACCESS_KEY}';`, false);
  // await query(`SET s3_session_token='${process.env.AWS_SESSION_TOKEN}';`, false);

  log.debug({ message: 'AWS setup done!' });
};

/**
 * Sends a complete plain-text response: sets status code and content type on
 * `metadata`, attaches the metadata to the stream, writes `text` and ends the stream.
 *
 * A body is always written, because a response without one does not expose the
 * metadata (and therefore the CORS headers) to the client.
 */
const sendText = (stream: Writable, metadata: Metadata, statusCode: number, text: string): void => {
  metadata.statusCode = statusCode;
  metadata.headers['Content-Type'] = 'text/plain';
  // Use global helper to pass metadata and status code
  const wrapped = awslambda.HttpResponseStream.from(stream, metadata);
  wrapped.write(text);
  wrapped.end();
};

// SIGTERM Handler
process.on('SIGTERM', async () => {
  logger.debug('[runtime] SIGTERM received');
  logger.debug('[runtime] cleaning up');

  // Add your cleanup code here!

  logger.debug('[runtime] exiting');
  process.exit(0);
});

/**
 * Streaming Lambda handler.
 *
 * - `OPTIONS`: answers with `200 OK` and the CORS headers.
 * - `POST`: treats the request body as a SQL query, runs it through DuckDB and
 *   streams the result to the client as an Arrow IPC stream.
 * - any other method: answers with `400 ERROR`.
 *
 * Failures are reported as a `500` plain-text response carrying the error message,
 * as long as the response stream is still writable.
 */
// eslint-disable-next-line import/prefer-default-export
exports.handler = awslambda.streamifyResponse(async (
  event: APIGatewayProxyEventV2,
  responseStream: Writable,
  context: Context
): Promise<void> => {
  // Setup logger
  const requestLogger = logger.child({ requestId: context.awsRequestId });
  requestLogger.debug({ event, context });

  // Create default metadata for HTTP status code and headers
  const metadata: Metadata = {
    statusCode: 200,
    headers: {
      'Access-Control-Allow-Origin': '*',
      'Access-Control-Allow-Headers': '*',
      'Access-Control-Expose-Headers': '*',
      'Access-Control-Max-Age': 0,
      'Access-Control-Allow-Methods': '*',
    }
  };

  try {
    const method = event.requestContext.http.method;

    if (method === 'OPTIONS') {
      // Need to write something, otherwise metadata is not shown -> CORS error!
      sendText(responseStream, metadata, 200, 'OK');
    } else if (method === 'POST') {
      // The body may arrive base64-encoded; decode it before treating it as SQL
      const rawBody = event.body !== undefined && event.isBase64Encoded
        ? Buffer.from(event.body, 'base64').toString('utf8')
        : event.body;

      // Strip semicolons so that only a single statement can be submitted
      const body = rawBody?.replace(/;/g, '');
      requestLogger.debug({ body });

      // Reject requests without a query instead of handing `undefined` to DuckDB
      if (body === undefined || body.trim() === '') {
        sendText(responseStream, metadata, 400, 'ERROR: missing query in request body');
        return;
      }

      // Check if DuckDB has been initialized
      if (!isInitialized) {
        await initializeDuckDB(requestLogger);

        // Store initialization
        isInitialized = true;
      }

      // Create the Arrow IPC stream before attaching the 200 metadata, so that a
      // failure while creating it is still answered through the 500 path below.
      const arrowStream = await connection.arrowIPCStream(filterQuery(body));

      // Set Content-Type header
      metadata.headers['Content-Type'] = 'application/octet-stream';

      // Use global helper to pass metadata and status code
      responseStream = awslambda.HttpResponseStream.from(responseStream, metadata);

      // Pipeline the Arrow IPC stream to the response stream
      await Pipeline(arrowStream, responseStream);

      // pipeline() normally ends the destination itself; only close it if still open
      if (!responseStream.writableEnded) {
        responseStream.end();
      }
    } else { // Invalid request method
      sendText(responseStream, metadata, 400, 'ERROR');
    }
  } catch (e: unknown) {
    const message = e instanceof Error ? e.message : String(e);
    requestLogger.error(message);

    // After a failure in the middle of streaming, the response stream may already
    // be destroyed or ended; nothing more can be sent to the client in that case.
    if (responseStream.destroyed || responseStream.writableEnded) {
      return;
    }

    try {
      sendText(responseStream, metadata, 500, message);
    } catch (writeError: unknown) {
      // Presumably the response was already started; make sure the stream gets closed.
      requestLogger.error(writeError instanceof Error ? writeError.message : String(writeError));
      responseStream.destroy();
    }
  }
});
--------------------------------------------------------------------------------