├── .gitignore
├── LICENSE
├── README.md
├── package-lock.json
├── package.json
├── serverless.yml
└── src
    ├── repartitionData.js
    └── utils
        └── logger.js

/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | lerna-debug.log*
8 | 
9 | # Diagnostic reports (https://nodejs.org/api/report.html)
10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
11 | 
12 | # Runtime data
13 | pids
14 | *.pid
15 | *.seed
16 | *.pid.lock
17 | 
18 | # Directory for instrumented libs generated by jscoverage/JSCover
19 | lib-cov
20 | 
21 | # Coverage directory used by tools like istanbul
22 | coverage
23 | *.lcov
24 | 
25 | # nyc test coverage
26 | .nyc_output
27 | 
28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
29 | .grunt
30 | 
31 | # Bower dependency directory (https://bower.io/)
32 | bower_components
33 | 
34 | # node-waf configuration
35 | .lock-wscript
36 | 
37 | # Compiled binary addons (https://nodejs.org/api/addons.html)
38 | build/Release
39 | 
40 | # Dependency directories
41 | node_modules/
42 | jspm_packages/
43 | 
44 | # TypeScript v1 declaration files
45 | typings/
46 | 
47 | # TypeScript cache
48 | *.tsbuildinfo
49 | 
50 | # Optional npm cache directory
51 | .npm
52 | 
53 | # Optional eslint cache
54 | .eslintcache
55 | 
56 | # Microbundle cache
57 | .rpt2_cache/
58 | .rts2_cache_cjs/
59 | .rts2_cache_es/
60 | .rts2_cache_umd/
61 | 
62 | # Optional REPL history
63 | .node_repl_history
64 | 
65 | # Output of 'npm pack'
66 | *.tgz
67 | 
68 | # Yarn Integrity file
69 | .yarn-integrity
70 | 
71 | # dotenv environment variables file
72 | .env
73 | .env.test
74 | 
75 | # parcel-bundler cache (https://parceljs.org/)
76 | .cache
77 | 
78 | # Next.js build output
79 | .next
80 | 
81 | # Nuxt.js build / generate output
82 | .nuxt
83 | dist
84 | 
85 | # Gatsby files
86 | .cache/
87 | # Comment in the public line in if your project uses Gatsby and *not* Next.js
88 | # https://nextjs.org/blog/next-9-1#public-directory-support
89 | # public
90 | 
91 | # vuepress build output
92 | .vuepress/dist
93 | 
94 | # Serverless directories
95 | .serverless/
96 | 
97 | # FuseBox cache
98 | .fusebox/
99 | 
100 | # DynamoDB Local files
101 | .dynamodb/
102 | 
103 | # TernJS port file
104 | .tern-port
105 | 
106 | .serverless
107 | .esbuild
108 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Tobi
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # serverless-parquet-repartitioner
2 | An AWS Lambda function for repartitioning parquet files in S3 via DuckDB queries.
3 | 
4 | ## Requirements
5 | You'll need a current v3 installation of the [Serverless Framework](https://serverless.com) on the machine you're planning to deploy the application from.
6 | 
7 | Also, you'll have to set up your AWS credentials according to the [Serverless docs](https://www.serverless.com/framework/docs/providers/aws/guide/credentials/).
8 | 
9 | ### Install dependencies
10 | After cloning the repo, you'll need to install the dependencies via
11 | 
12 | ```bash
13 | $ npm i
14 | ```
15 | 
16 | ## Configuration
17 | You can customize the configuration of the stack by setting a few configuration values. Open up the [serverless.yml](serverless.yml) file, and search for `TODO` in your IDE. This will point you to the places you need to update according to your needs.
18 | 
19 | ### Mandatory configuration settings
20 | 
21 | * [S3 bucket name](serverless.yml#L18): You need to use the S3 bucket where the data you want to repartition resides (e.g. `my-source-bucket`)
22 | * [Custom repartitioning query](serverless.yml#L77): You can write flexible repartitioning queries in DuckDB syntax. Have a look at the examples in the [httpfs extension docs](https://duckdb.org/docs/extensions/httpfs). You **need** to update this, as the template only uses example values!
23 | 
24 | ### Optional configuration settings
25 | 
26 | * [S3 region](serverless.yml#L79): The AWS region your S3 bucket is deployed to (if different from the region the Lambda function is deployed to)
27 | * [The schedule](serverless.yml#L84): The schedule on which the Lambda function is run. Have a look at the [Serverless Framework docs](https://www.serverless.com/framework/docs/providers/aws/events/schedule) to find out what the potential settings are.
28 | * [DuckDB memory limit](serverless.yml#L48): The DuckDB memory limit is derived automatically from the function's memory setting
29 | * [DuckDB threads count](serverless.yml#L75): Optionally set the maximum thread count (on Lambda, this is otherwise derived automatically from the amount of memory the function has assigned). With this setting you can influence how many files are written per partition. If you set a lower thread count than is available, the computation will not use all available resources; that's the trade-off for controlling the number of generated files. Ideally, adjust the amount of memory you assign to the Lambda function instead (see the sketch below this list).
30 | * [Lambda timeout](serverless.yml#L50): The maximum time a Lambda function can run is currently 15min / 900sec. This means that if your query takes longer than that, it will be terminated by the underlying Firecracker engine.
31 | 
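For instance, pinning DuckDB to two threads on a 10 GB function with a daily schedule could look like this in the `functions` section of [serverless.yml](serverless.yml). This is a minimal sketch built from this template's own example values, not a drop-in replacement for the full file:

```yaml
functions:
  repartitionData:
    handler: src/repartitionData.handler
    # 10 GB of memory; this value also drives the DuckDB memory limit
    memorySize: 10240
    # 900 seconds is the current Lambda maximum
    timeout: 900
    environment:
      # Fewer threads than available means fewer files written per partition
      DUCKDB_THREADS: 2
    events:
      - schedule:
          rate: rate(24 hours)
```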
32 | ### Using different source/target S3 buckets
33 | If you're planning to use different S3 buckets as source and target for the data repartitioning, you need to adapt the `iamRoleStatements` settings of the function.
34 | 
35 | Here's an example with minimal privileges:
36 | 
37 | ```yaml
38 | iamRoleStatements:
39 |   # Source S3 bucket permissions
40 |   - Effect: Allow
41 |     Action:
42 |       - s3:ListBucket
43 |     Resource: 'arn:aws:s3:::my-source-bucket'
44 |   - Effect: Allow
45 |     Action:
46 |       - s3:GetObject
47 |     Resource: 'arn:aws:s3:::my-source-bucket/*'
48 |   # Target S3 bucket permissions
49 |   - Effect: Allow
50 |     Action:
51 |       - s3:ListBucket
52 |       - s3:ListBucketMultipartUploads
53 |     Resource: 'arn:aws:s3:::my-target-bucket'
54 |   - Effect: Allow
55 |     Action:
56 |       - s3:PutObject
57 |       - s3:AbortMultipartUpload
58 |       - s3:ListMultipartUploadParts
59 |     Resource: 'arn:aws:s3:::my-target-bucket/*'
60 | ```
61 | 
62 | A query for this use case would look like this:
63 | 
64 | ```sql
65 | COPY (SELECT * FROM parquet_scan('s3://my-source-bucket/input/*.parquet', HIVE_PARTITIONING = 1)) TO 's3://my-target-bucket/output' (FORMAT PARQUET, PARTITION_BY (column1, column2, column3), ALLOW_OVERWRITE TRUE);
66 | ```
67 | 
68 | ## Deployment
69 | After you've cloned this repository to your local machine and cd'ed into its directory, the application can be deployed like this (don't forget an `npm i` to install the dependencies!):
70 | 
71 | ```bash
72 | $ sls deploy
73 | ```
74 | 
75 | This will deploy the stack to the default AWS region `us-east-1`. In case you want to deploy the stack to a different region, you can specify a `--region` argument:
76 | 
77 | ```bash
78 | $ sls deploy --region eu-central-1
79 | ```
80 | 
81 | The deployment should take 2-3 minutes.
82 | 
83 | ## Checks and manual triggering
84 | You can [manually invoke](https://www.serverless.com/framework/docs/providers/aws/cli-reference/invoke) the deployed Lambda function by running
85 | 
86 | ```bash
87 | $ sls invoke -f repartitionData
88 | ```
89 | 
90 | After that, you can [check the generated CloudWatch logs](https://www.serverless.com/framework/docs/providers/aws/cli-reference/logs) by issuing
91 | 
92 | ```bash
93 | $ sls logs -f repartitionData
94 | ```
95 | 
96 | If you don't see any `DUCKDB_NODEJS_ERROR` in the logs, everything ran successfully, and you can have a look at your S3 bucket for the newly generated parquet files.
97 | 
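Since the example query uses `PARTITION_BY (column1, column2, column3)`, DuckDB writes Hive-style partition folders under the target prefix, so the result should look roughly like this (an illustrative layout, assuming the example bucket and column names from above):

```
s3://my-source-bucket/output/
└── column1=a/
    └── column2=b/
        └── column3=c/
            └── data_0.parquet
```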
98 | ## Costs
99 | Using this repository will generate costs in your AWS account (e.g. for Lambda invocations, S3 requests and CloudWatch logs). Please refer to the AWS pricing docs before deploying and running it.
100 | 
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "parquet-repartitioner",
3 |   "version": "0.1.0",
4 |   "description": "An AWS Lambda function for repartitioning parquet files in S3 via DuckDB queries",
5 |   "scripts": {
6 |     "cfn-lint": "cfn-lint .serverless/cloudformation-template-update-stack.json",
7 |     "package": "sls package",
8 |     "test": "jest",
9 |     "qa": "npm run package && npm run cfn-lint"
10 |   },
11 |   "repository": {
12 |     "type": "git",
13 |     "url": "git@github.com:tobilg/parquet-repartitioner.git"
14 |   },
15 |   "author": {
16 |     "name": "TobiLG",
17 |     "email": "tobilg@gmail.com",
18 |     "url": "https://github.com/tobilg"
19 |   },
20 |   "license": "MIT",
21 |   "bugs": {
22 |     "url": "https://github.com/tobilg/parquet-repartitioner/issues"
23 |   },
24 |   "homepage": "https://github.com/tobilg/parquet-repartitioner#readme",
25 |   "devDependencies": {
26 |     "esbuild": "^0.24.0",
27 |     "serverless-esbuild": "^1.54.3",
28 |     "serverless-iam-roles-per-function": "next",
29 |     "serverless-prune-plugin": "^2.0.2"
30 |   },
31 |   "dependencies": {
32 |     "aws-embedded-metrics": "^4.2.0",
33 |     "bunyan": "^1.8.15"
34 |   },
35 |   "jest": {
36 |     "transform": {
37 |       "^.+\\.js?$": "esbuild-jest"
38 |     }
39 |   }
40 | }
41 | 
--------------------------------------------------------------------------------
/serverless.yml:
--------------------------------------------------------------------------------
1 | service: parquet-repartitioner
2 | 
3 | frameworkVersion: '3'
4 | 
5 | plugins:
6 |   - serverless-iam-roles-per-function
7 |   - serverless-prune-plugin
8 |   - serverless-esbuild
9 | 
10 | custom:
11 | 
12 |   s3:
13 |     # Hint: Make sure the bucket is in the same region as the Lambda function,
14 |     # or you need to manually overwrite the region via the CUSTOM_AWS_REGION env var
15 |     # See https://aws.amazon.com/s3/faqs/
16 | 
17 |     # TODO: Change to real bucket name
18 |     bucketName: 'my-source-bucket'
19 | 
20 |   # esbuild settings
21 |   esbuild:
22 |     bundle: true
23 |     minify: false
24 |     exclude:
25 |       - duckdb
26 | 
27 |   # Prune plugin
28 |   prune:
29 |     automatic: true
30 |     number: 3
31 | 
32 | provider:
33 |   name: aws
34 |   runtime: nodejs18.x
35 |   region: ${opt:region, 'us-east-1'}
36 |   stage: ${opt:stage, 'prd'}
37 |   logRetentionInDays: 7
38 |   environment:
39 |     AWS_NODEJS_CONNECTION_REUSE_ENABLED: '1' # Enable HTTP keep-alive connections for the AWS SDK
40 |     STAGE: '${self:provider.stage}'
41 |     LOG_LEVEL: 'debug'
42 | 
43 | functions:
44 | 
45 |   repartitionData:
46 |     handler: src/repartitionData.handler
47 |     # TODO: Optionally configure the Lambda memory size
48 |     memorySize: 10240
49 |     # TODO: Optionally set the Lambda timeout (900sec / 15min is the maximum)
50 |     timeout: 900
51 |     iamRoleStatements:
52 |       # Must have list permission
53 |       - Effect: Allow
54 |         Action:
55 |           - s3:ListBucket
56 |           - s3:ListBucketMultipartUploads
57 |         Resource: 'arn:aws:s3:::${self:custom.s3.bucketName}'
58 |       # For multipart upload see
59 |       # https://docs.aws.amazon.com/AmazonS3/latest/userguide/mpuoverview.html#mpuAndPermissions
60 |       - Effect: Allow
61 |         Action:
62 |           - s3:GetObject
63 |           - s3:PutObject
64 |           - s3:AbortMultipartUpload
65 |           - s3:ListMultipartUploadParts
66 |         Resource: 'arn:aws:s3:::${self:custom.s3.bucketName}/*'
67 |     layers:
68 |       # Use the public DuckDB layer from https://github.com/tobilg/duckdb-nodejs-layer
69 |       - 'arn:aws:lambda:${self:provider.region}:041475135427:layer:duckdb-nodejs-x86:16'
70 |     environment:
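      # The value below is the function's memorySize in MB; the handler converts
      # it to whole GB before applying it as DuckDB's memory_limit setting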
71 |       DUCKDB_MEMORY_LIMIT: ${self:functions.repartitionData.memorySize}
72 |       # TODO: Optionally set the max thread limit (on Lambda, this is set automatically by the amount of memory the function has assigned),
73 |       # but with this setting you can influence how many files are written per partition. If you set a lower thread count than available,
74 |       # this means that the computation will not use all available resources!
75 |       # DUCKDB_THREADS: 2 # Example
76 |       # TODO: Write your repartitioning query below
77 |       REPARTITION_QUERY: COPY (SELECT * FROM parquet_scan('s3://${self:custom.s3.bucketName}/input/*.parquet', HIVE_PARTITIONING = 1)) TO 's3://${self:custom.s3.bucketName}/output' (FORMAT PARQUET, PARTITION_BY (column1, column2, column3), ALLOW_OVERWRITE TRUE);
78 |       # TODO: If you want to query an S3 bucket in another region than the Lambda function is deployed to
79 |       # CUSTOM_AWS_REGION: 'eu-central-1' # Example
80 |     events:
81 |       - schedule:
82 |           # TODO: Change schedule here if necessary
83 |           # See https://www.serverless.com/framework/docs/providers/aws/events/schedule for details
84 |           rate: rate(24 hours)
85 | 
86 | package:
87 |   individually: true
--------------------------------------------------------------------------------
/src/repartitionData.js:
--------------------------------------------------------------------------------
1 | import DuckDB from 'duckdb';
2 | import { metricScope, Unit } from 'aws-embedded-metrics';
3 | import Logger from './utils/logger';
4 | 
5 | // Instantiate logger
6 | const logger = new Logger();
7 | 
8 | // Instantiate DuckDB
9 | const duckDB = new DuckDB.Database(':memory:');
10 | 
11 | // Create connection
12 | const connection = duckDB.connect();
13 | 
14 | // Store initialization
15 | let isInitialized = false;
16 | 
17 | // Store AWS region
18 | let region;
19 | 
20 | // Promisify query method
21 | const query = (sql) => {
22 |   return new Promise((resolve, reject) => {
23 |     connection.all(sql, (err, res) => {
24 |       if (err) return reject(err);
25 |       resolve(res);
26 |     })
27 |   })
28 | }
29 | 
30 | const {
31 |   AWS_ACCESS_KEY_ID,
32 |   AWS_SECRET_ACCESS_KEY,
33 |   AWS_SESSION_TOKEN,
34 |   AWS_REGION,
35 |   DUCKDB_MEMORY_LIMIT,
36 |   DUCKDB_THREADS,
37 |   CUSTOM_AWS_REGION,
38 |   REPARTITION_QUERY,
39 | } = process.env;
40 | 
41 | // eslint-disable-next-line import/prefer-default-export
42 | export const handler = metricScope(metrics => async (event, context) => {
43 |   // Setup logger
44 |   const requestLogger = logger.child({ requestId: context.awsRequestId });
45 |   requestLogger.debug({ event, context });
46 | 
47 |   // Setup metrics
48 |   metrics.putDimensions({ Service: 'QueryService' });
49 |   metrics.setProperty('RequestId', context.awsRequestId);
50 | 
51 |   // Assign AWS region for query
52 |   if (CUSTOM_AWS_REGION) {
53 |     region = CUSTOM_AWS_REGION;
54 |   } else {
55 |     region = AWS_REGION;
56 |   }
57 | 
58 |   try {
59 |     // Check if DuckDB has been initialized
60 |     if (!isInitialized) {
61 |       const initialSetupStartTimestamp = new Date().getTime();
62 | 
63 |       // Set home directory
64 |       await query(`SET home_directory='/tmp';`);
65 |       // Load the httpfs extension for S3 access
66 |       await query(`INSTALL httpfs;`);
67 |       await query(`LOAD httpfs;`);
68 |       // New speedup option, see https://github.com/duckdb/duckdb/pull/5405
69 |       await query(`SET enable_http_metadata_cache=true;`);
70 |       // Set memory limit (DUCKDB_MEMORY_LIMIT holds the Lambda memorySize in MB, converted to GB here)
71 |       await query(`SET memory_limit='${parseInt((DUCKDB_MEMORY_LIMIT/1024).toFixed(0))}GB';`);
72 |       // Set thread count
73 |       if (DUCKDB_THREADS && DUCKDB_THREADS >= 1 && DUCKDB_THREADS <= 6) {
74 |         await query(`SET threads TO ${DUCKDB_THREADS};`);
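        // Hint: DuckDB can write one file per thread and partition, so capping
        // the thread count also caps the number of files generated per partition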
75 |       }
76 | 
77 |       requestLogger.debug({ message: 'Initial setup done!' });
78 |       metrics.putMetric('InitialSetupDuration', (new Date().getTime() - initialSetupStartTimestamp), Unit.Milliseconds);
79 | 
80 |       const awsSetupStartTimestamp = new Date().getTime();
81 | 
82 |       // Set AWS credentials
83 |       // See https://docs.aws.amazon.com/lambda/latest/dg/configuration-envvars.html#configuration-envvars-runtime
84 |       await query(`SET s3_region='${region}';`);
85 |       await query(`SET s3_access_key_id='${AWS_ACCESS_KEY_ID}';`);
86 |       await query(`SET s3_secret_access_key='${AWS_SECRET_ACCESS_KEY}';`);
87 |       await query(`SET s3_session_token='${AWS_SESSION_TOKEN}';`);
88 | 
89 |       requestLogger.debug({ message: 'AWS setup done!' });
90 |       metrics.putMetric('AWSSetupDuration', (new Date().getTime() - awsSetupStartTimestamp), Unit.Milliseconds);
91 | 
92 |       // Store initialization
93 |       isInitialized = true;
94 |     }
95 | 
96 |     // Track query start timestamp
97 |     const queryStartTimestamp = new Date().getTime();
98 | 
99 |     // Run query
100 |     const queryResult = await query(REPARTITION_QUERY);
101 |     requestLogger.debug({ queryResult });
102 | 
103 |     metrics.putMetric('QueryDuration', (new Date().getTime() - queryStartTimestamp), Unit.Milliseconds);
104 | 
105 |     return;
106 |   } catch (err) {
107 |     requestLogger.error(err);
108 |     return err;
109 |   }
110 | })
--------------------------------------------------------------------------------
/src/utils/logger.js:
--------------------------------------------------------------------------------
1 | import bunyan from 'bunyan';
2 | 
3 | let loggerInstance = null;
4 | 
5 | export default class Logger {
6 |   constructor (options={}) {
7 |     this.level = options.level || process.env.LOG_LEVEL || 'info';
8 |     this.name = options.name || `parquet-repartitioner-logger`;
9 |     return this.getLogger();
10 |   }
11 | 
12 |   getLogger() {
13 |     if (!loggerInstance) {
14 |       loggerInstance = bunyan.createLogger({
15 |         name: this.name,
16 |         level: this.level,
17 |       });
18 |     }
19 |     return loggerInstance;
20 |   }
21 | }
22 | 
--------------------------------------------------------------------------------
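Because the constructor returns `this.getLogger()`, every `new Logger()` call evaluates to the same cached bunyan instance. A quick usage sketch to illustrate the singleton behavior (hypothetical code, not a file from this repo):

```javascript
import Logger from './utils/logger';

// The first construction creates and caches the bunyan logger...
const first = new Logger({ level: 'debug' });
// ...subsequent constructions return the cached instance; their options are ignored
const second = new Logger({ level: 'warn' });

console.log(first === second); // true
```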