├── .gitignore
├── LICENSE
├── README.md
├── package-lock.json
├── package.json
├── serverless.yml
└── src
    ├── repartitionData.js
    └── utils
        └── logger.js

/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | lerna-debug.log*
8 | 
9 | # Diagnostic reports (https://nodejs.org/api/report.html)
10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
11 | 
12 | # Runtime data
13 | pids
14 | *.pid
15 | *.seed
16 | *.pid.lock
17 | 
18 | # Directory for instrumented libs generated by jscoverage/JSCover
19 | lib-cov
20 | 
21 | # Coverage directory used by tools like istanbul
22 | coverage
23 | *.lcov
24 | 
25 | # nyc test coverage
26 | .nyc_output
27 | 
28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
29 | .grunt
30 | 
31 | # Bower dependency directory (https://bower.io/)
32 | bower_components
33 | 
34 | # node-waf configuration
35 | .lock-wscript
36 | 
37 | # Compiled binary addons (https://nodejs.org/api/addons.html)
38 | build/Release
39 | 
40 | # Dependency directories
41 | node_modules/
42 | jspm_packages/
43 | 
44 | # TypeScript v1 declaration files
45 | typings/
46 | 
47 | # TypeScript cache
48 | *.tsbuildinfo
49 | 
50 | # Optional npm cache directory
51 | .npm
52 | 
53 | # Optional eslint cache
54 | .eslintcache
55 | 
56 | # Microbundle cache
57 | .rpt2_cache/
58 | .rts2_cache_cjs/
59 | .rts2_cache_es/
60 | .rts2_cache_umd/
61 | 
62 | # Optional REPL history
63 | .node_repl_history
64 | 
65 | # Output of 'npm pack'
66 | *.tgz
67 | 
68 | # Yarn Integrity file
69 | .yarn-integrity
70 | 
71 | # dotenv environment variables file
72 | .env
73 | .env.test
74 | 
75 | # parcel-bundler cache (https://parceljs.org/)
76 | .cache
77 | 
78 | # Next.js build output
79 | .next
80 | 
81 | # Nuxt.js build / generate output
82 | .nuxt
83 | dist
84 | 
85 | # Gatsby files
86 | .cache/
87 | # Comment in the public line in if your project uses Gatsby and *not* Next.js
88 | # https://nextjs.org/blog/next-9-1#public-directory-support
89 | # public
90 | 
91 | # vuepress build output
92 | .vuepress/dist
93 | 
94 | # Serverless directories
95 | .serverless/
96 | 
97 | # FuseBox cache
98 | .fusebox/
99 | 
100 | # DynamoDB Local files
101 | .dynamodb/
102 | 
103 | # TernJS port file
104 | .tern-port
105 | 
106 | .serverless
107 | .esbuild
108 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Tobi
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # serverless-parquet-repartitioner
2 | An AWS Lambda function for repartitioning parquet files in S3 via DuckDB queries.
3 | 
4 | ## Requirements
5 | You'll need a current v3 installation of the [Serverless Framework](https://serverless.com) on the machine you're planning to deploy the application from.
6 | 
7 | Also, you'll have to set up your AWS credentials according to the [Serverless docs](https://www.serverless.com/framework/docs/providers/aws/guide/credentials/).
8 | 
9 | ### Install dependencies
10 | After cloning the repo, you'll need to install the dependencies via
11 | 
12 | ```bash
13 | $ npm i
14 | ```
15 | 
16 | ## Configuration
17 | You can customize the configuration of the stack by setting a few configuration values. Open up the [serverless.yml](serverless.yml) file, and search for `TODO` in your IDE. This will point you to the places you need to update according to your needs.
18 | 
19 | ### Mandatory configuration settings
20 | 
21 | * [S3 bucket name](serverless.yml#L18): You need to use the S3 bucket where the data you want to repartition resides (e.g. `my-source-bucket`)
22 | * [Custom repartitioning query](serverless.yml#L77): You can write flexible repartitioning queries in DuckDB syntax. Have a look at the examples in the [httpfs extension docs](https://duckdb.org/docs/extensions/httpfs). You **need** to update this, as the template only uses example values!
23 | 
24 | ### Optional configuration settings
25 | 
26 | * [S3 region](serverless.yml#L79): The AWS region your S3 bucket is deployed to (if different from the region the Lambda function is deployed to)
27 | * [The schedule](serverless.yml#L84): The schedule on which the Lambda function is run. Have a look at the [Serverless Framework docs](https://www.serverless.com/framework/docs/providers/aws/events/schedule) to find out what the potential settings are.
28 | * [DuckDB memory limit](serverless.yml#L48): The DuckDB memory limit is derived automatically from the function's memory setting
29 | * [DuckDB threads count](serverless.yml#L75): Optionally set the maximum thread count (on Lambda, this is otherwise derived automatically from the amount of memory the function has assigned). With this setting you can influence how many files are written per partition. If you set a lower thread count than is available, the computation will not use all available resources; that's the trade-off for controlling the number of generated files. Ideally, adjust the amount of memory you assign to the Lambda function instead (see the sketch below this list).
30 | * [Lambda timeout](serverless.yml#L50): The maximum time a Lambda function can run is currently 15min / 900sec. This means that if your query takes longer than that, it will be terminated by the underlying Firecracker engine.
31 | 
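For instance, pinning DuckDB to two threads on a 10 GB function with a daily schedule could look like this in the `functions` section of [serverless.yml](serverless.yml). This is a minimal sketch built from this template's own example values, not a drop-in replacement for the full file:

```yaml
functions:
  repartitionData:
    handler: src/repartitionData.handler
    # 10 GB of memory; this value also drives the DuckDB memory limit
    memorySize: 10240
    # 900 seconds is the current Lambda maximum
    timeout: 900
    environment:
      # Fewer threads than available means fewer files written per partition
      DUCKDB_THREADS: 2
    events:
      - schedule:
          rate: rate(24 hours)
```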
32 | ### Using different source/target S3 buckets
33 | If you're planning to use different S3 buckets as source and target for the data repartitioning, you need to adapt the `iamRoleStatements` settings of the function.
34 | 
35 | Here's an example with minimal privileges:
36 | 
37 | ```yaml
38 | iamRoleStatements:
39 |   # Source S3 bucket permissions
40 |   - Effect: Allow
41 |     Action:
42 |       - s3:ListBucket
43 |     Resource: 'arn:aws:s3:::my-source-bucket'
44 |   - Effect: Allow
45 |     Action:
46 |       - s3:GetObject
47 |     Resource: 'arn:aws:s3:::my-source-bucket/*'
48 |   # Target S3 bucket permissions
49 |   - Effect: Allow
50 |     Action:
51 |       - s3:ListBucket
52 |       - s3:ListBucketMultipartUploads
53 |     Resource: 'arn:aws:s3:::my-target-bucket'
54 |   - Effect: Allow
55 |     Action:
56 |       - s3:PutObject
57 |       - s3:AbortMultipartUpload
58 |       - s3:ListMultipartUploadParts
59 |     Resource: 'arn:aws:s3:::my-target-bucket/*'
60 | ```
61 | 
62 | A query for this use case would look like this:
63 | 
64 | ```sql
65 | COPY (SELECT * FROM parquet_scan('s3://my-source-bucket/input/*.parquet', HIVE_PARTITIONING = 1)) TO 's3://my-target-bucket/output' (FORMAT PARQUET, PARTITION_BY (column1, column2, column3), ALLOW_OVERWRITE TRUE);
66 | ```
67 | 
68 | ## Deployment
69 | After you've cloned this repository to your local machine and cd'ed into its directory, the application can be deployed like this (don't forget an `npm i` to install the dependencies!):
70 | 
71 | ```bash
72 | $ sls deploy
73 | ```
74 | 
75 | This will deploy the stack to the default AWS region `us-east-1`. In case you want to deploy the stack to a different region, you can specify a `--region` argument:
76 | 
77 | ```bash
78 | $ sls deploy --region eu-central-1
79 | ```
80 | 
81 | The deployment should take 2-3 minutes.
82 | 
83 | ## Checks and manual triggering
84 | You can [manually invoke](https://www.serverless.com/framework/docs/providers/aws/cli-reference/invoke) the deployed Lambda function by running
85 | 
86 | ```bash
87 | $ sls invoke -f repartitionData
88 | ```
89 | 
90 | After that, you can [check the generated CloudWatch logs](https://www.serverless.com/framework/docs/providers/aws/cli-reference/logs) by issuing
91 | 
92 | ```bash
93 | $ sls logs -f repartitionData
94 | ```
95 | 
96 | If you don't see any `DUCKDB_NODEJS_ERROR` in the logs, everything ran successfully, and you can have a look at your S3 bucket for the newly generated parquet files.
97 | 
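Since the example query uses `PARTITION_BY (column1, column2, column3)`, DuckDB writes Hive-style partition folders under the target prefix, so the result should look roughly like this (an illustrative layout, assuming the example bucket and column names from above):

```
s3://my-source-bucket/output/
└── column1=a/
    └── column2=b/
        └── column3=c/
            └── data_0.parquet
```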
98 | ## Costs
99 | Using this repository will generate costs in your AWS account (e.g. for Lambda invocations, S3 requests and CloudWatch logs). Please refer to the AWS pricing docs before deploying and running it.
100 | 
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "parquet-repartitioner",
3 |   "version": "0.1.0",
4 |   "description": "An AWS Lambda function for repartitioning parquet files in S3 via DuckDB queries",
5 |   "scripts": {
6 |     "cfn-lint": "cfn-lint .serverless/cloudformation-template-update-stack.json",
7 |     "package": "sls package",
8 |     "test": "jest",
9 |     "qa": "npm run package && npm run cfn-lint"
10 |   },
11 |   "repository": {
12 |     "type": "git",
13 |     "url": "git@github.com:tobilg/parquet-repartitioner.git"
14 |   },
15 |   "author": {
16 |     "name": "TobiLG",
17 |     "email": "tobilg@gmail.com",
18 |     "url": "https://github.com/tobilg"
19 |   },
20 |   "license": "MIT",
21 |   "bugs": {
22 |     "url": "https://github.com/tobilg/parquet-repartitioner/issues"
23 |   },
24 |   "homepage": "https://github.com/tobilg/parquet-repartitioner#readme",
25 |   "devDependencies": {
26 |     "esbuild": "^0.24.0",
27 |     "serverless-esbuild": "^1.54.3",
28 |     "serverless-iam-roles-per-function": "next",
29 |     "serverless-prune-plugin": "^2.0.2"
30 |   },
31 |   "dependencies": {
32 |     "aws-embedded-metrics": "^4.2.0",
33 |     "bunyan": "^1.8.15"
34 |   },
35 |   "jest": {
36 |     "transform": {
37 |       "^.+\\.js?$": "esbuild-jest"
38 |     }
39 |   }
40 | }
41 | 
--------------------------------------------------------------------------------
/serverless.yml:
--------------------------------------------------------------------------------
1 | service: parquet-repartitioner
2 | 
3 | frameworkVersion: '3'
4 | 
5 | plugins:
6 |   - serverless-iam-roles-per-function
7 |   - serverless-prune-plugin
8 |   - serverless-esbuild
9 | 
10 | custom:
11 | 
12 |   s3:
13 |     # Hint: Make sure the bucket is in the same region as the Lambda function,
14 |     # or you need to manually overwrite the region via the CUSTOM_AWS_REGION env var
15 |     # See https://aws.amazon.com/s3/faqs/
16 | 
17 |     # TODO: Change to real bucket name
18 |     bucketName: 'my-source-bucket'
19 | 
20 |   # esbuild settings
21 |   esbuild:
22 |     bundle: true
23 |     minify: false
24 |     exclude:
25 |       - duckdb
26 | 
27 |   # Prune plugin
28 |   prune:
29 |     automatic: true
30 |     number: 3
31 | 
32 | provider:
33 |   name: aws
34 |   runtime: nodejs18.x
35 |   region: ${opt:region, 'us-east-1'}
36 |   stage: ${opt:stage, 'prd'}
37 |   logRetentionInDays: 7
38 |   environment:
39 |     AWS_NODEJS_CONNECTION_REUSE_ENABLED: '1' # Enable HTTP keep-alive connections for the AWS SDK
40 |     STAGE: '${self:provider.stage}'
41 |     LOG_LEVEL: 'debug'
42 | 
43 | functions:
44 | 
45 |   repartitionData:
46 |     handler: src/repartitionData.handler
47 |     # TODO: Optionally configure the Lambda memory size
48 |     memorySize: 10240
49 |     # TODO: Optionally set the Lambda timeout (900sec / 15min is the maximum)
50 |     timeout: 900
51 |     iamRoleStatements:
52 |       # Must have list permission
53 |       - Effect: Allow
54 |         Action:
55 |           - s3:ListBucket
56 |           - s3:ListBucketMultipartUploads
57 |         Resource: 'arn:aws:s3:::${self:custom.s3.bucketName}'
58 |       # For multipart upload see
59 |       # https://docs.aws.amazon.com/AmazonS3/latest/userguide/mpuoverview.html#mpuAndPermissions
60 |       - Effect: Allow
61 |         Action:
62 |           - s3:GetObject
63 |           - s3:PutObject
64 |           - s3:AbortMultipartUpload
65 |           - s3:ListMultipartUploadParts
66 |         Resource: 'arn:aws:s3:::${self:custom.s3.bucketName}/*'
67 |     layers:
68 |       # Use the public DuckDB layer from https://github.com/tobilg/duckdb-nodejs-layer
69 |       - 'arn:aws:lambda:${self:provider.region}:041475135427:layer:duckdb-nodejs-x86:16'
70 |     environment:
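      # The value below is the function's memorySize in MB; the handler converts
      # it to whole GB before applying it as DuckDB's memory_limit setting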
71 |       DUCKDB_MEMORY_LIMIT: ${self:functions.repartitionData.memorySize}
72 |       # TODO: Optionally set the max thread limit (on Lambda, this is set automatically by the amount of memory the function has assigned),
73 |       # but with this setting you can influence how many files are written per partition. If you set a lower thread count than available,
74 |       # this means that the computation will not use all available resources!
75 |       # DUCKDB_THREADS: 2 # Example
76 |       # TODO: Write your repartitioning query below
77 |       REPARTITION_QUERY: COPY (SELECT * FROM parquet_scan('s3://${self:custom.s3.bucketName}/input/*.parquet', HIVE_PARTITIONING = 1)) TO 's3://${self:custom.s3.bucketName}/output' (FORMAT PARQUET, PARTITION_BY (column1, column2, column3), ALLOW_OVERWRITE TRUE);
78 |       # TODO: If you want to query an S3 bucket in another region than the Lambda function is deployed to
79 |       # CUSTOM_AWS_REGION: 'eu-central-1' # Example
80 |     events:
81 |       - schedule:
82 |           # TODO: Change schedule here if necessary
83 |           # See https://www.serverless.com/framework/docs/providers/aws/events/schedule for details
84 |           rate: rate(24 hours)
85 | 
86 | package:
87 |   individually: true
--------------------------------------------------------------------------------
/src/repartitionData.js:
--------------------------------------------------------------------------------
1 | import DuckDB from 'duckdb';
2 | import { metricScope, Unit } from 'aws-embedded-metrics';
3 | import Logger from './utils/logger';
4 | 
5 | // Instantiate logger
6 | const logger = new Logger();
7 | 
8 | // Instantiate DuckDB
9 | const duckDB = new DuckDB.Database(':memory:');
10 | 
11 | // Create connection
12 | const connection = duckDB.connect();
13 | 
14 | // Store initialization
15 | let isInitialized = false;
16 | 
17 | // Store AWS region
18 | let region;
19 | 
20 | // Promisify query method
21 | const query = (sql) => {
22 |   return new Promise((resolve, reject) => {
23 |     connection.all(sql, (err, res) => {
24 |       if (err) return reject(err);
25 |       resolve(res);
26 |     })
27 |   })
28 | }
29 | 
30 | const {
31 |   AWS_ACCESS_KEY_ID,
32 |   AWS_SECRET_ACCESS_KEY,
33 |   AWS_SESSION_TOKEN,
34 |   AWS_REGION,
35 |   DUCKDB_MEMORY_LIMIT,
36 |   DUCKDB_THREADS,
37 |   CUSTOM_AWS_REGION,
38 |   REPARTITION_QUERY,
39 | } = process.env;
40 | 
41 | // eslint-disable-next-line import/prefer-default-export
42 | export const handler = metricScope(metrics => async (event, context) => {
43 |   // Setup logger
44 |   const requestLogger = logger.child({ requestId: context.awsRequestId });
45 |   requestLogger.debug({ event, context });
46 | 
47 |   // Setup metrics
48 |   metrics.putDimensions({ Service: 'QueryService' });
49 |   metrics.setProperty('RequestId', context.awsRequestId);
50 | 
51 |   // Assign AWS region for query
52 |   if (CUSTOM_AWS_REGION) {
53 |     region = CUSTOM_AWS_REGION;
54 |   } else {
55 |     region = AWS_REGION;
56 |   }
57 | 
58 |   try {
59 |     // Check if DuckDB has been initialized
60 |     if (!isInitialized) {
61 |       const initialSetupStartTimestamp = new Date().getTime();
62 | 
63 |       // Set home directory
64 |       await query(`SET home_directory='/tmp';`);
65 |       // Load the httpfs extension for S3 access
66 |       await query(`INSTALL httpfs;`);
67 |       await query(`LOAD httpfs;`);
68 |       // New speedup option, see https://github.com/duckdb/duckdb/pull/5405
69 |       await query(`SET enable_http_metadata_cache=true;`);
70 |       // Set memory limit (DUCKDB_MEMORY_LIMIT holds the Lambda memorySize in MB, converted to GB here)
71 |       await query(`SET memory_limit='${parseInt((DUCKDB_MEMORY_LIMIT/1024).toFixed(0))}GB';`);
72 |       // Set thread count
73 |       if (DUCKDB_THREADS && DUCKDB_THREADS >= 1 && DUCKDB_THREADS <= 6) {
74 |         await query(`SET threads TO ${DUCKDB_THREADS};`);
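        // Hint: DuckDB can write one file per thread and partition, so capping
        // the thread count also caps the number of files generated per partition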
75 |       }
76 | 
77 |       requestLogger.debug({ message: 'Initial setup done!' });
78 |       metrics.putMetric('InitialSetupDuration', (new Date().getTime() - initialSetupStartTimestamp), Unit.Milliseconds);
79 | 
80 |       const awsSetupStartTimestamp = new Date().getTime();
81 | 
82 |       // Set AWS credentials
83 |       // See https://docs.aws.amazon.com/lambda/latest/dg/configuration-envvars.html#configuration-envvars-runtime
84 |       await query(`SET s3_region='${region}';`);
85 |       await query(`SET s3_access_key_id='${AWS_ACCESS_KEY_ID}';`);
86 |       await query(`SET s3_secret_access_key='${AWS_SECRET_ACCESS_KEY}';`);
87 |       await query(`SET s3_session_token='${AWS_SESSION_TOKEN}';`);
88 | 
89 |       requestLogger.debug({ message: 'AWS setup done!' });
90 |       metrics.putMetric('AWSSetupDuration', (new Date().getTime() - awsSetupStartTimestamp), Unit.Milliseconds);
91 | 
92 |       // Store initialization
93 |       isInitialized = true;
94 |     }
95 | 
96 |     // Track query start timestamp
97 |     const queryStartTimestamp = new Date().getTime();
98 | 
99 |     // Run query
100 |     const queryResult = await query(REPARTITION_QUERY);
101 |     requestLogger.debug({ queryResult });
102 | 
103 |     metrics.putMetric('QueryDuration', (new Date().getTime() - queryStartTimestamp), Unit.Milliseconds);
104 | 
105 |     return;
106 |   } catch (err) {
107 |     requestLogger.error(err);
108 |     return err;
109 |   }
110 | })
--------------------------------------------------------------------------------
/src/utils/logger.js:
--------------------------------------------------------------------------------
1 | import bunyan from 'bunyan';
2 | 
3 | let loggerInstance = null;
4 | 
5 | export default class Logger {
6 |   constructor (options={}) {
7 |     this.level = options.level || process.env.LOG_LEVEL || 'info';
8 |     this.name = options.name || `parquet-repartitioner-logger`;
9 |     return this.getLogger();
10 |   }
11 | 
12 |   getLogger() {
13 |     if (!loggerInstance) {
14 |       loggerInstance = bunyan.createLogger({
15 |         name: this.name,
16 |         level: this.level,
17 |       });
18 |     }
19 |     return loggerInstance;
20 |   }
21 | }
22 | 
--------------------------------------------------------------------------------
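Because the constructor returns `this.getLogger()`, every `new Logger()` call evaluates to the same cached bunyan instance. A quick usage sketch to illustrate the singleton behavior (hypothetical code, not a file from this repo):

```javascript
import Logger from './utils/logger';

// The first construction creates and caches the bunyan logger...
const first = new Logger({ level: 'debug' });
// ...subsequent constructions return the cached instance; their options are ignored
const second = new Logger({ level: 'warn' });

console.log(first === second); // true
```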