├── .circleci └── config.yml ├── .editorconfig ├── .eslintignore ├── .eslintrc ├── .gitattributes ├── .gitignore ├── .husky ├── _ │ └── husky.sh ├── post-commit └── pre-commit ├── .npmrc ├── .prettierignore ├── .prettierrc.json ├── jest-dynamodb-config.js ├── jest.config.js ├── license ├── package.json ├── readme.md ├── renovate.json ├── src ├── blocker.test.ts ├── blocker.ts ├── clients.ts ├── ddb.ts ├── index.ts ├── parallel-scan-stream.test.ts ├── parallel-scan-stream.ts ├── parallel-scan.test.ts └── parallel-scan.ts ├── tsconfig.json └── wallaby.config.js /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | orbs: 2 | node: circleci/node@5.1.0 3 | aws-cli: circleci/aws-cli@3.2.0 4 | 5 | version: 2.1 6 | 7 | parameters: 8 | node_version: 9 | type: string 10 | default: '16.15.1-browsers' 11 | 12 | commands: 13 | install_deps: 14 | steps: 15 | - node/install-packages: 16 | pkg-manager: yarn 17 | cache-version: v2-all 18 | cache-only-lockfile: true 19 | app-dir: ~/repo 20 | override-ci-command: yarn install --pure-lockfile --no-progress 21 | 22 | jobs: 23 | build: 24 | executor: 25 | name: node/default 26 | tag: << pipeline.parameters.node_version >> 27 | environment: 28 | AWS_ACCESS_KEY_ID: 'fakeMyKeyId' 29 | AWS_SECRET_ACCESS_KEY: 'fakeMySecretKey' 30 | working_directory: ~/repo 31 | steps: 32 | - checkout 33 | - install_deps 34 | - run: yarn test 35 | - run: yarn type-check 36 | - run: yarn lint:ci 37 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | charset = utf-8 6 | trim_trailing_whitespace = true 7 | insert_final_newline = true 8 | indent_style = space 9 | indent_size = 2 10 | -------------------------------------------------------------------------------- /.eslintignore: 
-------------------------------------------------------------------------------- 1 | lib/ 2 | renovate.json 3 | tsconfig.json 4 | -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "extends": ["@shelf/eslint-config/typescript"] 4 | } 5 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | *.js text eol=lf 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | coverage/ 3 | node_modules/ 4 | dist/ 5 | lib/ 6 | temp 7 | yarn.lock 8 | *.log 9 | .DS_Store 10 | !.husky/_/husky.sh 11 | *.draft.ts 12 | -------------------------------------------------------------------------------- /.husky/_/husky.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ -z "$husky_skip_init" ]; then 3 | debug () { 4 | if [ "$HUSKY_DEBUG" = "1" ]; then 5 | echo "husky (debug) - $1" 6 | fi 7 | } 8 | 9 | readonly hook_name="$(basename "$0")" 10 | debug "starting $hook_name..." 11 | 12 | if [ "$HUSKY" = "0" ]; then 13 | debug "HUSKY env variable is set to 0, skipping hook" 14 | exit 0 15 | fi 16 | 17 | if [ -f ~/.huskyrc ]; then 18 | debug "sourcing ~/.huskyrc" 19 | . ~/.huskyrc 20 | fi 21 | 22 | export readonly husky_skip_init=1 23 | sh -e "$0" "$@" 24 | exitCode="$?" 
25 | 26 | if [ $exitCode != 0 ]; then 27 | echo "husky - $hook_name hook exited with code $exitCode (error)" 28 | fi 29 | 30 | exit $exitCode 31 | fi 32 | -------------------------------------------------------------------------------- /.husky/post-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | . "$(dirname "$0")/_/husky.sh" 3 | git update-index --again 4 | -------------------------------------------------------------------------------- /.husky/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | . "$(dirname "$0")/_/husky.sh" 3 | yarn lint-staged 4 | -------------------------------------------------------------------------------- /.npmrc: -------------------------------------------------------------------------------- 1 | package-lock=false 2 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | package.json 2 | -------------------------------------------------------------------------------- /.prettierrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "printWidth": 100, 3 | "singleQuote": true, 4 | "bracketSpacing": false 5 | } 6 | -------------------------------------------------------------------------------- /jest-dynamodb-config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | tables: [ 3 | { 4 | TableName: `files`, 5 | KeySchema: [{AttributeName: 'id', KeyType: 'HASH'}], 6 | AttributeDefinitions: [{AttributeName: 'id', AttributeType: 'S'}], 7 | ProvisionedThroughput: {ReadCapacityUnits: 1, WriteCapacityUnits: 1}, 8 | }, 9 | ], 10 | }; 11 | -------------------------------------------------------------------------------- /jest.config.js: 
-------------------------------------------------------------------------------- 1 | module.exports = { 2 | preset: '@shelf/jest-dynamodb', 3 | }; 4 | -------------------------------------------------------------------------------- /license: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Gemshelf Inc. (shelf.io) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
10 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@shelf/dynamodb-parallel-scan", 3 | "version": "3.9.2", 4 | "description": "Scan large DynamoDB tables faster with parallelism", 5 | "keywords": [ 6 | "aws", 7 | "dynamodb", 8 | "parallel-scan" 9 | ], 10 | "repository": "shelfio/dynamodb-parallel-scan", 11 | "license": "MIT", 12 | "author": { 13 | "name": "Vlad Holubiev", 14 | "email": "vlad@shelf.io", 15 | "url": "shelf.io" 16 | }, 17 | "main": "lib/index.js", 18 | "types": "lib/index.d.ts", 19 | "files": [ 20 | "lib" 21 | ], 22 | "scripts": { 23 | "build": "rm -rf lib/ && yarn build:types && babel src --out-dir lib --ignore '**/*.test.ts' --extensions '.ts'", 24 | "build:types": "tsc --emitDeclarationOnly --declaration --isolatedModules false --declarationDir lib", 25 | "coverage": "jest --coverage", 26 | "lint": "eslint . --ext .js,.ts,.json --fix --quiet", 27 | "lint:ci": "eslint . 
--ext .js,.ts,.json --quiet", 28 | "prepack": "yarn build", 29 | "test": "export ENVIRONMENT=local && jest src --runInBand --forceExit", 30 | "type-check": "tsc --noEmit", 31 | "type-check:watch": "npm run type-check -- --watch" 32 | }, 33 | "lint-staged": { 34 | "*.{html,json,md,yml}": [ 35 | "prettier --write --ignore-path=./.eslintignore" 36 | ], 37 | "*.{ts,js}": [ 38 | "eslint --fix" 39 | ] 40 | }, 41 | "babel": { 42 | "extends": "@shelf/babel-config/backend" 43 | }, 44 | "prettier": "@shelf/prettier-config", 45 | "dependencies": { 46 | "debug": "4.3.4", 47 | "lodash.chunk": "4.2.0", 48 | "lodash.clonedeep": "4.5.0", 49 | "lodash.times": "4.3.2", 50 | "p-map": "4.0.0" 51 | }, 52 | "devDependencies": { 53 | "@aws-sdk/client-dynamodb": "3.538.0", 54 | "@aws-sdk/lib-dynamodb": "3.535.0", 55 | "@babel/cli": "7.26.4", 56 | "@babel/core": "7.26.0", 57 | "@shelf/babel-config": "2.0.1", 58 | "@shelf/eslint-config": "2.29.3", 59 | "@shelf/jest-dynamodb": "3.4.1", 60 | "@shelf/prettier-config": "1.0.0", 61 | "@shelf/tsconfig": "0.0.9", 62 | "@types/debug": "4.1.12", 63 | "@types/jest": "29.5.14", 64 | "@types/lodash.chunk": "4.2.9", 65 | "@types/lodash.clonedeep": "4.5.9", 66 | "@types/lodash.times": "4.3.9", 67 | "@types/node": "16", 68 | "eslint": "8.57.1", 69 | "husky": "8.0.3", 70 | "jest": "29.7.0", 71 | "lint-staged": "13.3.0", 72 | "prettier": "2.8.8", 73 | "typescript": "5.7.2" 74 | }, 75 | "peerDependencies": { 76 | "@aws-sdk/client-dynamodb": "3.x.x", 77 | "@aws-sdk/lib-dynamodb": "3.x.x" 78 | }, 79 | "engines": { 80 | "node": ">=16" 81 | }, 82 | "publishConfig": { 83 | "access": "public" 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # dynamodb-parallel-scan 
[![CircleCI](https://circleci.com/gh/shelfio/dynamodb-parallel-scan/tree/master.svg?style=svg)](https://circleci.com/gh/shelfio/dynamodb-parallel-scan/tree/master) ![](https://img.shields.io/badge/code_style-prettier-ff69b4.svg) [![npm (scoped)](https://img.shields.io/npm/v/@shelf/dynamodb-parallel-scan.svg)](https://www.npmjs.com/package/@shelf/dynamodb-parallel-scan) 2 | 3 | > Scan DynamoDB table concurrently (up to 1,000,000 segments), recursively read all items from every segment 4 | 5 | [A blog post going into details about this library.](https://vladholubiev.medium.com/how-to-scan-a-23-gb-dynamodb-table-in-1-minute-110730879e2b) 6 | 7 | ## Install 8 | 9 | ``` 10 | $ yarn add @shelf/dynamodb-parallel-scan 11 | ``` 12 | 13 | This library has 2 peer dependencies: 14 | 15 | - `@aws-sdk/client-dynamodb` 16 | - `@aws-sdk/lib-dynamodb` 17 | 18 | Make sure to install them alongside this library. 19 | 20 | ## Why this is better than a regular scan 21 | 22 | **Easily parallelize** scan requests to fetch all items from a table at once. 23 | This is useful when you need to scan a large table to find a small number of items that will fit the node.js memory. 24 | 25 | **Scan huge tables using async generator** or stream. 26 | And yes, it supports streams backpressure! 27 | Useful when you need to process a large number of items while you scan them. 28 | It allows receiving chunks of scanned items, wait until you process them, and then resume scanning when you're ready. 
29 | 30 | ## Usage 31 | 32 | ### Fetch everything at once 33 | 34 | ```js 35 | const {parallelScan} = require('@shelf/dynamodb-parallel-scan'); 36 | 37 | (async () => { 38 | const items = await parallelScan( 39 | { 40 | TableName: 'files', 41 | FilterExpression: 'attribute_exists(#fileSize)', 42 | ExpressionAttributeNames: { 43 | '#fileSize': 'fileSize', 44 | }, 45 | ProjectionExpression: 'fileSize', 46 | }, 47 | {concurrency: 1000} 48 | ); 49 | 50 | console.log(items); 51 | })(); 52 | ``` 53 | 54 | ### Use as async generator (or streams) 55 | 56 | Note: `highWaterMark` determines items count threshold, so Parallel Scan can fetch `concurrency` \* 1MB more data even after highWaterMark was reached. 57 | 58 | ```js 59 | const {parallelScanAsStream} = require('@shelf/dynamodb-parallel-scan'); 60 | 61 | (async () => { 62 | const stream = await parallelScanAsStream( 63 | { 64 | TableName: 'files', 65 | FilterExpression: 'attribute_exists(#fileSize)', 66 | ExpressionAttributeNames: { 67 | '#fileSize': 'fileSize', 68 | }, 69 | ProjectionExpression: 'fileSize', 70 | }, 71 | {concurrency: 1000, chunkSize: 10000, highWaterMark: 10000} 72 | ); 73 | 74 | for await (const items of stream) { 75 | console.log(items); // 10k items here 76 | } 77 | })(); 78 | ``` 79 | 80 | ## Read 81 | 82 | - [Taking Advantage of Parallel Scans](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/bp-query-scan.html) 83 | - [Working with Scans](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Scan.html) 84 | 85 | ![](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/images/ParallelScan.png) 86 | 87 | ## Publish 88 | 89 | ```sh 90 | $ git checkout master 91 | $ yarn version 92 | $ yarn publish 93 | $ git push origin master --tags 94 | ``` 95 | 96 | ## License 97 | 98 | MIT © [Shelf](https://shelf.io) 99 | -------------------------------------------------------------------------------- /renovate.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "extends": ["github>shelfio/renovate-config-public"], 3 | "labels": ["backend"], 4 | "ignoreDeps": [ 5 | "cimg/node" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /src/blocker.test.ts: -------------------------------------------------------------------------------- 1 | import {inspect} from 'util'; 2 | import {Blocker} from './blocker'; 3 | 4 | it('should return same promise after sequent block() calls', () => { 5 | const blocker = new Blocker(); 6 | 7 | blocker.block(); 8 | 9 | const oldPromise = blocker.get(); 10 | 11 | blocker.block(); 12 | 13 | const newPromise = blocker.get(); 14 | 15 | expect(newPromise === oldPromise).toBeTruthy(); 16 | }); 17 | 18 | it('should return pending promise after block()', () => { 19 | const blocker = new Blocker(); 20 | 21 | blocker.block(); 22 | 23 | const inspectedBlocker = inspect(blocker.get()); 24 | const isInspectedBlockerPending = inspectedBlocker.includes('pending'); 25 | 26 | expect(isInspectedBlockerPending).toBeTruthy(); 27 | }); 28 | 29 | it('should return resolved promise after unblock()', () => { 30 | const blocker = new Blocker(); 31 | 32 | blocker.unblock(); 33 | 34 | const inspectedBlocker = inspect(blocker.get()); 35 | const isInspectedBlockerResolved = inspectedBlocker.includes('undefined'); 36 | 37 | expect(isInspectedBlockerResolved).toBeTruthy(); 38 | }); 39 | 40 | it('should return resolved promise in default state', () => { 41 | const blocker = new Blocker(); 42 | 43 | const inspectedBlocker = inspect(blocker.get()); 44 | const isInspectedBlockerResolved = inspectedBlocker.includes('undefined'); 45 | 46 | expect(isInspectedBlockerResolved).toBeTruthy(); 47 | }); 48 | 49 | it('should be blocked after block() call', () => { 50 | const blocker = new Blocker(); 51 | 52 | blocker.block(); 53 | 54 | expect(blocker.isBlocked()).toBeTruthy(); 55 | }); 56 | 57 | it('should not 
/**
 * A re-armable gate used to pause asynchronous workers.
 *
 * While unblocked, `get()` returns an already-resolved promise, so `await blocker.get()`
 * continues immediately. After `block()`, `get()` returns a pending promise that is
 * resolved by the next `unblock()` call.
 */
export class Blocker {
  private _promise: Promise<void>;
  private _promiseResolver: (...args: any[]) => any;
  private _isBlocked: boolean;
  // Handle of the fallback timer armed by block(); undefined while no timer is pending.
  private _timer?: ReturnType<typeof setTimeout>;

  constructor() {
    this._promise = Promise.resolve();
    this._promiseResolver = () => {}; // eslint-disable-line @typescript-eslint/no-empty-function
    this._isBlocked = false;
  }

  /**
   * Switches the gate to the blocked state.
   * Calling it again while already blocked is a no-op, so waiters keep the same promise.
   */
  block(): void {
    if (this._isBlocked) {
      return;
    }

    this._promise = new Promise<void>(resolve => {
      this._promiseResolver = resolve;
      // 2147483647 ms is the largest delay setTimeout accepts (~24.8 days).
      // The timer resolves the promise if unblock() is never called.
      this._timer = setTimeout(resolve, 2147483647);
    }); //TODO: Implement endless promise

    this._isBlocked = true;
  }

  /**
   * Resolves the current promise and switches the gate to the unblocked state.
   * Also cancels the fallback timer so it no longer keeps the event loop alive
   * after the gate has been released.
   */
  unblock(): void {
    if (this._timer !== undefined) {
      clearTimeout(this._timer);
      this._timer = undefined;
    }

    this._promiseResolver();

    this._isBlocked = false;
  }

  /** Returns the promise workers should await before doing more work. */
  get(): Promise<void> {
    return this._promise;
  }

  /** Returns true between a block() call and the following unblock() call. */
  isBlocked(): boolean {
    return this._isBlocked;
  }
}
/**
 * Picks the credentials handed to the DynamoDB client.
 *
 * - A credentials object with at least one own key is returned as is.
 * - Otherwise, when `isTest` is truthy, fixed fake credentials are returned.
 * - Otherwise `undefined` is returned.
 */
const getCredentials = (credentials?: Credentials) => {
  const providedKeys = credentials ? Object.keys(credentials) : [];

  if (providedKeys.length > 0) {
    return credentials;
  }

  return isTest
    ? {
        accessKeyId: 'fakeMyKeyId',
        secretAccessKey: 'fakeSecretAccessKey',
      }
    : undefined;
};
/**
 * Writes `items` into `tableName` with a single batch-write request.
 *
 * Every item is wrapped in a `PutRequest` and the whole list is passed to `batchWrite`
 * in one call; the list is not split into smaller batches here.
 */
export function insertMany(
  {
    items,
    tableName,
  }: {
    items: any[];
    tableName: string;
  },
  docClient: DynamoDBDocumentClient
): Promise<BatchWriteCommandOutput> {
  const putRequests = [];

  for (const item of items) {
    putRequests.push({PutRequest: {Item: item}});
  }

  const requestItems: BatchWriteCommandInput['RequestItems'] = {[tableName]: putRequests};

  return batchWrite(requestItems, docClient);
}
'some-file-id-6', fileSize: 200, isLarge: false}, 22 | {id: 'some-file-id-7', isLarge: false}, 23 | {id: 'some-file-id-8', isLarge: false}, 24 | {id: 'some-file-id-9', fileSize: 300, isLarge: false}, 25 | {id: 'some-file-id-10', isLarge: false}, 26 | ]; 27 | 28 | beforeAll(async () => { 29 | const docClient = ddbv3DocClient(); 30 | await ddbHelpers.insertMany({items: files, tableName: 'files'}, docClient); 31 | }); 32 | 33 | it('should stream items with chunks of 2 with concurrency 1', async () => { 34 | const stream = await parallelScanAsStream( 35 | { 36 | TableName: 'files', 37 | FilterExpression: 'attribute_exists(#id) and #isLarge = :false', 38 | ExpressionAttributeNames: { 39 | '#id': 'id', 40 | '#isLarge': 'isLarge', 41 | }, 42 | ExpressionAttributeValues: { 43 | ':false': false, 44 | }, 45 | }, 46 | {concurrency: 1, chunkSize: 2} 47 | ); 48 | 49 | for await (const chunk of stream) { 50 | expect(chunk).toHaveLength(2); 51 | } 52 | }); 53 | 54 | it('should stream items with chunks of 2 with concurrency 5', async () => { 55 | const stream = await parallelScanAsStream( 56 | { 57 | TableName: 'files', 58 | FilterExpression: 'attribute_exists(#id) and #isLarge = :false', 59 | ExpressionAttributeNames: { 60 | '#id': 'id', 61 | '#isLarge': 'isLarge', 62 | }, 63 | ExpressionAttributeValues: { 64 | ':false': false, 65 | }, 66 | }, 67 | {concurrency: 5, chunkSize: 2} 68 | ); 69 | 70 | const allItems = []; 71 | 72 | for await (const chunk of stream) { 73 | allItems.push(...chunk); 74 | 75 | expect(chunk.length >= 1).toBeTruthy(); 76 | } 77 | 78 | expect(allItems).toHaveLength(10); 79 | }); 80 | 81 | it('should pause calling dynamodb after highWaterMark reached', async () => { 82 | const scanSpy = jest.spyOn(ddbHelpers, 'scan'); 83 | 84 | const megaByte = Buffer.alloc(1024 * 390); // Maximum allowed item size in ddb is 400KB 85 | const megaByteString = megaByte.toString(); 86 | 87 | const docClient = ddbv3DocClient(); 88 | await ddbHelpers.insertMany( 89 | { 90 | items: 
[ 91 | {id: 'some-big-file-id-1', isLarge: true, payload: megaByteString}, 92 | {id: 'some-big-file-id-2', isLarge: true, payload: megaByteString}, 93 | {id: 'some-big-file-id-3', isLarge: true, payload: megaByteString}, 94 | {id: 'some-big-file-id-4', isLarge: true, payload: megaByteString}, 95 | {id: 'some-big-file-id-5', isLarge: true, payload: megaByteString}, 96 | ], 97 | tableName: 'files', 98 | }, 99 | docClient 100 | ); 101 | 102 | const stream = await parallelScanAsStream( 103 | { 104 | TableName: 'files', 105 | FilterExpression: 'attribute_exists(#id) and #isLarge = :true', 106 | ExpressionAttributeNames: { 107 | '#id': 'id', 108 | '#isLarge': 'isLarge', 109 | }, 110 | ExpressionAttributeValues: { 111 | ':true': true, 112 | }, 113 | }, 114 | {concurrency: 1, chunkSize: 1, highWaterMark: 1} 115 | ); 116 | 117 | const scanCallsByIteration = []; 118 | for await (const _ of stream) { 119 | expect(_).not.toBeUndefined(); 120 | 121 | await delay(1000); 122 | 123 | scanCallsByIteration.push(scanSpy.mock.calls.length); 124 | } 125 | 126 | const scanCallsByIterationUniq = uniq(scanCallsByIteration); 127 | 128 | expect(scanCallsByIterationUniq).toEqual([1, 2]); 129 | }); 130 | }); 131 | -------------------------------------------------------------------------------- /src/parallel-scan-stream.ts: -------------------------------------------------------------------------------- 1 | import {Readable} from 'stream'; 2 | import cloneDeep from 'lodash.clonedeep'; 3 | import times from 'lodash.times'; 4 | import chunk from 'lodash.chunk'; 5 | import getDebugger from 'debug'; 6 | import type {DynamoDBDocument, ScanCommandInput} from '@aws-sdk/lib-dynamodb'; 7 | import type {ScanCommandOutput} from '@aws-sdk/lib-dynamodb'; 8 | import type {DynamoDBClient} from '@aws-sdk/client-dynamodb'; 9 | import type {Credentials} from './ddb'; 10 | import {getTableItemsCount, scan} from './ddb'; 11 | import {Blocker} from './blocker'; 12 | import {ddbv3Client} from './clients'; 13 | 14 | 
/**
 * Scans a DynamoDB table with `concurrency` parallel segments and returns an object-mode
 * Readable to which the segment workers push arrays of scanned items.
 *
 * @param scanParams - Scan input; `TableName` is required. `Segment` and `TotalSegments`
 *   are set per worker.
 * @param options.concurrency - Number of segments scanned in parallel.
 * @param options.chunkSize - Number of items per array pushed into the stream.
 * @param options.highWaterMark - Readable high-water mark; scanning pauses once it is reached.
 * @param options.credentials - Credentials used when no `client` is supplied.
 * @param options.client - Pre-configured client; a new one is created when omitted.
 * @returns The stream. It ends after every segment has been fully scanned, and is destroyed
 *   with the underlying error if any segment scan fails.
 */
export async function parallelScanAsStream(
  scanParams: ScanCommandInput,
  {
    concurrency,
    chunkSize,
    highWaterMark = Number.MAX_SAFE_INTEGER,
    credentials,
    client,
  }: {
    concurrency: number;
    chunkSize: number;
    highWaterMark?: number;
    credentials?: Credentials;
    client?: DynamoDBClient | DynamoDBDocument;
  }
): Promise<Readable> {
  const ddbClient = client ?? ddbv3Client(credentials);
  totalTableItemsCount = await getTableItemsCount(scanParams.TableName!, ddbClient);

  // The progress counters are module-level; start every scan from zero so that the
  // percentages in the debug output are not inflated by earlier invocations.
  totalScannedItemsCount = 0;
  totalFetchedItemsCount = 0;

  const segments: number[] = times(concurrency);

  const blocker = new Blocker();

  const stream = new Readable({
    objectMode: true,
    highWaterMark,
    read() {
      // The consumer asked for more data: let paused segment workers continue once the
      // internal buffer has drained below the high-water mark.
      if (blocker.isBlocked() && this.readableLength - chunkSize < this.readableHighWaterMark) {
        blocker.unblock();
      }

      return;
    },
  });

  debug(
    `Started parallel scan with ${concurrency} threads. Total items count: ${totalTableItemsCount}`
  );

  // Deliberately not awaited: the stream is returned right away and filled in the background.
  Promise.all(
    segments.map((_, segmentIndex) =>
      getItemsFromSegment({
        scanParams,
        stream,
        concurrency,
        segmentIndex,
        chunkSize,
        blocker,
        client: ddbClient,
      })
    )
  )
    .then(() => {
      // mark that there will be nothing else pushed into a stream
      stream.push(null);
    })
    .catch(error => {
      // Surface scan failures to the consumer instead of leaving an unhandled rejection
      // and a stream that never ends.
      stream.destroy(error);
    });

  return stream;
}
describe('parallelScan', () => {
  // Fixture: ten small items, three of which carry a `fileSize` attribute.
  const files = [
    {id: 'some-file-id-1', isLarge: false},
    {id: 'some-file-id-2', isLarge: false},
    {id: 'some-file-id-3', fileSize: 100, isLarge: false},
    {id: 'some-file-id-4', isLarge: false},
    {id: 'some-file-id-5', isLarge: false},
    {id: 'some-file-id-6', fileSize: 200, isLarge: false},
    {id: 'some-file-id-7', isLarge: false},
    {id: 'some-file-id-8', isLarge: false},
    {id: 'some-file-id-9', fileSize: 300, isLarge: false},
    {id: 'some-file-id-10', isLarge: false},
  ];

  beforeAll(async () => {
    const docClient = ddbv3DocClient();
    await insertMany({items: files, tableName: 'files'}, docClient);
  });

  it('should return all items with concurrency 1', async () => {
    const items = await parallelScan(
      {
        TableName: 'files',
        FilterExpression: 'attribute_exists(#id) and #isLarge = :false',
        ExpressionAttributeNames: {
          '#id': 'id',
          '#isLarge': 'isLarge',
        },
        ExpressionAttributeValues: {
          ':false': false,
        },
      },
      {concurrency: 1}
    );

    expect(items).toHaveLength(10);

    for (const item of items!) {
      expect(files).toContainEqual(item);
    }
  });

  it('should return all items with concurrency 50', async () => {
    const items = await parallelScan(
      {
        TableName: 'files',
        FilterExpression: 'attribute_exists(#id) and #isLarge = :false',
        ExpressionAttributeNames: {
          '#id': 'id',
          '#isLarge': 'isLarge',
        },
        ExpressionAttributeValues: {
          ':false': false,
        },
      },
      // Matches the test title, so the multi-segment code path is actually exercised.
      {concurrency: 50}
    );

    expect(items).toHaveLength(10);

    for (const item of items!) {
      expect(files).toContainEqual(item);
    }
  });

  it('should return 3 items with concurrency 50', async () => {
    const items = await parallelScan(
      {
        TableName: 'files',
        FilterExpression: 'attribute_exists(#fileSize) and #isLarge = :false',
        ExpressionAttributeNames: {
          '#fileSize': 'fileSize',
          '#isLarge': 'isLarge',
        },
        ExpressionAttributeValues: {
          ':false': false,
        },
      },
      // Matches the test title, so the multi-segment code path is actually exercised.
      {concurrency: 50}
    );

    expect(items).toHaveLength(3);

    for (const item of items!) {
      expect(files).toContainEqual(item);
    }
  });
});
Fetched ${totalItems.length} items`); 51 | 52 | return totalItems; 53 | } 54 | 55 | async function getItemsFromSegment( 56 | scanParams: ScanCommandInput, 57 | { 58 | concurrency, 59 | segmentIndex, 60 | client, 61 | }: {concurrency: number; segmentIndex: number; client: DynamoDBClient | DynamoDBDocument} 62 | ): Promise { 63 | const segmentItems: ScanCommandOutput['Items'] = []; 64 | let ExclusiveStartKey: ScanCommandInput['ExclusiveStartKey']; 65 | 66 | const params: ScanCommandInput = { 67 | ...cloneDeep(scanParams), 68 | Segment: segmentIndex, 69 | TotalSegments: concurrency, 70 | }; 71 | 72 | debug(`[${segmentIndex}/${concurrency}][start]`, {ExclusiveStartKey}); 73 | 74 | do { 75 | const now: number = Date.now(); 76 | 77 | if (ExclusiveStartKey) { 78 | params.ExclusiveStartKey = ExclusiveStartKey; 79 | } 80 | 81 | const {Items, LastEvaluatedKey, ScannedCount} = await scan(params, client); 82 | ExclusiveStartKey = LastEvaluatedKey; 83 | totalScannedItemsCount += ScannedCount!; 84 | 85 | for (const item of Items!) 
{ 86 | segmentItems.push(item); 87 | } 88 | 89 | debug( 90 | `(${Math.round((totalScannedItemsCount / totalTableItemsCount) * 100)}%) ` + 91 | `[${segmentIndex}/${concurrency}] [time:${Date.now() - now}ms] ` + 92 | `[fetched:${Items!.length}] ` + 93 | `[total (fetched/scanned/table-size):${totalFetchedItemsCount}/${totalScannedItemsCount}/${totalTableItemsCount}]` 94 | ); 95 | } while (ExclusiveStartKey); 96 | 97 | return segmentItems; 98 | } 99 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@shelf/tsconfig/backend", 3 | "exclude": ["node_modules"], 4 | "include": ["src"] 5 | } 6 | 7 | -------------------------------------------------------------------------------- /wallaby.config.js: -------------------------------------------------------------------------------- 1 | module.exports = wallaby => { 2 | process.env.NODE_ENV = 'test'; 3 | 4 | return { 5 | testFramework: 'jest', 6 | files: ['package.json', 'src/**/*.ts', '!src/**/*.test.ts'], 7 | tests: ['src/**/*.test.ts'], 8 | env: { 9 | type: 'node', 10 | runner: 'node', 11 | }, 12 | compilers: { 13 | 'src/**/*.js': wallaby.compilers.babel(), 14 | '**/*.ts?(x)': wallaby.compilers.typeScript(), 15 | }, 16 | setup(wallaby) { 17 | wallaby.testFramework.configure(require('./package.json').jest); 18 | 19 | process.env.TZ = 'UTC'; 20 | }, 21 | preprocessors: { 22 | '**/*.js': file => 23 | require('@babel/core').transform(file.content, { 24 | sourceMap: true, 25 | compact: false, 26 | filename: file.path, 27 | }), 28 | }, 29 | }; 30 | }; 31 | --------------------------------------------------------------------------------