├── .github
│   └── workflows
│       └── github-backup.yml
├── .gitignore
├── LICENCE
├── Makefile
├── README.md
├── dist
│   └── .gitkeep
├── lib
│   └── __init__.py
├── requirements.txt
├── src
│   └── DynamoToES
│       └── index.py
└── update_mapping.py

/.github/workflows/github-backup.yml:
--------------------------------------------------------------------------------
name: Mirror repo to S3
on:
  push:
    branches:
      - develop
jobs:
  s3Backup:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: S3 Backup
        uses: peter-evans/s3-backup@v1
        env:
          ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY }}
          SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY }}
          MIRROR_TARGET: bfansports-github-backup/dynamodb-to-elasticsearch
        with:
          args: --overwrite --remove
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Created by https://www.gitignore.io/api/python

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/*
develop-eggs/
dist/
downloads/
eggs/
.eggs/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# VirtualEnv
venv/
.env
dev_es_creds
lib/env.py
lib/table_mapping.json
--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 BFan Sports - Sport Archive, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
PROFILE :=
EVENT :=
DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

# Helper functions
FILTER_OUT = $(foreach v,$(2),$(if $(findstring $(1),$(v)),,$(v)))
TITLE_CASE = $(shell echo $1 | cut -c1 | tr '[[:lower:]]' '[[:upper:]]')$(shell echo $1 | cut -c2-)

.PHONY: help clean dist create/% run/% deploy deploy/% _check-desc _check-vers update_mapping
.SILENT: help

help:
	echo "SA-LAMBDA MAKEFILE FUNCTIONS"
	echo "----------------------------------------------------------"
	echo "(Add VERBOSE=1 for verbose output)"
	echo "----------------------------------------------------------"
	echo "Run all tests: make test"
	echo "Run a specific test: make test/TEST"
	echo "----------------------------------------------------------"
	echo "Create AWS function: make create/FUNCTION DESC='Func description'"
	echo "Package all functions: make dist"
	echo "Package a function: make dist/FUNCTION"
	echo "Deploy all functions: make deploy [ENV=prod] - Default ENV=dev"
	echo "Deploy a function: make deploy/FUNCTION [ENV=prod]"
	echo "Setup environment: make env [ENV=environment]"
	echo "Set function MEM size: make setmem/FUNCTION SIZE=[size]"
	echo "----------------------------------------------------------"

all: dist

create/%: dist/%.zip _check-desc .env
	if [ ! -n "${AWSENV_NAME}" ]; then \
		echo "No AWSENV_NAME environment variable declared. Set it up and retry. This is used to pull the credential file from the correct bucket. e.g: dev, dev-eu"; \
		exit 1; \
	fi;
	aws $(if ${PROFILE},--profile ${PROFILE},) s3 cp $< s3://${AWS_BUCKET_CODE}/lambda/$( DESC='Awesome function that does great things!'";
	@false;
endif

_check-size:
ifndef SIZE
	@echo "You must provide a size for your function! See lambda console and function configuration for list of memory.";
	@echo "e.g: make setmem/ SIZE=512";
	@false;
endif
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DynamoDB to ElasticSearch

This package allows you to easily ZIP a Lambda function and start processing DynamoDB Streams in order to index your DynamoDB objects in ElasticSearch.

It processes the following events:

- INSERT
- REMOVE
- MODIFY

We force an index refresh for each event, for close to realtime indexing.

The DynamoDB JSON objects are unmarshaled and types are correctly converted. (Binary types have never been tested though.)

Lists of numbers and strings are converted to lists of strings, because ES doesn't allow arrays of mixed data types.

Blog reference explaining what we're doing here: https://aws.amazon.com/blogs/compute/indexing-amazon-dynamodb-content-with-amazon-elasticsearch-service-using-aws-lambda/

Unfortunately, AWS removed the Lambda Blueprint that did what we are doing here, so we have rebuilt it ourselves.

# Get started

The deployment process is done through the Makefile.

You need to declare the following environment variables to get started:

- AWS_BUCKET_CODE: where your Lambda function code will go in AWS S3
- IAM_ROLE: the IAM role you created for the Lambda function to run with
- ENV: the environment you're running (DEV, QA, PROD, STAGE, ...); this is used to pull the correct config file from S3
- PROFILE (optional): the AWS profile to use to deploy; the default profile is used if unset

Obviously you need your AWS environment set up correctly.
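For example, a minimal shell setup could look like this (the bucket name, role ARN and profile below are placeholders; replace them with your own values):

```
export AWS_BUCKET_CODE=my-lambda-code-bucket
export IAM_ROLE=arn:aws:iam::123456789012:role/dynamo-to-es-lambda-role
export ENV=dev
export PROFILE=default
```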
## Create a config file

Create a simple file named `${ENV}_es_creds` and put the following in it:

```
ES_ENDPOINT='https://search-esclust-esclus-xxxxx-xxxxxx.{region}.es.amazonaws.com'
```

This file just contains the endpoint of your cluster. You should have one file per environment.

Upload it to the bucket where the Lambda function will also be uploaded.

Before zipping the function, we download that file locally and inject it into the Lambda package as the file `lib/env.py`. The function depends on it and imports this file.

That allows you to NOT hardcode your endpoint.
This way you can deploy several functions for different clusters and environments without touching the code.

Just set your `ENV` environment variable correctly, name your file accordingly, upload it, and that's it.

## Create the function

The first time, you need to create the function in AWS Lambda:

```
make create/DynamoToES DESC="Process DynamoDB stream to ES"
```

This will download your config file from S3, install all the Python packages in the `build` folder, ZIP the whole thing, upload the ZIP file to S3 and create your Lambda function. The default memory size is 128MB. You can change the memory of your function with the Makefile or in the console.

## Update the function

Let's say you make some changes to the code:

```
make deploy/DynamoToES DESC="Process DynamoDB stream to ES"
```

That will update the ZIP and refresh your Lambda function.

## Create and update the mapping

Why do we need a mapping?

DynamoDB Streams don't guarantee that the keys of a record arrive in the "right" order. Most of the time a stream record lists the primary (HASH) key first and the sort (RANGE) key second, but not 100% of the time. That's why we added an optional mapping to fix the key order.

What does the mapping script do?

It goes through all your DynamoDB tables, looks for enabled DynamoDB Streams, lists them, fetches the key schema of each table and stores everything in a JSON file. This JSON is then used to check whether the keys arrive in the right order; if they don't, the correct keys are picked according to the mapping.

Is the mapping mandatory?

No. Without the mapping the function works as before, trusting the DynamoDB Stream for the key order.

How do you update the mapping?

Use the script:

```
./update_mapping.py
```

This script gets all DynamoDB Streams linked to the function DynamoToES. If your function is named differently, change it in the script.

It creates the file `lib/table_mapping.json`.
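For reference, a trimmed-down `lib/table_mapping.json` could look like this (the table and attribute names are made up, and the real file also stores the full `DescribeTable` response for each table):

```
{
    "my_table": {
        "PrimaryKey": "id",
        "SortKey": "createdAt"
    }
}
```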
## Next

Now that your Lambda function is created, head to AWS Lambda, find the function you just created and click `Triggers`.
Now add triggers for all the DynamoDB tables you want to process.

Your DynamoDB tables must have DynamoDB Streams enabled.

Check your CloudWatch logs to make sure your function processes things correctly!
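If you prefer the CLI over the console, an event source mapping can also be created with something like this (the stream ARN below is a placeholder; use the one shown for your table):

```
aws lambda create-event-source-mapping \
  --function-name DynamoToES \
  --starting-position LATEST \
  --event-source-arn arn:aws:dynamodb:us-east-1:123456789012:table/my_table/stream/2024-01-01T00:00:00.000
```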
Enjoy!

--------------------------------------------------------------------------------
/dist/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bfansports/dynamodb-to-elasticsearch/616431f14c5fc9d02333bf3cbb6f0d6a6568a707/dist/.gitkeep
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
__all__ = ["common"]
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
boto3==1.34.116
requests==2.32.3
requests-aws4auth==1.2.3
elasticsearch==7.17.9
--------------------------------------------------------------------------------
/src/DynamoToES/index.py:
--------------------------------------------------------------------------------
import json
import re
import boto3
from lib import env
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
import os.path


reserved_fields = ["uid", "_id", "_type", "_source", "_all", "_parent", "_fieldnames", "_routing", "_index", "_size", "_timestamp", "_ttl"]


# Process DynamoDB Stream records and insert the object in ElasticSearch
# Use the Table name as index and doc_type name
# Force index refresh upon all actions for close to realtime reindexing
# Use IAM Role for authentication
# Properly unmarshal DynamoDB JSON types. Binary NOT tested.


# Load the mapping if it exists
table_mapping = None
if (os.path.isfile("lib/table_mapping.json")):
    with open('lib/table_mapping.json') as json_file:
        table_mapping = json.load(json_file)

def lambda_handler(event, context):

    session = boto3.session.Session()
    credentials = session.get_credentials()

    # Get proper credentials for ES auth
    awsauth = AWS4Auth(credentials.access_key,
                       credentials.secret_key,
                       session.region_name, 'es',
                       session_token=credentials.token)

    # Connect to ES
    es = Elasticsearch(
        [env.ES_ENDPOINT],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection
    )

    print("Cluster info:")
    print(es.info())

    # Loop over the DynamoDB Stream records
    for record in event['Records']:

        try:
            if record['eventName'] == "INSERT":
                insert_document(es, record)
            elif record['eventName'] == "REMOVE":
                remove_document(es, record)
            elif record['eventName'] == "MODIFY":
                modify_document(es, record)

        except Exception as e:
            print("Failed to process:")
            print(json.dumps(record))
            print("ERROR: " + repr(e))
            continue

# Process MODIFY events
def modify_document(es, record):
    table = getTable(record)
    print("Dynamo Table: " + table)

    docId = generateId(record, table)
    print("KEY")
    print(docId)

    # Unmarshal the DynamoDB JSON to a normal JSON
    doc = json.dumps(unmarshalJson(record['dynamodb']['NewImage']))

    print("Updated document:")
    print(doc)

    # We reindex the whole document rather than sending a partial update
    es.index(index=table,
             body=doc,
             id=docId,
             doc_type=table,
             refresh=True)

    print("Successfully modified - Index: " + table + " - Document ID: " + docId)

# Process REMOVE events
def remove_document(es, record):
    table = getTable(record)
    print("Dynamo Table: " + table)

    docId = generateId(record, table)
    print("Deleting document ID: " + docId)

    es.delete(index=table,
              id=docId,
              doc_type=table,
              refresh=True)

    print("Successfully removed - Index: " + table + " - Document ID: " + docId)

# Process INSERT events
def insert_document(es, record):
    table = getTable(record)
    print("Dynamo Table: " + table)

    # Create index if missing
    if not es.indices.exists(table):
        print("Create missing index: " + table)

        es.indices.create(table,
                          body='{"settings": { "index.mapping.coerce": true } }')

        print("Index created: " + table)

    # Unmarshal the DynamoDB JSON to a normal JSON
    doc = json.dumps(unmarshalJson(record['dynamodb']['NewImage']))

    print("New document to Index:")
    print(doc)

    newId = generateId(record, table)
    es.index(index=table,
             body=doc,
             id=newId,
             doc_type=table,
             refresh=True)

    print("Successfully inserted - Index: " + table + " - Document ID: " + newId)

# Return the DynamoDB table that received the event, lower-cased
def getTable(record):
    p = re.compile('arn:aws:dynamodb:.*?:.*?:table/([0-9a-zA-Z_-]+)/.+')
    m = p.match(record['eventSourceARN'])
    if m is None:
        raise Exception("Table not found in SourceARN")
    return m.group(1).lower()

# Generate the ID for ES. Used for deleting or updating the item later.
# By default the keys given by the DynamoDB stream are used.
# If a mapping is there, it is used to create the id.
def generateId(record, table_name):
    keys = unmarshalJson(record['dynamodb']['Keys'])
    if (table_mapping is not None
            and table_name in table_mapping.keys()):
        print("Use mapping")
        if ("SortKey" in table_mapping[table_name]):
            return(str(keys[table_mapping[table_name]["PrimaryKey"]]) + "|" + str(keys[table_mapping[table_name]["SortKey"]]))
        else:
            return(str(keys[table_mapping[table_name]["PrimaryKey"]]))

    # Concat HASH and RANGE key with | in between
    newId = ""
    i = 0
    for key, value in list(keys.items()):
        if (i > 0):
            newId += "|"
        newId += str(value)
        i += 1

    return newId

# Unmarshal a JSON that is DynamoDB formatted
def unmarshalJson(node):
    data = {}
    data["M"] = node
    return unmarshalValue(data, True)

# forceNum will force numeric strings ("N" and "NS" types) to int or float
def unmarshalValue(node, forceNum=False):
    for key, value in list(node.items()):
        if (key == "NULL"):
            return None
        if (key == "S" or key == "BOOL"):
            return value
        if (key == "N"):
            if (forceNum):
                return int_or_float(value)
            return value
        if (key == "M"):
            data = {}
            for key1, value1 in list(value.items()):
                if key1 in reserved_fields:
                    key1 = key1.replace("_", "__", 1)
                data[key1] = unmarshalValue(value1, True)
            return data
        if (key == "BS" or key == "L"):
            data = []
            for item in value:
                data.append(unmarshalValue(item))
            return data
        if (key == "SS"):
            data = []
            for item in value:
                data.append(item)
            return data
        if (key == "NS"):
            data = []
            for item in value:
                if (forceNum):
                    data.append(int_or_float(item))
                else:
                    data.append(item)
            return data

# Detect number type and return the correct one
def int_or_float(s):
    try:
        return int(s)
    except ValueError:
        return float(s)
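To illustrate the unmarshalling above, here is a small standalone sketch using boto3's `TypeDeserializer`, which performs the same kind of DynamoDB-JSON to plain-JSON conversion as `unmarshalJson()` (with small differences: it returns `Decimal` for numbers and a `set` for string sets, while the code above returns `int`/`float` and plain lists). The sample item is made up:

```
from boto3.dynamodb.types import TypeDeserializer

deserializer = TypeDeserializer()

# A DynamoDB-typed item, as it would appear in record['dynamodb']['NewImage']
dynamodb_item = {
    "id": {"S": "video-42"},
    "views": {"N": "1337"},
    "tags": {"SS": ["sports", "highlights"]},
    "meta": {"M": {"duration": {"N": "12.5"}, "public": {"BOOL": True}}},
}

# Convert every attribute to a plain Python value
plain = {k: deserializer.deserialize(v) for k, v in dynamodb_item.items()}
print(plain)
# e.g. {'id': 'video-42', 'views': Decimal('1337'), 'tags': {'sports', 'highlights'},
#       'meta': {'duration': Decimal('12.5'), 'public': True}}
```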
--------------------------------------------------------------------------------
/update_mapping.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import json
import re
import boto3
from lib import env
from datetime import date, datetime

# This script creates a JSON file used by the DynamoDB-to-Elasticsearch Lambda
# More information about why and how to use it is in the README


lambda_client = boto3.client('lambda')
ddb_client = boto3.client('dynamodb')

response = lambda_client.list_event_source_mappings(
    FunctionName='DynamoToES',
    MaxItems=100
)

def json_serial(obj):
    """JSON serializer for objects not serializable by default json code"""

    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    raise TypeError("Type %s not serializable" % type(obj))

# Same table-name pattern as in the Lambda
table_list = {
    re.search(r".+:table/([0-9a-zA-Z_-]+)/.+", event_source["EventSourceArn"]).group(1): event_source
    for event_source in response["EventSourceMappings"]
}

table_mapping = {
    table_name.lower(): ddb_client.describe_table(TableName=table_name)
    for (table_name, table) in table_list.items()
}
for table_name, table_description in table_mapping.items():
    temp_key_schema = table_description["Table"]["KeySchema"]

    primary_key = ""
    for value in temp_key_schema:
        primary_key = value["AttributeName"] if value["KeyType"] == "HASH" else primary_key

    sort_key = ""
    for value in temp_key_schema:
        sort_key = value["AttributeName"] if value["KeyType"] == "RANGE" else sort_key

    table_description["PrimaryKey"] = primary_key
    if (sort_key != ""):
        table_description["SortKey"] = sort_key

with open("lib/table_mapping.json", "w") as f:
    f.write(json.dumps(table_mapping, default=json_serial, indent=4, sort_keys=True))
--------------------------------------------------------------------------------