├── tests ├── __init__.py └── unit │ ├── __init__.py │ └── test_ingestion_app_cdk_stack.py ├── ingestion_app_cdk ├── __init__.py └── ingestion_app_cdk_stack.py ├── requirements-dev.txt ├── requirements.txt ├── .gitignore ├── scripts └── faker_hr.py ├── source.bat ├── app.py ├── lambda └── process_files │ └── ingestion_batch.py ├── README.md └── cdk.json /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ingestion_app_cdk/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==6.2.5 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.123.0 2 | constructs>=10.0.0,<11.0.0 3 | 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | package-lock.json 3 | __pycache__ 4 | .pytest_cache 5 | .venv 6 | *.egg-info 7 | 8 | # CDK asset staging directory 9 | .cdk.staging 10 | cdk.out 11 | 12 | data/ 13 | 14 | .DS_Store 15 | 16 | .env 17 | .env.production 18 | .env.staging 19 | 20 | .idea/ 21 | 22 | 23 | -------------------------------------------------------------------------------- /scripts/faker_hr.py: -------------------------------------------------------------------------------- 1 | from faker import Faker 2 | import random 3 | 4 | fake = Faker() 5 | for _ in range(10): 6 | with open(f"data/employee_{random.randint(1, 100)}.txt", "w") as f: 7 | f.write(f"Name: {fake.name()}\n") 8 | f.write(f"Position: {fake.job()}\n") 9 | f.write(f"Performance: {random.choice(['Excellent', 'Good', 'Average', 'Poor'])}\n") 10 | 11 | -------------------------------------------------------------------------------- /source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /tests/unit/test_ingestion_app_cdk_stack.py: -------------------------------------------------------------------------------- 1 | import aws_cdk as core 2 | import aws_cdk.assertions as assertions 3 | 4 | from ingestion_app_cdk.ingestion_app_cdk_stack import IngestionAppCdkStack 5 | 6 | # example tests. To run these tests, uncomment this file along with the example 7 | # resource in ingestion_app_cdk/ingestion_app_cdk_stack.py 8 | def test_sqs_queue_created(): 9 | app = core.App() 10 | stack = IngestionAppCdkStack(app, "ingestion-app-cdk") 11 | template = assertions.Template.from_stack(stack) 12 | 13 | # template.has_resource_properties("AWS::SQS::Queue", { 14 | # "VisibilityTimeout": 300 15 | # }) 16 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | import aws_cdk as cdk 5 | from aws_cdk import App 6 | from ingestion_app_cdk.ingestion_app_cdk_stack import IngestionAppCdkStack 7 | 8 | 9 | 10 | 11 | app = cdk.App() 12 | IngestionAppCdkStack(app, "IngestionAppCdkStack", 13 | # If you don't specify 'env', this stack will be environment-agnostic. 14 | # Account/Region-dependent features and context lookups will not work, 15 | # but a single synthesized template can be deployed anywhere. 16 | 17 | # Uncomment the next line to specialize this stack for the AWS Account 18 | # and Region that are implied by the current CLI configuration. 19 | 20 | #env=cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), region=os.getenv('CDK_DEFAULT_REGION')), 21 | 22 | # Uncomment the next line if you know exactly what Account and Region you 23 | # want to deploy the stack to. */ 24 | 25 | #env=cdk.Environment(account='123456789012', region='us-east-1'), 26 | 27 | # For more information, see https://docs.aws.amazon.com/cdk/latest/guide/environments.html 28 | ) 29 | 30 | app.synth() 31 | -------------------------------------------------------------------------------- /lambda/process_files/ingestion_batch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | import base64 4 | 5 | def handler(event, context): 6 | # 1. Get environment variables 7 | bucket_name = os.environ['BUCKET_NAME'] 8 | instance_id = os.environ['INSTANCE_ID'] # Pass EC2 instance ID here 9 | target_directory = os.environ['TARGET_DIRECTORY'] 10 | 11 | # 2. Initialize AWS SDK clients 12 | s3 = boto3.client('s3') 13 | ssm = boto3.client('ssm') 14 | 15 | print(f"Processing files from bucket: {bucket_name} to EC2 instance: {instance_id}") 16 | 17 | # 3. Process each file upload event from S3 18 | for record in event['Records']: 19 | key = record['s3']['object']['key'] 20 | print(f"File uploaded: {key}") 21 | 22 | # 4. Download file content from S3 23 | response = s3.get_object(Bucket=bucket_name, Key=key) 24 | content = response['Body'].read() 25 | 26 | # 5. Encode the content in base64 27 | encoded_content = base64.b64encode(content).decode('utf-8') 28 | 29 | # 6. Define the remote path and command 30 | remote_path = f"{target_directory}/{key}" 31 | command = f'echo "{encoded_content}" | base64 -d > {remote_path}' 32 | 33 | print(f"Sending file {key} to {remote_path} on EC2 instance.") 34 | 35 | # 7. Send command to EC2 via SSM 36 | try: 37 | response = ssm.send_command( 38 | InstanceIds=[instance_id], 39 | DocumentName="AWS-RunShellScript", 40 | Parameters={"commands": [command]}, 41 | ) 42 | command_id = response['Command']['CommandId'] 43 | print(f"Command sent successfully. Command ID: {command_id}") 44 | except Exception as e: 45 | print(f"Failed to send command for file {key}: {str(e)}") 46 | 47 | return { 48 | 'statusCode': 200, 49 | 'body': 'Files processed successfully' 50 | } 51 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Data Transfer using AWS CDK Python! 3 | 4 | This application creates an AWS Stack in the cloud to move files from an s3 bucket to an ec2 folder 5 | uses S3 Bucket Events & Lambda to send data to an ec2 server 6 | 7 | 8 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 9 | 10 | This project is set up like a standard Python project. The initialization 11 | process also creates a virtualenv within this project, stored under the `.venv` 12 | directory. To create the virtualenv it assumes that there is a `python3` 13 | (or `python` for Windows) executable in your path with access to the `venv` 14 | package. If for any reason the automatic creation of the virtualenv fails, 15 | you can create the virtualenv manually. 16 | 17 | To manually create a virtualenv on MacOS and Linux: 18 | 19 | ``` 20 | $ python3 -m venv .venv 21 | ``` 22 | 23 | After the init process completes and the virtualenv is created, you can use the following 24 | step to activate your virtualenv. 25 | 26 | ``` 27 | $ source .venv/bin/activate 28 | ``` 29 | 30 | If you are a Windows platform, you would activate the virtualenv like this: 31 | 32 | ``` 33 | % .venv\Scripts\activate.bat 34 | ``` 35 | 36 | Once the virtualenv is activated, you can install the required dependencies. 37 | 38 | ``` 39 | $ pip install -r requirements.txt 40 | ``` 41 | 42 | At this point you can now synthesize the CloudFormation template for this code. 43 | 44 | ``` 45 | $ cdk synth 46 | ``` 47 | 48 | To add additional dependencies, for example other CDK libraries, just add 49 | them to your `setup.py` file and rerun the `pip install -r requirements.txt` 50 | command. 51 | 52 | ## Useful commands 53 | 54 | * `cdk ls` list all stacks in the app 55 | * `cdk synth` emits the synthesized CloudFormation template 56 | * `cdk deploy` deploy this stack to your default AWS account/region 57 | * `cdk diff` compare deployed stack with current state 58 | * `cdk docs` open CDK documentation 59 | 60 | Enjoy! 61 | -------------------------------------------------------------------------------- /cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "**/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 19 | "@aws-cdk/core:checkSecretUsage": true, 20 | "@aws-cdk/core:target-partitions": [ 21 | "aws", 22 | "aws-cn" 23 | ], 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 29 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 30 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 31 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 32 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 33 | "@aws-cdk/core:enablePartitionLiterals": true, 34 | "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, 35 | "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, 36 | "@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, 37 | "@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, 38 | "@aws-cdk/aws-route53-patters:useCertificate": true, 39 | "@aws-cdk/customresources:installLatestAwsSdkDefault": false, 40 | "@aws-cdk/aws-rds:databaseProxyUniqueResourceName": true, 41 | "@aws-cdk/aws-codedeploy:removeAlarmsFromDeploymentGroup": true, 42 | "@aws-cdk/aws-apigateway:authorizerChangeDeploymentLogicalId": true, 43 | "@aws-cdk/aws-ec2:launchTemplateDefaultUserData": true, 44 | "@aws-cdk/aws-secretsmanager:useAttachedSecretResourcePolicyForSecretTargetAttachments": true, 45 | "@aws-cdk/aws-redshift:columnId": true, 46 | "@aws-cdk/aws-stepfunctions-tasks:enableEmrServicePolicyV2": true, 47 | "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true, 48 | "@aws-cdk/aws-apigateway:requestValidatorUniqueId": true, 49 | "@aws-cdk/aws-kms:aliasNameRef": true, 50 | "@aws-cdk/aws-autoscaling:generateLaunchTemplateInsteadOfLaunchConfig": true, 51 | "@aws-cdk/core:includePrefixInUniqueNameGeneration": true, 52 | "@aws-cdk/aws-efs:denyAnonymousAccess": true, 53 | "@aws-cdk/aws-opensearchservice:enableOpensearchMultiAzWithStandby": true, 54 | "@aws-cdk/aws-lambda-nodejs:useLatestRuntimeVersion": true, 55 | "@aws-cdk/aws-efs:mountTargetOrderInsensitiveLogicalId": true, 56 | "@aws-cdk/aws-rds:auroraClusterChangeScopeOfInstanceParameterGroupWithEachParameters": true, 57 | "@aws-cdk/aws-appsync:useArnForSourceApiAssociationIdentifier": true, 58 | "@aws-cdk/aws-rds:preventRenderingDeprecatedCredentials": true, 59 | "@aws-cdk/aws-codepipeline-actions:useNewDefaultBranchForCodeCommitSource": true, 60 | "@aws-cdk/aws-cloudwatch-actions:changeLambdaPermissionLogicalIdForLambdaAction": true, 61 | "@aws-cdk/aws-codepipeline:crossAccountKeysDefaultValueToFalse": true, 62 | "@aws-cdk/aws-codepipeline:defaultPipelineTypeToV2": true, 63 | "@aws-cdk/aws-kms:reduceCrossAccountRegionPolicyScope": true, 64 | "@aws-cdk/aws-eks:nodegroupNameAttribute": true, 65 | "@aws-cdk/aws-ec2:ebsDefaultGp3Volume": true, 66 | "@aws-cdk/aws-ecs:removeDefaultDeploymentAlarm": true, 67 | "@aws-cdk/custom-resources:logApiResponseDataPropertyTrueDefault": false, 68 | "@aws-cdk/aws-s3:keepNotificationInImportedBucket": false, 69 | "@aws-cdk/aws-ecs:reduceEc2FargateCloudWatchPermissions": true, 70 | "@aws-cdk/aws-ec2:ec2SumTImeoutEnabled": true, 71 | "@aws-cdk/aws-appsync:appSyncGraphQLAPIScopeLambdaPermission": true, 72 | "@aws-cdk/aws-rds:setCorrectValueForDatabaseInstanceReadReplicaInstanceResourceId": true, 73 | "@aws-cdk/core:cfnIncludeRejectComplexResourceUpdateCreatePolicyIntrinsics": true, 74 | "@aws-cdk/aws-lambda-nodejs:sdkV3ExcludeSmithyPackages": true 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /ingestion_app_cdk/ingestion_app_cdk_stack.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Stack, RemovalPolicy 2 | from aws_cdk import aws_s3 as s3 3 | from aws_cdk import aws_lambda as _lambda 4 | from aws_cdk import aws_iam as iam 5 | from aws_cdk import aws_s3_notifications as s3n 6 | from aws_cdk import aws_ec2 as ec2 7 | from constructs import Construct 8 | 9 | class IngestionAppCdkStack(Stack): 10 | def __init__(self, scope: Construct, id: str, **kwargs) -> None: 11 | super().__init__(scope, id, **kwargs) 12 | 13 | # Create the VPC once and reuse it 14 | vpc = self.create_vpc() 15 | 16 | # S3 Bucket for uploading HR files 17 | bucket = s3.Bucket(self, "HRDocumentsS3Bucket", 18 | versioned=True, 19 | removal_policy=RemovalPolicy.DESTROY, 20 | auto_delete_objects=True 21 | ) 22 | 23 | # EC2 Instance Role 24 | ec2_role = iam.Role(self, "EC2S3AccessRole", 25 | assumed_by=iam.ServicePrincipal("ec2.amazonaws.com"), 26 | managed_policies=[ 27 | iam.ManagedPolicy.from_aws_managed_policy_name("AmazonS3FullAccess"), 28 | iam.ManagedPolicy.from_aws_managed_policy_name("CloudWatchFullAccess") 29 | ] 30 | ) 31 | 32 | # EC2 Security Group with SSH ingress rule 33 | ec2_sg = ec2.SecurityGroup(self, "HREC2SecurityGroup", 34 | vpc=vpc, 35 | allow_all_outbound=True 36 | ) 37 | # ec2_sg.add_ingress_rule( 38 | # ec2.Peer.any_ipv4(), 39 | # ec2.Port.tcp(22), 40 | # "Allow SSH access from anywhere" 41 | # ) 42 | 43 | # EC2 Instance with SSM 44 | ec2_instance = ec2.Instance( 45 | self, 46 | "HRFileProcessingInstance", 47 | instance_type=ec2.InstanceType.of( 48 | ec2.InstanceClass.T2, 49 | ec2.InstanceSize.MICRO 50 | ), 51 | machine_image=ec2.MachineImage.latest_amazon_linux2023(), 52 | vpc=vpc, 53 | key_name="AS-RAG", 54 | security_group=ec2_sg, 55 | role=iam.Role( 56 | self, 57 | "SSMRole", 58 | assumed_by=iam.ServicePrincipal("ec2.amazonaws.com"), 59 | managed_policies=[ 60 | iam.ManagedPolicy.from_aws_managed_policy_name("AmazonSSMManagedInstanceCore") 61 | ] 62 | ), 63 | ) 64 | # EC2 with ssh 65 | # ec2_instance = ec2.Instance( 66 | # self, 67 | # "HRFileProcessingInstance", 68 | # instance_type=ec2.InstanceType.of( 69 | # ec2.InstanceClass.T2, 70 | # ec2.InstanceSize.MICRO 71 | # ), 72 | # machine_image=ec2.MachineImage.latest_amazon_linux(), 73 | # vpc=vpc, 74 | # vpc_subnets=ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC), # Ensures public subnet 75 | # security_group=ec2_sg, 76 | # key_name="AS-RAG", # Replace with your actual key pair name 77 | # role=ec2_role 78 | # ) 79 | 80 | # Lambda function to process files 81 | process_files_lambda = _lambda.Function( 82 | self, 83 | "HRIngestionBatch", 84 | runtime=_lambda.Runtime.PYTHON_3_9, 85 | handler="ingestion_batch.handler", # Ensure this matches the correct function inside ingestion_batch.py 86 | code=_lambda.Code.from_asset("lambda/process_files"), 87 | environment={ 88 | "BUCKET_NAME": bucket.bucket_name, 89 | "INSTANCE_ID": ec2_instance.instance_id, 90 | "TARGET_DIRECTORY": "/home/ec2-user/ingested_files" 91 | } 92 | ) 93 | 94 | # Add IAM policy to the Lambda's role 95 | process_files_lambda.add_to_role_policy( 96 | iam.PolicyStatement( 97 | actions=["ssm:StartSession", "ssm:SendCommand"], 98 | resources=["*"] 99 | ) 100 | ) 101 | 102 | 103 | # Grant Lambda access to S3 bucket 104 | bucket.grant_read_write(process_files_lambda) 105 | 106 | # Notify Lambda on file upload to S3 107 | bucket.add_event_notification( 108 | s3.EventType.OBJECT_CREATED, 109 | s3n.LambdaDestination(process_files_lambda) 110 | ) 111 | 112 | # Grant EC2 access to S3 bucket 113 | bucket.grant_read_write(ec2_instance.role) 114 | 115 | def create_vpc(self): 116 | return ec2.Vpc(self, "HR_VPC", max_azs=2) 117 | --------------------------------------------------------------------------------