├── .vscode ├── .ropeproject │ └── config.py └── settings.json ├── Blogs_Code ├── Airflow_Configuration.MD ├── DAG │ ├── Readme.md │ ├── lambda_hook_invoke.py │ └── lambda_invoke.py ├── New Text Document.txt ├── record_transform.py ├── sam_deploy.zip ├── send_records.py └── win_feature.PNG ├── Contents ├── Dataset.MD ├── Pipelines.MD ├── Tools.MD └── Tools_Detail.MD ├── Images ├── RBAC.png ├── daily_batch.jpg ├── data_view.PNG ├── dtree.PNG ├── packagetree.PNG ├── readme.MD ├── stream_processing.jpg ├── testresult.PNG ├── tools_used.jpg ├── ubuntu.PNG ├── ui1.png ├── ui2.png ├── ui3.png ├── visual.gif └── win_feature.PNG ├── Pipelines ├── README.md ├── README.md.template.txt ├── dynamo.py ├── kinesis_to_s3.py ├── s3_to_kinesis.py └── web_scrape_jobs.py /.vscode/.ropeproject/config.py: -------------------------------------------------------------------------------- 1 | # The default ``config.py`` 2 | # flake8: noqa 3 | 4 | 5 | def set_prefs(prefs): 6 | """This function is called before opening the project""" 7 | 8 | # Specify which files and folders to ignore in the project. 9 | # Changes to ignored resources are not added to the history and 10 | # VCSs. Also they are not returned in `Project.get_files()`. 11 | # Note that ``?`` and ``*`` match all characters but slashes. 12 | # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc' 13 | # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc' 14 | # '.svn': matches 'pkg/.svn' and all of its children 15 | # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o' 16 | # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o' 17 | prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject', 18 | '.hg', '.svn', '_svn', '.git', '.tox'] 19 | 20 | # Specifies which files should be considered python files. It is 21 | # useful when you have scripts inside your project. Only files 22 | # ending with ``.py`` are considered to be python files by 23 | # default. 24 | #prefs['python_files'] = ['*.py'] 25 | 26 | # Custom source folders: By default rope searches the project 27 | # for finding source folders (folders that should be searched 28 | # for finding modules). You can add paths to that list. Note 29 | # that rope guesses project source folders correctly most of the 30 | # time; use this if you have any problems. 31 | # The folders should be relative to project root and use '/' for 32 | # separating folders regardless of the platform rope is running on. 33 | # 'src/my_source_folder' for instance. 34 | #prefs.add('source_folders', 'src') 35 | 36 | # You can extend python path for looking up modules 37 | #prefs.add('python_path', '~/python/') 38 | 39 | # Should rope save object information or not. 40 | prefs['save_objectdb'] = True 41 | prefs['compress_objectdb'] = False 42 | 43 | # If `True`, rope analyzes each module when it is being saved. 44 | prefs['automatic_soa'] = True 45 | # The depth of calls to follow in static object analysis 46 | prefs['soa_followed_calls'] = 0 47 | 48 | # If `False` when running modules or unit tests "dynamic object 49 | # analysis" is turned off. This makes them much faster. 50 | prefs['perform_doa'] = True 51 | 52 | # Rope can check the validity of its object DB when running. 53 | prefs['validate_objectdb'] = True 54 | 55 | # How many undos to hold? 56 | prefs['max_history_items'] = 32 57 | 58 | # Shows whether to save history across sessions. 59 | prefs['save_history'] = True 60 | prefs['compress_history'] = False 61 | 62 | # Set the number spaces used for indenting. 
According to 63 | # :PEP:`8`, it is best to use 4 spaces. Since most of rope's 64 | # unit-tests use 4 spaces it is more reliable, too. 65 | prefs['indent_size'] = 4 66 | 67 | # Builtin and c-extension modules that are allowed to be imported 68 | # and inspected by rope. 69 | prefs['extension_modules'] = [] 70 | 71 | # Add all standard c-extensions to extension_modules list. 72 | prefs['import_dynload_stdmods'] = True 73 | 74 | # If `True` modules with syntax errors are considered to be empty. 75 | # The default value is `False`; When `False` syntax errors raise 76 | # `rope.base.exceptions.ModuleSyntaxError` exception. 77 | prefs['ignore_syntax_errors'] = False 78 | 79 | # If `True`, rope ignores unresolvable imports. Otherwise, they 80 | # appear in the importing namespace. 81 | prefs['ignore_bad_imports'] = False 82 | 83 | # If `True`, rope will insert new module imports as 84 | # `from import ` by default. 85 | prefs['prefer_module_from_imports'] = False 86 | 87 | # If `True`, rope will transform a comma list of imports into 88 | # multiple separate import statements when organizing 89 | # imports. 90 | prefs['split_imports'] = False 91 | 92 | # If `True`, rope will remove all top-level import statements and 93 | # reinsert them at the top of the module when making changes. 94 | prefs['pull_imports_to_top'] = True 95 | 96 | # If `True`, rope will sort imports alphabetically by module name instead of 97 | # alphabetically by import statement, with from imports after normal 98 | # imports. 99 | prefs['sort_imports_alphabetically'] = False 100 | 101 | # Location of implementation of rope.base.oi.type_hinting.interfaces.ITypeHintingFactory 102 | # In general case, you don't have to change this value, unless you're an rope expert. 103 | # Change this value to inject you own implementations of interfaces 104 | # listed in module rope.base.oi.type_hinting.providers.interfaces 105 | # For example, you can add you own providers for Django Models, or disable the search 106 | # type-hinting in a class hierarchy, etc. 107 | prefs['type_hinting_factory'] = 'rope.base.oi.type_hinting.factory.default_type_hinting_factory' 108 | 109 | 110 | def project_opened(project): 111 | """This function is called after opening the project""" 112 | # Do whatever you like here! 113 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.linting.enabled": true 3 | } -------------------------------------------------------------------------------- /Blogs_Code/Airflow_Configuration.MD: -------------------------------------------------------------------------------- 1 | # Apache Airflow Configuration 2 | 3 | I configured Airflow on windows desktop by enabling the Linux subsystem and downloading the Ubuntu app from Microsoft store. 
4 | This can also be achieved on any Linux distribution, with possibly slight changes to the commands.
5 | ## Table of Contents
6 | 
7 | - [Enable Linux Subsystem](#Enable-Linux-Subsystem)
8 | - [Download ubuntu from Microsoft store](#Download-ubuntu-from-Microsoft-store)
9 | - [Prerequisites](#Prerequisites)
10 | - [Virtual Environment](#Virtual-Environment)
11 | - [Airflow Installation](#Airflow-Installation)
12 | - [Creating Airflow home](#Creating-Airflow-home)
13 | - [Initialize Airflow DB](#Initialize-Airflow-DB)
14 | - [Start Airflow Scheduler](#Start-Airflow-Scheduler)
15 | 
16 | ## Enable Linux Subsystem
17 | ### Windows > Turn Windows features on or off
18 | 
19 | ![](https://github.com/vijaykothareddy/Data-Engineering/blob/master/Images/win_feature.PNG)
20 | 
21 | Enable the highlighted feature.
22 | 
23 | 
24 | ## Download ubuntu from Microsoft store
25 | 
26 | ![](https://github.com/vijaykothareddy/Data-Engineering/blob/master/Images/ubuntu.PNG)
27 | 
28 | Launching the app takes you to a Bash terminal; run the commands below.
29 | 
30 | 
31 | ## Prerequisites
32 | 
33 | Update Ubuntu with the latest packages:
34 | 
35 | ```console
36 | kotharv@vijay-dell:~$ sudo apt-get update
37 | kotharv@vijay-dell:~$ sudo apt-get upgrade
38 | kotharv@vijay-dell:~$ sudo su
39 | ```
40 | Install `pip` for downloading Python modules and
41 | `virtualenv` to create a virtual environment for Airflow:
42 | 
43 | ```console
44 | root@vijay-dell:/home/kotharv# apt install python-pip
45 | root@vijay-dell:/home/kotharv# sudo apt install python3-pip
46 | root@vijay-dell:/home/kotharv# apt install virtualenv
47 | ```
48 | ## Virtual Environment
49 | ```console
50 | root@vijay-dell:/home/kotharv# cd /mnt/e/airflow/workspace/
51 | root@vijay-dell:/mnt/e/airflow/workspace# virtualenv -p `which python3` venv
52 | root@vijay-dell:/mnt/e/airflow/workspace# source venv/bin/activate
53 | (venv) root@vijay-dell:/mnt/e/airflow/workspace#
54 | ```
55 | 
56 | ## Airflow Installation
57 | ```console
58 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# pip install apache-airflow
59 | ```
60 | ## Creating Airflow home
61 | 
62 | The tricky part: run ` cd /mnt ` and you will see the Windows drive partitions; make a dedicated directory for Airflow
63 | so that all the Airflow configuration stays in one place.
64 | 
65 | ```console
66 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# mkdir airflow_home
67 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# export AIRFLOW_HOME=`pwd`/airflow_home
68 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# echo $AIRFLOW_HOME
69 | /mnt/e/airflow/workspace/airflow_home
70 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# airflow version
71 | 1.10.10
72 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# cd airflow_home/
73 | (venv) root@vijay-dell:/mnt/e/airflow/workspace/airflow_home# ls
74 | airflow.cfg logs unittests.cfg
75 | ```
76 | The above commands set your environment variable *AIRFLOW_HOME*.
77 | 
78 | 
79 | Running the `airflow version` command generates the config files in the Airflow home directory; it also serves as a quick test that your environment variable is set correctly.
80 | ## Initialize Airflow DB
81 | 
82 | You can also configure MySQL or Postgres as the backend database for your Airflow web application.
83 | 
84 | By default, SQLite is configured. It generates an `airflow.db` file in your Airflow home directory.
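If you would rather use Postgres instead of the SQLite default, one option is to point Airflow at it before running `airflow initdb`. A minimal sketch, assuming a local Postgres instance with a database and user both named `airflow` (adjust the connection string to your setup):

```console
(venv) root@vijay-dell:/mnt/e/airflow/workspace# pip install psycopg2-binary
(venv) root@vijay-dell:/mnt/e/airflow/workspace# export AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@localhost:5432/airflow
```

The same connection string can instead be set as `sql_alchemy_conn` under the `[core]` section of `airflow.cfg`.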
85 | ```console 86 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# airflow initdb 87 | Start Airflow Webserver 88 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# airflow webserver 89 | ``` 90 | ## Start Airflow Scheduler 91 | 92 | When you create a new DAG, start scheduler to update Airflow dashboard with latest jobs. 93 | 94 | ```console 95 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# airflow scheduler 96 | ``` 97 | -------------------------------------------------------------------------------- /Blogs_Code/DAG/Readme.md: -------------------------------------------------------------------------------- 1 | # Dynamic Acyclic Graph ( DAG ) sample python code 2 | 3 | 4 | *lambda_invoke.py* sample code with BOTO3 5 | 6 | 7 | *lambda_hook_invoke.py* sample code with AWS hook 8 | -------------------------------------------------------------------------------- /Blogs_Code/DAG/lambda_hook_invoke.py: -------------------------------------------------------------------------------- 1 | # The DAG object; we'll need this to instantiate a DAG 2 | from airflow import DAG 3 | # Operators; we need this to operate! 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | from airflow.operators.bash_operator import BashOperator 7 | # Datetime and other 8 | from datetime import datetime 9 | import boto3, json 10 | # Importing airflow hook 11 | from airflow.contrib.hooks.aws_lambda_hook import AwsLambdaHook 12 | 13 | # Default arguments and can be overwritten at operator initialization 14 | default_args = { 15 | 'owner': 'Vijay', 16 | 'depends_on_past': False, 17 | 'start_date': datetime(2020,5,1), 18 | 'email': ['@gmail.com'], 19 | 'email_on_failure': False, 20 | 'email_on_retry': False, 21 | 'retries': 1, 22 | } 23 | 24 | 25 | 26 | # Function 27 | 28 | def lambda1(ds,**kwargs): 29 | 30 | hook = AwsLambdaHook('myAirflowTest', region_name='', log_type='None',qualifier='$LATEST',invocation_type='RequestResponse',config=None,aws_conn_id='my_lambda') 31 | response_1 = hook.invoke_lambda(payload='null') 32 | print ('Response--->' , response_1) 33 | 34 | # Task 35 | 36 | # Using the context manager alllows you not to duplicate the dag parameter in each operator 37 | with DAG('invocation_hook_lambda', default_args=default_args, description='invoke a lambda in dev aws instance') as dag: 38 | 39 | start = DummyOperator(task_id='Begin_execution') 40 | 41 | t1 = PythonOperator( 42 | task_id="lambda1", 43 | python_callable=lambda1, 44 | provide_context=True 45 | ) 46 | 47 | t2 = BashOperator( 48 | task_id="run_with_lambda", 49 | bash_command="echo 1" 50 | ) 51 | 52 | t3 = BashOperator( 53 | task_id="run_with_lambda2", 54 | bash_command="echo 1" 55 | ) 56 | end = DummyOperator(task_id='stop_execution') 57 | 58 | 59 | 60 | start >> [t1, t2] >> t3 >>end 61 | -------------------------------------------------------------------------------- /Blogs_Code/DAG/lambda_invoke.py: -------------------------------------------------------------------------------- 1 | # The DAG object; we'll need this to instantiate a DAG 2 | from airflow import DAG 3 | # Operators; we need this to operate! 
4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | # Datetime and other 7 | from datetime import datetime 8 | import boto3, json 9 | 10 | # Default arguments and can be overwritten at operator initialization 11 | default_args = { 12 | 'owner': 'Vijay', 13 | 'depends_on_past': False, 14 | 'start_date': datetime(2020,4,17), 15 | 'email': ['@gmail.com'], 16 | 'email_on_failure': False, 17 | 'email_on_retry': False, 18 | 'retries': 1, 19 | } 20 | 21 | # DAG initialization 22 | dag = DAG( 23 | 'invocation_lambda', 24 | default_args=default_args, 25 | description='invoke a lambda in dev aws instance' 26 | ) 27 | 28 | # Function 29 | 30 | def lambda1(ds,**kwargs): 31 | lambda_client = boto3.client('lambda', 32 | region_name='', 33 | aws_access_key_id='', 34 | aws_secret_access_key='') 35 | response_1 = lambda_client.invoke(FunctionName='myAirflowTest',InvocationType='RequestResponse') 36 | print ('Response--->' , response_1) 37 | 38 | # Task 39 | 40 | start = DummyOperator(task_id='Begin_execution', dag=dag) 41 | 42 | t1 = PythonOperator( 43 | task_id="lambda1", 44 | python_callable=lambda1, 45 | provide_context=True, 46 | dag=dag 47 | ) 48 | 49 | end = DummyOperator(task_id='stop_execution', dag=dag) 50 | 51 | start >> t1 >> end -------------------------------------------------------------------------------- /Blogs_Code/New Text Document.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Blogs_Code/New Text Document.txt -------------------------------------------------------------------------------- /Blogs_Code/record_transform.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | from datetime import datetime 4 | 5 | # Incoming Event 6 | def lambda_handler(event, context): 7 | output = [] 8 | output_record = {} 9 | 10 | # Loop through records in incoming Event 11 | for record in event['records']: 12 | 13 | # Extract message 14 | 15 | message = base64.b64decode(record['data']).decode("UTF-8") 16 | 17 | # Construct output and add to the list 18 | 19 | rec = json.loads(message) 20 | rec['l3'] = 19 21 | 22 | output_record = { 23 | 'recordId': record['recordId'], 24 | 'result': 'Ok', 25 | 'data': base64.b64encode(bytes(json.dumps(rec), 'utf-8'))} 26 | output.append(output_record) 27 | 28 | return {'records': output} 29 | -------------------------------------------------------------------------------- /Blogs_Code/sam_deploy.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Blogs_Code/sam_deploy.zip -------------------------------------------------------------------------------- /Blogs_Code/send_records.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import json 3 | from urllib.parse import unquote_plus 4 | import time 5 | import csv 6 | import base64 7 | 8 | s3 = boto3.client('s3') 9 | kinesis = boto3.client('kinesis') 10 | 11 | def lambda_handler(event, context): 12 | my_records = [] 13 | 14 | 15 | if event: 16 | # Read bucketname, filename from event record 17 | 18 | file_obj = event["Records"][0] 19 | bucketname = file_obj['s3']['bucket']['name'] 20 | filename = file_obj['s3']['object']['key'] 21 | 22 | # Get 
reference to the file in the S3 bucket
23 | 
24 |         fileObj = s3.get_object(Bucket=bucketname, Key=filename)
25 | 
26 |         # Read the file body and decode the bytes
27 | 
28 |         file_content = fileObj["Body"].read().decode('utf-8')
29 | 
30 |         # Split to extract each record into a list
31 | 
32 |         file_rows = file_content.split()
33 | 
34 |         # Prepare each record and add it to the record list
35 | 
36 |         for row in range(len(file_rows)):
37 |             pk = 'vjcool' + str(row)
38 |             record = {
39 |                 'Data': bytes(file_rows[row], 'utf-8'),
40 | 
41 |                 'PartitionKey': pk
42 |             }
43 |             my_records.append(record)
44 | 
45 | 
46 |         # Put the records to Kinesis
47 | 
48 |         kinesis.put_records(Records=my_records, StreamName='my_cc_data')
49 | 
50 | 
51 | 
52 | 
53 |     return "thanks"
--------------------------------------------------------------------------------
/Blogs_Code/win_feature.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Blogs_Code/win_feature.PNG
--------------------------------------------------------------------------------
/Contents/Dataset.MD:
--------------------------------------------------------------------------------
1 | # About My Dataset
2 | I used a credit card complaints dataset for my project. It has 18 columns and 80k rows. The columns are of Date, Text and Number types. The dataset is a CSV file.
3 | You can find the dataset here: [Credit Card Complaints]( https://data.world/dataquest/bank-and-credit-card-complaints)
4 | 
5 | Have a look at a quick view of the dataset:
6 | ![](https://github.com/vijaykothareddy/Data-Engineering/blob/master/Images/data_view.PNG)
7 | # Why I chose it
8 | When I started to pursue Data Engineering, I researched industries based on the jobs available on a local job portal. Banking and telecommunications are the top two sectors posting jobs locally. It seemed relevant to do a hands-on project related to these industries, and that helped me narrow down the dataset.
9 | # What I like in the dataset
10 | It is a good candidate for drawing insights and calculating statistics such as averages, total complaints grouped by location, and time series analysis.
11 | # What is problematic?
12 | One of the columns, 'Sub-Issue', doesn't have any values, so values need to be imputed manually. The imputation can be based on the context in which the dataset is used.
13 | E.g., in my case I collected the Sub-Issue types and populated the column randomly.
14 | # What I want to achieve
15 | I will use this data as a source for my data pipelines to address some problem statements:
16 | 1. Analytics with streaming data
--------------------------------------------------------------------------------
/Contents/Pipelines.MD:
--------------------------------------------------------------------------------
1 | * [Pipelines](#Pipelines)
2 |   - [Stream Processing](#stream-processing)
3 |   - [Processing Data Stream](#processing-data-stream)
4 |   - [Storing Data Stream](#storing-data-stream)
5 |   - [Batch Processing](#Batch-processing)
6 |   - [Visualization](#visualization)
7 | # Pipelines
8 | 
9 | Data analytics is derived from various data sources and intermediate transformations. A pipeline is built from *connect, storage, process, buffer and visualization* elements to achieve the organization's data goals.
10 | ## Stream Processing
11 | 
12 | Stream processing is analyzing data in real time.
Logs from application servers, click streams from websites, mobile apps and IoT devices let you learn about your applications, products and customers through streaming data solutions.
13 | 
14 | I created a pipeline to visualize the credit card complaints stream.
15 | 
16 | ![](https://github.com/vijaykothareddy/Data-Engineering/blob/master/Images/stream_processing.jpg)
17 | 
18 | Let me take you through the solution.
19 | 
20 | ### Connect
21 | 
22 | I created a Python script to manually upload a JSON file to an AWS S3 bucket, which acts as my connect element (the stream data source).
23 | 
24 | ### Processing Data Stream
25 | 
26 | An AWS Lambda function is invoked on the S3 bucket's object-created event. The file content is transferred as records to a Kinesis stream.
27 | 
28 | ### Buffer
29 | 
30 | Kinesis Data Streams acts as a buffer for the real-time data, and Kinesis Data Firehose is my consumer, which acts as a data transformation service and delivers the data to Amazon Redshift.
31 | 
32 | ### Storing Data Stream
33 | 
34 | The data is eventually stored in Redshift, which is connected to analytics tools.
35 | 
36 | 
37 | ## Batch Processing
38 | 
39 | To eliminate the overhead of normalization, the data is stored in DynamoDB. A daily batch process loads the data into a SQL database for visualization or predictive analytics.
40 | 
41 | ![](https://github.com/vijaykothareddy/Data-Engineering/blob/master/Images/daily_batch.jpg)
42 | 
43 | ## Visualization
44 | 
45 | Power BI Desktop has been configured with the AWS Redshift data source to answer some of the business's analytical queries.
46 | 
47 | ![](https://github.com/vijaykothareddy/Data-Engineering/blob/master/Images/visual.gif)
48 | 
--------------------------------------------------------------------------------
/Contents/Tools.MD:
--------------------------------------------------------------------------------
1 | * [Tools Used](#Tools-Used)
2 |   - [AWS](#AWS-products-used)
3 |   - [Docker](#Docker)
4 |   - [Apache Airflow](#Apache-Airflow)
5 |   - [Power BI](#Power-BI)
6 |   - [Coding](#Coding)
7 |   - [IDE](#IDE)
8 | * [How they work](#How-they-work)
9 |   - [Lambda and S3](#Lambda-and-s3)
10 |   - [Kinesis and Dynamodb](#Kinesis-and-Dynamodb)
11 |   - [EC2, RDS](#EC2,-RDS)
12 | * [Tools set up method](#Tools-set-up-methods)
13 | # Tools Used
14 | 
15 | My technical stack decision was based on the results of researching job postings. AWS skills were in demand in most of the job descriptions, and AWS is used widely around the world.
16 | ### AWS products used
17 | * Amazon S3
18 | * Amazon Kinesis Data Streams
19 | * AWS Lambda
20 | * Amazon DynamoDB
21 | * Amazon RDS
22 | * Amazon EC2
23 | 
24 | ### Docker
25 | Docker is becoming more popular for its easy-to-use image concept. You can get practically any kind of operating environment just by pulling a container image onto your host OS.
26 | 
27 | ### Apache Airflow
28 | To orchestrate the workflow of tasks for your pipeline, Apache Airflow lets you build Directed Acyclic Graphs (DAGs) of your tasks and their dependencies.
29 | 
30 | ### Power BI
31 | Microsoft Power BI is a visualization tool that integrates with a wide variety of data sources, and you can get the free desktop version to explore and develop visualizations.
32 | It is a great tool, with in-memory technology for transforming data with Power Query and applying the result to your dataset.
33 | 
34 | ### Coding
35 | The Python boto3 package integrates with all the AWS resources; additionally, you can package any class with your AWS Lambda processing environment.
36 | 
37 | 
38 | I use Python for data manipulation and to develop custom classes.
39 | SQL is used to interact with relational databases.
40 | 
41 | * Python
42 | * SQL
43 | ### IDE
44 | 
45 | There are many integrated development environments; I use VS Code for Python development, and you can also download the AWS Toolkit.
46 | The AWS Toolkit helps you connect to the cloud and interact with Lambda functions.
47 | * VS Code
48 | * Jupyter Notebook
49 | 
50 | # How they work
51 | 
52 | ### Lambda and S3
53 | 
54 | A Lambda function is invoked based on a file-creation event in S3.
55 | * [Head over to the blog post to see how these are implemented](https://www.teamdatascience.com/post/how-to-process-simple-data-stream-and-consume-with-lambda)
56 | 
57 | ### Kinesis and Dynamodb
58 | 
59 | Kinesis acts as a buffer for streaming data.
60 | * [Click the link to see how stream data is stored in a NoSQL store](https://www.teamdatascience.com/post/how-to-write-kinesis-data-stream-to-dynamodb)
61 | 
62 | ### EC2 and RDS
63 | 
64 | Amazon EC2 is used to spin up a virtual machine with your preferred operating system; you can either host your application on it or run multiple applications by pulling Docker images.
65 | 
66 | Amazon RDS is a managed database service where you can run relational database management systems like SQL Server, Oracle and MySQL.
67 | 
68 | # Tools set up methods
69 | 
70 | The AWS stack is configured using the Management Console.
71 | First, create an AWS account, which gives you the root user.
72 | Best practice is to create an IAM user, assign the required privileges, and use that IAM user for interacting with AWS resources.
73 | 
74 | * Python code to connect
75 | ```python
76 | import boto3
77 | s3 = boto3.resource(
78 |     's3',
79 |     region_name='ap-southeast-2',
80 |     aws_access_key_id=<>,
81 |     aws_secret_access_key=<>
82 | )
83 | ```
84 | 
85 |
You should mask AWS access keys before uploading scripts to the internet (blogs, Git repositories and so on); once exposed, anyone can access your cloud account.
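A safer pattern is to keep keys out of the source entirely and let boto3 resolve credentials from its standard chain. A minimal sketch, assuming credentials are available via environment variables, `~/.aws/credentials`, or an attached IAM role:

```python
import boto3

# No keys in the code: boto3 picks up credentials from environment variables,
# the shared credentials file (~/.aws/credentials), or the attached IAM role.
s3 = boto3.resource('s3', region_name='ap-southeast-2')

# Quick check that the credentials work: list the buckets in the account.
for bucket in s3.buckets.all():
    print(bucket.name)
```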
86 |
87 | 
88 | 
--------------------------------------------------------------------------------
/Contents/Tools_Detail.MD:
--------------------------------------------------------------------------------
1 | * [Connect](#Connect)
2 | * Buffer
3 | * Processing
4 | * [Storage](#Storage)
5 | * Visualization
6 | 
7 | 
8 | 
9 | 
10 | ## Connect
11 | ## Buffer
12 | ## Processing
13 | ## Storage
14 | 
15 | The storage solution depends on the purpose. I stored my CSV data sets on AWS S3, which in turn acts as the data source for my pipeline.
16 | 
17 | Down the line, I write each row of the CSV data to a NoSQL database called DynamoDB.
18 | 
19 | ### Security
20 | 
21 | To use DynamoDB programmatically you need *Access Keys*.
22 | 
23 | 
24 | Access Key = Access Key ID + Secret Access Key
25 | 
26 | The IAM user must have the application-autoscaling permission, which is provided by the AWS managed policy AmazonDynamoDBFullAccess.
27 | ## Visualization
--------------------------------------------------------------------------------
/Images/RBAC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/RBAC.png
--------------------------------------------------------------------------------
/Images/daily_batch.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/daily_batch.jpg
--------------------------------------------------------------------------------
/Images/data_view.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/data_view.PNG
--------------------------------------------------------------------------------
/Images/dtree.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/dtree.PNG
--------------------------------------------------------------------------------
/Images/packagetree.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/packagetree.PNG
--------------------------------------------------------------------------------
/Images/readme.MD:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/Images/stream_processing.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/stream_processing.jpg
--------------------------------------------------------------------------------
/Images/testresult.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/testresult.PNG
--------------------------------------------------------------------------------
/Images/tools_used.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/tools_used.jpg
--------------------------------------------------------------------------------
/Images/ubuntu.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/ubuntu.PNG
--------------------------------------------------------------------------------
/Images/ui1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/ui1.png
--------------------------------------------------------------------------------
/Images/ui2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/ui2.png
--------------------------------------------------------------------------------
/Images/ui3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/ui3.png
--------------------------------------------------------------------------------
/Images/visual.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/visual.gif
--------------------------------------------------------------------------------
/Images/win_feature.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/win_feature.PNG
--------------------------------------------------------------------------------
/Pipelines:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Engineering with AWS
2 | Data pipelines with AWS
3 | 
4 | # Introduction & Goals
5 | 
6 | I have been using an on-premises IT infrastructure platform for storing and processing data.
7 | 
8 | To address real-time analytics with big data, I utilized **AWS serverless** architecture to design and develop the data pipeline.
9 | 
10 | This project took the general [blueprint](https://github.com/andkret/Cookbook/blob/master/sections/01-Introduction.md#my-data-science-platform-blueprint) as a reference to
11 | identify the tools required at each stage of pipeline design.
12 | 
13 | This project is mainly classified into three phases:
14 | 
15 | **Initial phase:** Data set selection, based on industry research.
16 | **Second phase:** Designing and developing the pipeline with the tools.
17 | **Third phase:** Documentation for the project and grouping the code used in the various phases of development.
18 | 
19 | #### About my data set
20 | 
21 | I conducted research to choose the type of industry, and banking was my preference.
22 | 
23 | Refer to my
24 | [blog post](https://www.teamdatascience.com/post/dba-focus-to-work-as-data-engineer)
25 | for a quick idea when you are in a dilemma over data set selection.
26 | 
27 | 
28 | I selected credit card complaints data, and the selection process is explained in my [blog](https://www.teamdatascience.com/post/data-sets)
29 | 
30 | #### Tools that I used
31 | 
32 | A quick snapshot of the tools used in this project is captured below:
33 | 
34 | 
35 | ![](https://github.com/vijaykothareddy/Data-Engineering/blob/master/Images/tools_used.jpg)
36 | 
37 | *Amazon Web Services, Docker for Amazon Machine Images, Airflow on Ubuntu, Python IDE and BI tools*
38 | #### What I did
39 | 
40 | I utilized managed services provided by Amazon Web Services, which offer greater capability at scale and optimal performance.
41 | 
42 | The data pipelines in this project use **Lambda** as the processing environment. To handle streaming data, **Kinesis** comes to the rescue and acts as a buffer too.
43 | 
44 | Eventually, data is stored in **Redshift, DynamoDB or S3 type** storage solutions.
45 | 
46 | Visualization tools or analytics products are connected to these data sources and used for data analytics.
47 | 
48 | #### Conclusion
49 | 
50 | This project has been designed assuming a huge data size, so that the cloud resources scale economically.
51 | 
52 | 
53 | # Contents
54 | 
55 | - [The Data Set](#the-data-set)
56 | - [Used Tools](#used-tools)
57 | - [Pipelines](#pipelines)
58 | - [Demo](#demo)
59 | - [Conclusion](#conclusion)
60 | - [Follow Me On](#follow-me-on)
61 | - [Appendix](#appendix)
62 | 
63 | 
64 | # The Data Set
65 | - [About my Data Set](Contents/Dataset.MD)
66 | - [Why did I choose it?](Contents/Dataset.MD)
67 | - [What do you like about it?](Contents/Dataset.MD)
68 | - [What is problematic?](Contents/Dataset.MD)
69 | - [What do you want to do with it?](Contents/Dataset.MD)
70 | 
71 | # Used Tools
72 | - [Tools I used](Contents/Tools.MD)
73 | - [How do they work](Contents/Tools.MD)
74 | - [Why I chose them](Contents/Tools.MD)
75 | - [Tools Setup](Contents/Tools.MD)
76 | 
77 | # Pipelines
78 | - [Stream Processing](Contents/Pipelines.MD)
79 | - [Storing Data Stream](Contents/Pipelines.MD)
80 | - [Processing Data Stream](Contents/Pipelines.MD)
81 | - [Batch Processing](Contents/Pipelines.MD)
82 | - [Visualizations](Contents/Pipelines.MD)
83 | 
84 | # Demo
85 | - You could add a demo video here
86 | - Or link to your presentation video of the project
87 | 
88 | # Conclusion
89 | Write a comprehensive conclusion.
90 | - I created an end-to-end data pipeline to deliver real-time analytics, with data available to BI systems with latency in seconds.
91 | 
92 | 
93 | #### Things I learned
94 | I learnt about:
95 | - Different AWS services like Lambda, Redshift, DynamoDB and others
96 | - User roles and policies
97 | - How to encode and decode data while dealing with multiple AWS services
98 | - How to integrate services with Python using boto3
99 | - How the scaling of resources plays a major role in planning
100 | #### Challenges that I Faced
101 | 
102 | ##### NoSQL
103 | I picked DynamoDB as one of my storage options. Its data modelling is quite the opposite of traditional RDBMS modelling; Amazon re:Invent videos and blogs helped me understand DynamoDB.
104 | ##### Python Package Deployment
105 | To use any Python package other than boto3, the required libraries need to be packaged with the Lambda function. Figuring out the right way to deploy Python packages is a challenging task.
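One common way to package such dependencies is sketched below. This is not necessarily the exact method used in this project; `requests`, `lambda_function.py` and the function name are placeholders, and the commands assume the AWS CLI is configured. The idea is to install the dependencies into a local folder, zip them together with the handler at the top level of the archive, and upload that archive as the function code:

```console
(venv) $ pip install requests -t ./package          # install the dependency into a local folder
(venv) $ cp lambda_function.py ./package/           # add the handler code next to its dependencies
(venv) $ cd package && zip -r ../deployment.zip .   # zip everything at the root of the archive
(venv) $ aws lambda update-function-code --function-name my_lambda_function --zip-file fileb://../deployment.zip
```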
106 | 107 | # Follow Me On 108 | 109 | 110 | [My Linkedin profile](https://www.linkedin.com/in/kvbr/) 111 | 112 | # Appendix 113 | 114 | [Markdown Cheat Sheet](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) 115 | -------------------------------------------------------------------------------- /README.md.template.txt: -------------------------------------------------------------------------------- 1 | 2 | ------------------------------------------------------------------ 3 | !!! Rename this file to README.md and upload it to your GitHub !!! 4 | ------------------------------------------------------------------ 5 | 6 | # Title Of Your Project 7 | Add a catchy title to your project. Something that people immediately know what you are doing 8 | 9 | # Introduction & Goals 10 | - Introduce your project to the reader 11 | - Orient this section on the Table of contents 12 | - Write this like an executive summary 13 | - With what data are you working 14 | - What tools are you using 15 | - What are you doing with these tools 16 | - Once you are finished add the conclusion here as well 17 | 18 | # Contents 19 | 20 | - [The Data Set](#the-data-set) 21 | - [Used Tools](#used-tools) 22 | - [Connect](#connect) 23 | - [Buffer](#buffer) 24 | - [Processing](#processing) 25 | - [Storage](#storage) 26 | - [Visualization](#visualization) 27 | - [Pipelines](#pipelines) 28 | - [Stream Processing](#stream-processing) 29 | - [Storing Data Stream](#storing-data-stream) 30 | - [Processing Data Stream](#processing-data-stream) 31 | - [Batch Processing](#batch-processing) 32 | - [Visualizations](#visualizations) 33 | - [Demo](#demo) 34 | - [Conclusion](#conclusion) 35 | - [Follow Me On](#follow-me-on) 36 | - [Appendix](#appendix) 37 | 38 | 39 | # The Data Set 40 | - Explain the data set 41 | - Why did you choose it? 42 | - What do you like about it? 43 | - What is problematic? 44 | - What do you want to do with it? 45 | 46 | # Used Tools 47 | - Explain which tools do you use and why 48 | - How do they work (don't go too deep into details, but add links) 49 | - Why did you choose them 50 | - How did you set them up 51 | 52 | ## Connect 53 | ## Buffer 54 | ## Processing 55 | ## Storage 56 | ## Visualization 57 | 58 | # Pipelines 59 | - Explain the pipelines for processing that you are building 60 | - Go through your development and add your source code 61 | 62 | ## Stream Processing 63 | ### Storing Data Stream 64 | ### Processing Data Stream 65 | ## Batch Processing 66 | ## Visualizations 67 | 68 | # Demo 69 | - You could add a demo video here 70 | - Or link to your presentation video of the project 71 | 72 | # Conclusion 73 | Write a comprehensive conclusion. 
74 | - How did this project turn out 75 | - What major things have you learned 76 | - What were the biggest challenges 77 | 78 | # Follow Me On 79 | Add the link to your LinkedIn Profile 80 | 81 | # Appendix 82 | 83 | [Markdown Cheat Sheet](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) 84 | -------------------------------------------------------------------------------- /dynamo.py: -------------------------------------------------------------------------------- 1 | 2 | import boto3 3 | import json 4 | import base64 5 | 6 | # Accessing the resource for writing stream data 7 | s3 = boto3.resource( 8 | 's3', 9 | region_name='<>', 10 | aws_access_key_id='<>', 11 | aws_secret_access_key='<>' 12 | ) 13 | dynamodb = boto3.resource('dynamodb', 14 | region_name='<>', 15 | aws_access_key_id='<>', 16 | aws_secret_access_key='<>') 17 | 18 | table = dynamodb.Table('cc_complaint') 19 | ev_lst = [] 20 | payload = b'low' 21 | 22 | def lambda_handler(event, context): 23 | #print(event) 24 | for record in event['Records']: 25 | 26 | # Kinesis data is base64 encoded so decode here 27 | 28 | payload=base64.b64decode(record["kinesis"]["data"]) 29 | 30 | 31 | # Writing data to Dynamo 32 | 33 | main_lst = payload.decode().split('\r\n') 34 | 35 | # Assign first index value as keys 36 | keys = main_lst[0].split(',') 37 | #print(len(main_lst)) 38 | #print(main_lst) 39 | # Loop through the records in uploaded CSV file from index 1 40 | for j in range(1, len(main_lst)-1): 41 | 42 | # Split each list 43 | val = main_lst[j].split(',') 44 | arr ={} 45 | print("lenth of values:",len(val)) 46 | print("length ok keys:",len(keys)) 47 | # Loop through keys to create item 48 | for i in range(len(val)): 49 | key = keys[i] 50 | value = val[i] 51 | if value is None: 52 | value = "5" 53 | if key == 'Complain_ID': 54 | print('yes complaint id') 55 | arr[key] = int(value) 56 | else: 57 | arr[key] = value 58 | print(arr) 59 | table.put_item(Item = arr) 60 | #s3.Object('kotharv-target', 'test.txt').put(Body=payload) 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /kinesis_to_s3.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import base64 3 | 4 | # Accessing the resource for writing stream data 5 | s3 = boto3.resource( 6 | 's3', 7 | region_name='ap-southeast-2', 8 | aws_access_key_id=<>, 9 | aws_secret_access_key=<> 10 | ) 11 | ev_lst = [] 12 | def lambda_handler(event, context): 13 | 14 | for record in event['Records']: 15 | 16 | #Kinesis data is base64 encoded so decode here 17 | payload=base64.b64decode(record["kinesis"]["data"]) 18 | ev_lst.append(payload) 19 | 20 | # writing data to S3 21 | s3.Object('kotharv-target', 'test.txt').put(Body=str(ev_lst)) 22 | -------------------------------------------------------------------------------- /s3_to_kinesis.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import json 3 | #from urllib.parse import unquote_plus 4 | import time 5 | import csv 6 | 7 | s3 = boto3.client('s3') 8 | kinesis = boto3.client('kinesis') 9 | 10 | def lambda_handler(event, context): 11 | if event: 12 | # Read bucketname, filename from event record 13 | file_obj = event["Records"][0] 14 | bucketname = file_obj['s3']['bucket']['name'] 15 | filename = file_obj['s3']['object']['key'] 16 | 17 | # Get referenc to file on s3 bucket 18 | fileObj = s3.get_object(Bucket=bucketname, Key=filename) 19 | 20 | # Convert the data in file 21 | 
file_content = fileObj["Body"].read().decode('utf-8')
22 | 
23 |         # Put the record to Kinesis
24 | 
25 |         kinesis.put_record(Data=bytes(file_content, 'utf-8'), StreamName='my_fake_stream', PartitionKey='vjfirst')
26 | 
27 | 
28 |     return "thanks"
--------------------------------------------------------------------------------
/web_scrape_jobs.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | link = 'https://www.seek.com.au/data-engineer-jobs-in-information-communication-technology'
4 | h_class = '_3MPUOLE'
5 | l_class = '_2e4Pi2B'
6 | base_url = 'https://www.seek.com.au'
7 | 
8 | # Extracts data by parsing the given HTML class
9 | def scrape_data(URL, cls):
10 |     page = requests.get(URL)
11 |     soup = BeautifulSoup(page.content, 'html.parser')
12 |     results = soup.find(class_=cls)
13 |     return results
14 | 
15 | # Extract the skill list from a job ad
16 | def skill_list(URL, cls):
17 |     page = requests.get(URL)
18 |     soup = BeautifulSoup(page.content, 'html.parser')
19 |     results = soup.find(class_=cls)
20 |     # Loop over every <ul> block in the job ad and print its text
21 |     for key in results.find_all('ul'):
22 |         print(key.text)
23 | 
24 | # Extracts the job title, location, company and skills by passing the different HTML classes
25 | 
26 | def extract_from_data(cls1, cls2, cls3):
27 |     results = scrape_data(link, h_class)
28 |     for result in results:
29 |         # Each job_elem is a new BeautifulSoup object.
30 |         # You can use the same methods on it as you did before.
31 |         title_elem = result.find('a', class_=cls1)
32 |         location_elem = result.find('strong', class_=cls2)
33 |         company_elem = result.find('a', class_=cls3)
34 |         job_link = base_url + title_elem['href']
35 |         print(title_elem.text.strip())
36 |         print(location_elem.text.strip())
37 |         print(company_elem.text.strip())
38 |         skill_list(job_link, l_class)
39 |         print()
40 | 
41 | extract_from_data('_2iNL7wI', 'lwHBT6d', '_3AMdmRg')
--------------------------------------------------------------------------------