├── .vscode ├── .ropeproject │ └── config.py └── settings.json ├── Blogs_Code ├── Airflow_Configuration.MD ├── DAG │ ├── Readme.md │ ├── lambda_hook_invoke.py │ └── lambda_invoke.py ├── New Text Document.txt ├── record_transform.py ├── sam_deploy.zip ├── send_records.py └── win_feature.PNG ├── Contents ├── Dataset.MD ├── Pipelines.MD ├── Tools.MD └── Tools_Detail.MD ├── Images ├── RBAC.png ├── daily_batch.jpg ├── data_view.PNG ├── dtree.PNG ├── packagetree.PNG ├── readme.MD ├── stream_processing.jpg ├── testresult.PNG ├── tools_used.jpg ├── ubuntu.PNG ├── ui1.png ├── ui2.png ├── ui3.png ├── visual.gif └── win_feature.PNG ├── Pipelines ├── README.md ├── README.md.template.txt ├── dynamo.py ├── kinesis_to_s3.py ├── s3_to_kinesis.py └── web_scrape_jobs.py /.vscode/.ropeproject/config.py: -------------------------------------------------------------------------------- 1 | # The default ``config.py`` 2 | # flake8: noqa 3 | 4 | 5 | def set_prefs(prefs): 6 | """This function is called before opening the project""" 7 | 8 | # Specify which files and folders to ignore in the project. 9 | # Changes to ignored resources are not added to the history and 10 | # VCSs. Also they are not returned in `Project.get_files()`. 11 | # Note that ``?`` and ``*`` match all characters but slashes. 12 | # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc' 13 | # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc' 14 | # '.svn': matches 'pkg/.svn' and all of its children 15 | # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o' 16 | # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o' 17 | prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject', 18 | '.hg', '.svn', '_svn', '.git', '.tox'] 19 | 20 | # Specifies which files should be considered python files. It is 21 | # useful when you have scripts inside your project. Only files 22 | # ending with ``.py`` are considered to be python files by 23 | # default. 24 | #prefs['python_files'] = ['*.py'] 25 | 26 | # Custom source folders: By default rope searches the project 27 | # for finding source folders (folders that should be searched 28 | # for finding modules). You can add paths to that list. Note 29 | # that rope guesses project source folders correctly most of the 30 | # time; use this if you have any problems. 31 | # The folders should be relative to project root and use '/' for 32 | # separating folders regardless of the platform rope is running on. 33 | # 'src/my_source_folder' for instance. 34 | #prefs.add('source_folders', 'src') 35 | 36 | # You can extend python path for looking up modules 37 | #prefs.add('python_path', '~/python/') 38 | 39 | # Should rope save object information or not. 40 | prefs['save_objectdb'] = True 41 | prefs['compress_objectdb'] = False 42 | 43 | # If `True`, rope analyzes each module when it is being saved. 44 | prefs['automatic_soa'] = True 45 | # The depth of calls to follow in static object analysis 46 | prefs['soa_followed_calls'] = 0 47 | 48 | # If `False` when running modules or unit tests "dynamic object 49 | # analysis" is turned off. This makes them much faster. 50 | prefs['perform_doa'] = True 51 | 52 | # Rope can check the validity of its object DB when running. 53 | prefs['validate_objectdb'] = True 54 | 55 | # How many undos to hold? 56 | prefs['max_history_items'] = 32 57 | 58 | # Shows whether to save history across sessions. 59 | prefs['save_history'] = True 60 | prefs['compress_history'] = False 61 | 62 | # Set the number spaces used for indenting. 
According to 63 | # :PEP:`8`, it is best to use 4 spaces. Since most of rope's 64 | # unit-tests use 4 spaces it is more reliable, too. 65 | prefs['indent_size'] = 4 66 | 67 | # Builtin and c-extension modules that are allowed to be imported 68 | # and inspected by rope. 69 | prefs['extension_modules'] = [] 70 | 71 | # Add all standard c-extensions to extension_modules list. 72 | prefs['import_dynload_stdmods'] = True 73 | 74 | # If `True` modules with syntax errors are considered to be empty. 75 | # The default value is `False`; When `False` syntax errors raise 76 | # `rope.base.exceptions.ModuleSyntaxError` exception. 77 | prefs['ignore_syntax_errors'] = False 78 | 79 | # If `True`, rope ignores unresolvable imports. Otherwise, they 80 | # appear in the importing namespace. 81 | prefs['ignore_bad_imports'] = False 82 | 83 | # If `True`, rope will insert new module imports as 84 | # `from import ` by default. 85 | prefs['prefer_module_from_imports'] = False 86 | 87 | # If `True`, rope will transform a comma list of imports into 88 | # multiple separate import statements when organizing 89 | # imports. 90 | prefs['split_imports'] = False 91 | 92 | # If `True`, rope will remove all top-level import statements and 93 | # reinsert them at the top of the module when making changes. 94 | prefs['pull_imports_to_top'] = True 95 | 96 | # If `True`, rope will sort imports alphabetically by module name instead of 97 | # alphabetically by import statement, with from imports after normal 98 | # imports. 99 | prefs['sort_imports_alphabetically'] = False 100 | 101 | # Location of implementation of rope.base.oi.type_hinting.interfaces.ITypeHintingFactory 102 | # In general case, you don't have to change this value, unless you're an rope expert. 103 | # Change this value to inject you own implementations of interfaces 104 | # listed in module rope.base.oi.type_hinting.providers.interfaces 105 | # For example, you can add you own providers for Django Models, or disable the search 106 | # type-hinting in a class hierarchy, etc. 107 | prefs['type_hinting_factory'] = 'rope.base.oi.type_hinting.factory.default_type_hinting_factory' 108 | 109 | 110 | def project_opened(project): 111 | """This function is called after opening the project""" 112 | # Do whatever you like here! 113 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.linting.enabled": true 3 | } -------------------------------------------------------------------------------- /Blogs_Code/Airflow_Configuration.MD: -------------------------------------------------------------------------------- 1 | # Apache Airflow Configuration 2 | 3 | I configured Airflow on windows desktop by enabling the Linux subsystem and downloading the Ubuntu app from Microsoft store. 
4 | This can also be achieved on any Linux distribution, with possibly slight changes to the commands.
5 | ## Table of Contents
6 | 
7 | - [Enable Linux Subsystem](#Enable-Linux-Subsystem)
8 | - [Download ubuntu from Microsoft store](#Download-ubuntu-from-Microsoft-store)
9 | - [Prerequisites](#Prerequisites)
10 | - [Virtual Environment](#Virtual-Environment)
11 | - [Airflow Installation](#Airflow-Installation)
12 | - [Creating Airflow home](#Creating-Airflow-home)
13 | - [Initialize Airflow DB](#Initialize-Airflow-DB)
14 | - [Start Airflow Scheduler](#Start-Airflow-Scheduler)
15 | 
16 | ## Enable Linux Subsystem
17 | ### Windows > Turn Windows features on or off
18 | 
19 | ![](https://github.com/vijaykothareddy/Data-Engineering/blob/master/Images/win_feature.PNG)
20 | 
21 | Enable the highlighted feature.
22 | 
23 | 
24 | ## Download ubuntu from Microsoft store
25 | 
26 | ![](https://github.com/vijaykothareddy/Data-Engineering/blob/master/Images/ubuntu.PNG)
27 | 
28 | Launching the app takes you to a Bash terminal; run the commands below.
29 | 
30 | 
31 | ## Prerequisites
32 | 
33 | Update Ubuntu with the latest packages:
34 | 
35 | ```console
36 | kotharv@vijay-dell:~$ sudo apt-get update
37 | kotharv@vijay-dell:~$ sudo apt-get upgrade
38 | kotharv@vijay-dell:~$ sudo su
39 | ```
40 | Install `pip` for downloading Python modules and
41 | `virtualenv` to create a virtual environment for Airflow:
42 | 
43 | ```console
44 | root@vijay-dell:/home/kotharv# apt install python-pip
45 | root@vijay-dell:/home/kotharv# sudo apt install python3-pip
46 | root@vijay-dell:/home/kotharv# apt install virtualenv
47 | ```
48 | ## Virtual Environment
49 | ```console
50 | root@vijay-dell:/home/kotharv# cd /mnt/e/airflow/workspace/
51 | root@vijay-dell:/mnt/e/airflow/workspace# virtualenv -p `which python3` venv
52 | root@vijay-dell:/mnt/e/airflow/workspace# source venv/bin/activate
53 | (venv) root@vijay-dell:/mnt/e/airflow/workspace#
54 | ```
55 | 
56 | ## Airflow Installation
57 | ```console
58 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# pip install apache-airflow
59 | ```
60 | ## Creating Airflow home
61 | 
62 | The tricky part: run ` cd /mnt ` and you will see the Windows drive partitions; make a dedicated directory for Airflow
63 | so that all the Airflow configuration stays in one place.
64 | 
65 | ```console
66 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# mkdir airflow_home
67 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# export AIRFLOW_HOME=`pwd`/airflow_home
68 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# echo $AIRFLOW_HOME
69 | /mnt/e/airflow/workspace/airflow_home
70 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# airflow version
71 | 1.10.10
72 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# cd airflow_home/
73 | (venv) root@vijay-dell:/mnt/e/airflow/workspace/airflow_home# ls
74 | airflow.cfg logs unittests.cfg
75 | ```
76 | The above commands set your environment variable *AIRFLOW_HOME*.
77 | 
78 | 
79 | Running the `airflow version` command generates the config files in the Airflow home directory; it also serves as a quick test that your environment variable is set correctly.
80 | ## Initialize Airflow DB
81 | 
82 | You can also configure MySQL or Postgres as the backend database for your Airflow web application.
83 | 
84 | By default, SQLite is configured. It generates an `airflow.db` file in your Airflow home directory.
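If you would rather use Postgres instead of the SQLite default, one option is to point Airflow at it before running `airflow initdb`. A minimal sketch, assuming a local Postgres instance with a database and user both named `airflow` (adjust the connection string to your setup):

```console
(venv) root@vijay-dell:/mnt/e/airflow/workspace# pip install psycopg2-binary
(venv) root@vijay-dell:/mnt/e/airflow/workspace# export AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@localhost:5432/airflow
```

The same connection string can instead be set as `sql_alchemy_conn` under the `[core]` section of `airflow.cfg`.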
85 | ```console 86 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# airflow initdb 87 | Start Airflow Webserver 88 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# airflow webserver 89 | ``` 90 | ## Start Airflow Scheduler 91 | 92 | When you create a new DAG, start scheduler to update Airflow dashboard with latest jobs. 93 | 94 | ```console 95 | (venv) root@vijay-dell:/mnt/e/airflow/workspace# airflow scheduler 96 | ``` 97 | -------------------------------------------------------------------------------- /Blogs_Code/DAG/Readme.md: -------------------------------------------------------------------------------- 1 | # Dynamic Acyclic Graph ( DAG ) sample python code 2 | 3 | 4 | *lambda_invoke.py* sample code with BOTO3 5 | 6 | 7 | *lambda_hook_invoke.py* sample code with AWS hook 8 | -------------------------------------------------------------------------------- /Blogs_Code/DAG/lambda_hook_invoke.py: -------------------------------------------------------------------------------- 1 | # The DAG object; we'll need this to instantiate a DAG 2 | from airflow import DAG 3 | # Operators; we need this to operate! 4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | from airflow.operators.bash_operator import BashOperator 7 | # Datetime and other 8 | from datetime import datetime 9 | import boto3, json 10 | # Importing airflow hook 11 | from airflow.contrib.hooks.aws_lambda_hook import AwsLambdaHook 12 | 13 | # Default arguments and can be overwritten at operator initialization 14 | default_args = { 15 | 'owner': 'Vijay', 16 | 'depends_on_past': False, 17 | 'start_date': datetime(2020,5,1), 18 | 'email': ['@gmail.com'], 19 | 'email_on_failure': False, 20 | 'email_on_retry': False, 21 | 'retries': 1, 22 | } 23 | 24 | 25 | 26 | # Function 27 | 28 | def lambda1(ds,**kwargs): 29 | 30 | hook = AwsLambdaHook('myAirflowTest', region_name='', log_type='None',qualifier='$LATEST',invocation_type='RequestResponse',config=None,aws_conn_id='my_lambda') 31 | response_1 = hook.invoke_lambda(payload='null') 32 | print ('Response--->' , response_1) 33 | 34 | # Task 35 | 36 | # Using the context manager alllows you not to duplicate the dag parameter in each operator 37 | with DAG('invocation_hook_lambda', default_args=default_args, description='invoke a lambda in dev aws instance') as dag: 38 | 39 | start = DummyOperator(task_id='Begin_execution') 40 | 41 | t1 = PythonOperator( 42 | task_id="lambda1", 43 | python_callable=lambda1, 44 | provide_context=True 45 | ) 46 | 47 | t2 = BashOperator( 48 | task_id="run_with_lambda", 49 | bash_command="echo 1" 50 | ) 51 | 52 | t3 = BashOperator( 53 | task_id="run_with_lambda2", 54 | bash_command="echo 1" 55 | ) 56 | end = DummyOperator(task_id='stop_execution') 57 | 58 | 59 | 60 | start >> [t1, t2] >> t3 >>end 61 | -------------------------------------------------------------------------------- /Blogs_Code/DAG/lambda_invoke.py: -------------------------------------------------------------------------------- 1 | # The DAG object; we'll need this to instantiate a DAG 2 | from airflow import DAG 3 | # Operators; we need this to operate! 
4 | from airflow.operators.dummy_operator import DummyOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | # Datetime and other 7 | from datetime import datetime 8 | import boto3, json 9 | 10 | # Default arguments and can be overwritten at operator initialization 11 | default_args = { 12 | 'owner': 'Vijay', 13 | 'depends_on_past': False, 14 | 'start_date': datetime(2020,4,17), 15 | 'email': ['@gmail.com'], 16 | 'email_on_failure': False, 17 | 'email_on_retry': False, 18 | 'retries': 1, 19 | } 20 | 21 | # DAG initialization 22 | dag = DAG( 23 | 'invocation_lambda', 24 | default_args=default_args, 25 | description='invoke a lambda in dev aws instance' 26 | ) 27 | 28 | # Function 29 | 30 | def lambda1(ds,**kwargs): 31 | lambda_client = boto3.client('lambda', 32 | region_name='', 33 | aws_access_key_id='', 34 | aws_secret_access_key='') 35 | response_1 = lambda_client.invoke(FunctionName='myAirflowTest',InvocationType='RequestResponse') 36 | print ('Response--->' , response_1) 37 | 38 | # Task 39 | 40 | start = DummyOperator(task_id='Begin_execution', dag=dag) 41 | 42 | t1 = PythonOperator( 43 | task_id="lambda1", 44 | python_callable=lambda1, 45 | provide_context=True, 46 | dag=dag 47 | ) 48 | 49 | end = DummyOperator(task_id='stop_execution', dag=dag) 50 | 51 | start >> t1 >> end -------------------------------------------------------------------------------- /Blogs_Code/New Text Document.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Blogs_Code/New Text Document.txt -------------------------------------------------------------------------------- /Blogs_Code/record_transform.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | from datetime import datetime 4 | 5 | # Incoming Event 6 | def lambda_handler(event, context): 7 | output = [] 8 | output_record = {} 9 | 10 | # Loop through records in incoming Event 11 | for record in event['records']: 12 | 13 | # Extract message 14 | 15 | message = base64.b64decode(record['data']).decode("UTF-8") 16 | 17 | # Construct output and add to the list 18 | 19 | rec = json.loads(message) 20 | rec['l3'] = 19 21 | 22 | output_record = { 23 | 'recordId': record['recordId'], 24 | 'result': 'Ok', 25 | 'data': base64.b64encode(bytes(json.dumps(rec), 'utf-8'))} 26 | output.append(output_record) 27 | 28 | return {'records': output} 29 | -------------------------------------------------------------------------------- /Blogs_Code/sam_deploy.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Blogs_Code/sam_deploy.zip -------------------------------------------------------------------------------- /Blogs_Code/send_records.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import json 3 | from urllib.parse import unquote_plus 4 | import time 5 | import csv 6 | import base64 7 | 8 | s3 = boto3.client('s3') 9 | kinesis = boto3.client('kinesis') 10 | 11 | def lambda_handler(event, context): 12 | my_records = [] 13 | 14 | 15 | if event: 16 | # Read bucketname, filename from event record 17 | 18 | file_obj = event["Records"][0] 19 | bucketname = file_obj['s3']['bucket']['name'] 20 | filename = file_obj['s3']['object']['key'] 21 | 22 | # Get 
reference to the file in the S3 bucket
23 | 
24 |         fileObj = s3.get_object(Bucket=bucketname, Key=filename)
25 | 
26 |         # Read the file body and decode the bytes
27 | 
28 |         file_content = fileObj["Body"].read().decode('utf-8')
29 | 
30 |         # Split to extract each record into a list
31 | 
32 |         file_rows = file_content.split()
33 | 
34 |         # Prepare each record and add it to the record list
35 | 
36 |         for row in range(len(file_rows)):
37 |             pk = 'vjcool' + str(row)
38 |             record = {
39 |                 'Data': bytes(file_rows[row], 'utf-8'),
40 | 
41 |                 'PartitionKey': pk
42 |             }
43 |             my_records.append(record)
44 | 
45 | 
46 |         # Put the records to Kinesis
47 | 
48 |         kinesis.put_records(Records=my_records, StreamName='my_cc_data')
49 | 
50 | 
51 | 
52 | 
53 |     return "thanks"
--------------------------------------------------------------------------------
/Blogs_Code/win_feature.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Blogs_Code/win_feature.PNG
--------------------------------------------------------------------------------
/Contents/Dataset.MD:
--------------------------------------------------------------------------------
1 | # About My Dataset
2 | I used a credit card complaints dataset for my project. It has 18 columns and 80k rows. The columns are of Date, Text and Number types. The dataset is a CSV file.
3 | You can find the dataset here: [Credit Card Complaints]( https://data.world/dataquest/bank-and-credit-card-complaints)
4 | 
5 | Have a look at a quick view of the dataset:
6 | ![](https://github.com/vijaykothareddy/Data-Engineering/blob/master/Images/data_view.PNG)
7 | # Why I chose it
8 | When I started to pursue Data Engineering, I researched industries based on the jobs available on a local job portal. Banking and telecommunications are the top two sectors posting jobs locally. It seemed relevant to do a hands-on project related to these industries, and that helped me narrow down the dataset.
9 | # What I like in the dataset
10 | It is a good candidate for drawing insights and calculating statistics such as averages, total complaints grouped by location, and time series analysis.
11 | # What is problematic?
12 | One of the columns, 'Sub-Issue', doesn't have any values, so values need to be imputed manually. The imputation can be based on the context in which the dataset is used.
13 | E.g., in my case I collected the Sub-Issue types and populated the column randomly.
14 | # What I want to achieve
15 | I will use this data as a source for my data pipelines to address some problem statements:
16 | 1. Analytics with streaming data
--------------------------------------------------------------------------------
/Contents/Pipelines.MD:
--------------------------------------------------------------------------------
1 | * [Pipelines](#Pipelines)
2 |   - [Stream Processing](#stream-processing)
3 |   - [Processing Data Stream](#processing-data-stream)
4 |   - [Storing Data Stream](#storing-data-stream)
5 |   - [Batch Processing](#Batch-processing)
6 |   - [Visualization](#visualization)
7 | # Pipelines
8 | 
9 | Data analytics is derived from various data sources and intermediate transformations. A pipeline is built from *connect, storage, process, buffer and visualization* elements to achieve the organization's data goals.
10 | ## Stream Processing
11 | 
12 | Stream processing is analyzing data in real time.
Logs from application servers, click streams from websites, mobile apps and IoT devices let you learn about your applications, products and customers through streaming data solutions.
13 | 
14 | I created a pipeline to visualize the credit card complaints stream.
15 | 
16 | ![](https://github.com/vijaykothareddy/Data-Engineering/blob/master/Images/stream_processing.jpg)
17 | 
18 | Let me take you through the solution.
19 | 
20 | ### Connect
21 | 
22 | I created a Python script to manually upload a JSON file to an AWS S3 bucket, which acts as my connect element (the stream data source).
23 | 
24 | ### Processing Data Stream
25 | 
26 | An AWS Lambda function is invoked on the S3 bucket's object-created event. The file content is transferred as records to a Kinesis stream.
27 | 
28 | ### Buffer
29 | 
30 | Kinesis Data Streams acts as a buffer for the real-time data, and Kinesis Data Firehose is my consumer, which acts as a data transformation service and delivers the data to Amazon Redshift.
31 | 
32 | ### Storing Data Stream
33 | 
34 | The data is eventually stored in Redshift, which is connected to analytics tools.
35 | 
36 | 
37 | ## Batch Processing
38 | 
39 | To eliminate the overhead of normalization, the data is stored in DynamoDB. A daily batch process loads the data into a SQL database for visualization or predictive analytics.
40 | 
41 | ![](https://github.com/vijaykothareddy/Data-Engineering/blob/master/Images/daily_batch.jpg)
42 | 
43 | ## Visualization
44 | 
45 | Power BI Desktop has been configured with the AWS Redshift data source to answer some of the business's analytical queries.
46 | 
47 | ![](https://github.com/vijaykothareddy/Data-Engineering/blob/master/Images/visual.gif)
48 | 
--------------------------------------------------------------------------------
/Contents/Tools.MD:
--------------------------------------------------------------------------------
1 | * [Tools Used](#Tools-Used)
2 |   - [AWS](#AWS-products-used)
3 |   - [Docker](#Docker)
4 |   - [Apache Airflow](#Apache-Airflow)
5 |   - [Power BI](#Power-BI)
6 |   - [Coding](#Coding)
7 |   - [IDE](#IDE)
8 | * [How they work](#How-they-work)
9 |   - [Lambda and S3](#Lambda-and-s3)
10 |   - [Kinesis and Dynamodb](#Kinesis-and-Dynamodb)
11 |   - [EC2, RDS](#EC2,-RDS)
12 | * [Tools set up method](#Tools-set-up-methods)
13 | # Tools Used
14 | 
15 | My technical stack decision was based on the results of researching job postings. AWS skills were in demand in most of the job descriptions, and AWS is used widely around the world.
16 | ### AWS products used
17 | * Amazon S3
18 | * Amazon Kinesis Data Streams
19 | * AWS Lambda
20 | * Amazon DynamoDB
21 | * Amazon RDS
22 | * Amazon EC2
23 | 
24 | ### Docker
25 | Docker is becoming more popular for its easy-to-use image concept. You can get practically any kind of operating environment just by pulling a container image onto your host OS.
26 | 
27 | ### Apache Airflow
28 | To orchestrate the workflow of tasks for your pipeline, Apache Airflow lets you build Directed Acyclic Graphs (DAGs) of your tasks and their dependencies.
29 | 
30 | ### Power BI
31 | Microsoft Power BI is a visualization tool that integrates with a wide variety of data sources, and you can get the free desktop version to explore and develop visualizations.
32 | It is a great tool, with in-memory technology for transforming data with Power Query and applying the result to your dataset.
33 | 
34 | ### Coding
35 | The Python boto3 package integrates with all the AWS resources; additionally, you can package any class with your AWS Lambda processing environment.
36 | 
37 | 
38 | I use Python for data manipulation and to develop custom classes.
39 | SQL is used to interact with relational databases.
40 | 
41 | * Python
42 | * SQL
43 | ### IDE
44 | 
45 | There are many integrated development environments; I use VS Code for Python development, and you can also download the AWS Toolkit.
46 | The AWS Toolkit helps you connect to the cloud and interact with Lambda functions.
47 | * VS Code
48 | * Jupyter Notebook
49 | 
50 | # How they work
51 | 
52 | ### Lambda and S3
53 | 
54 | A Lambda function is invoked based on a file-creation event in S3.
55 | * [Head over to the blog post to see how these are implemented](https://www.teamdatascience.com/post/how-to-process-simple-data-stream-and-consume-with-lambda)
56 | 
57 | ### Kinesis and Dynamodb
58 | 
59 | Kinesis acts as a buffer for streaming data.
60 | * [Click the link to see how stream data is stored in a NoSQL store](https://www.teamdatascience.com/post/how-to-write-kinesis-data-stream-to-dynamodb)
61 | 
62 | ### EC2 and RDS
63 | 
64 | Amazon EC2 is used to spin up a virtual machine with your preferred operating system; you can either host your application on it or run multiple applications by pulling Docker images.
65 | 
66 | Amazon RDS is a managed database service where you can run relational database management systems like SQL Server, Oracle and MySQL.
67 | 
68 | # Tools set up methods
69 | 
70 | The AWS stack is configured using the Management Console.
71 | First, create an AWS account, which gives you the root user.
72 | Best practice is to create an IAM user, assign the required privileges, and use that IAM user for interacting with AWS resources.
73 | 
74 | * Python code to connect
75 | ```python
76 | import boto3
77 | s3 = boto3.resource(
78 |     's3',
79 |     region_name='ap-southeast-2',
80 |     aws_access_key_id=<>,
81 |     aws_secret_access_key=<>
82 | )
83 | ```
84 | 
85 |
You should mask AWS access keys before uploading scripts to the internet (blogs, Git repositories and so on); once exposed, anyone can access your cloud account.
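A safer pattern is to keep keys out of the source entirely and let boto3 resolve credentials from its standard chain. A minimal sketch, assuming credentials are available via environment variables, `~/.aws/credentials`, or an attached IAM role:

```python
import boto3

# No keys in the code: boto3 picks up credentials from environment variables,
# the shared credentials file (~/.aws/credentials), or the attached IAM role.
s3 = boto3.resource('s3', region_name='ap-southeast-2')

# Quick check that the credentials work: list the buckets in the account.
for bucket in s3.buckets.all():
    print(bucket.name)
```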
86 |
87 | 
88 | 
--------------------------------------------------------------------------------
/Contents/Tools_Detail.MD:
--------------------------------------------------------------------------------
1 | * [Connect](#Connect)
2 | * Buffer
3 | * Processing
4 | * [Storage](#Storage)
5 | * Visualization
6 | 
7 | 
8 | 
9 | 
10 | ## Connect
11 | ## Buffer
12 | ## Processing
13 | ## Storage
14 | 
15 | The storage solution depends on the purpose. I stored my CSV data sets on AWS S3, which in turn acts as the data source for my pipeline.
16 | 
17 | Down the line, I write each row of the CSV data to a NoSQL database called DynamoDB.
18 | 
19 | ### Security
20 | 
21 | To use DynamoDB programmatically you need *Access Keys*.
22 | 
23 | 
24 | Access Key = Access Key ID + Secret Access Key
25 | 
26 | The IAM user must have the application-autoscaling permission, which is provided by the AWS managed policy AmazonDynamoDBFullAccess.
27 | ## Visualization
--------------------------------------------------------------------------------
/Images/RBAC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/RBAC.png
--------------------------------------------------------------------------------
/Images/daily_batch.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/daily_batch.jpg
--------------------------------------------------------------------------------
/Images/data_view.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/data_view.PNG
--------------------------------------------------------------------------------
/Images/dtree.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/dtree.PNG
--------------------------------------------------------------------------------
/Images/packagetree.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/packagetree.PNG
--------------------------------------------------------------------------------
/Images/readme.MD:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/Images/stream_processing.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/stream_processing.jpg
--------------------------------------------------------------------------------
/Images/testresult.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/testresult.PNG
--------------------------------------------------------------------------------
/Images/tools_used.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/tools_used.jpg
--------------------------------------------------------------------------------
/Images/ubuntu.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/ubuntu.PNG
--------------------------------------------------------------------------------
/Images/ui1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/ui1.png
--------------------------------------------------------------------------------
/Images/ui2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/ui2.png
--------------------------------------------------------------------------------
/Images/ui3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/ui3.png
--------------------------------------------------------------------------------
/Images/visual.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/visual.gif
--------------------------------------------------------------------------------
/Images/win_feature.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vijaykothareddy/Data-Engineering/55bffc34b1359204320d288c4978be5796302da4/Images/win_feature.PNG
--------------------------------------------------------------------------------
/Pipelines:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Engineering with AWS
2 | Data pipelines with AWS
3 | 
4 | # Introduction & Goals
5 | 
6 | I have been using an on-premises IT infrastructure platform for storing and processing data.
7 | 
8 | To address real-time analytics with big data, I utilized **AWS serverless** architecture to design and develop the data pipeline.
9 | 
10 | This project took the general [blueprint](https://github.com/andkret/Cookbook/blob/master/sections/01-Introduction.md#my-data-science-platform-blueprint) as a reference to
11 | identify the tools required at each stage of pipeline design.
12 | 
13 | This project is mainly classified into three phases:
14 | 
15 | **Initial phase:** Data set selection, based on industry research.
16 | **Second phase:** Designing and developing the pipeline with the tools.
17 | **Third phase:** Documentation for the project and grouping the code used in the various phases of development.
18 | 
19 | #### About my data set
20 | 
21 | I conducted research to choose the type of industry, and banking was my preference.
22 | 
23 | Refer to my
24 | [blog post](https://www.teamdatascience.com/post/dba-focus-to-work-as-data-engineer)
25 | for a quick idea when you are in a dilemma over data set selection.
26 | 
27 | 
28 | I selected credit card complaints data, and the selection process is explained in my [blog](https://www.teamdatascience.com/post/data-sets)
29 | 
30 | #### Tools that I used
31 | 
32 | A quick snapshot of the tools used in this project is captured below:
33 | 
34 | 
35 | ![](https://github.com/vijaykothareddy/Data-Engineering/blob/master/Images/tools_used.jpg)
36 | 
37 | *Amazon Web Services, Docker for Amazon Machine Images, Airflow on Ubuntu, Python IDE and BI tools*
38 | #### What I did
39 | 
40 | I utilized managed services provided by Amazon Web Services, which offer greater capability at scale and optimal performance.
41 | 
42 | The data pipelines in this project use **Lambda** as the processing environment. To handle streaming data, **Kinesis** comes to the rescue and acts as a buffer too.
43 | 
44 | Eventually, data is stored in **Redshift, DynamoDB or S3 type** storage solutions.
45 | 
46 | Visualization tools or analytics products are connected to these data sources and used for data analytics.
47 | 
48 | #### Conclusion
49 | 
50 | This project has been designed assuming a huge data size, so that the cloud resources scale economically.
51 | 
52 | 
53 | # Contents
54 | 
55 | - [The Data Set](#the-data-set)
56 | - [Used Tools](#used-tools)
57 | - [Pipelines](#pipelines)
58 | - [Demo](#demo)
59 | - [Conclusion](#conclusion)
60 | - [Follow Me On](#follow-me-on)
61 | - [Appendix](#appendix)
62 | 
63 | 
64 | # The Data Set
65 | - [About my Data Set](Contents/Dataset.MD)
66 | - [Why did I choose it?](Contents/Dataset.MD)
67 | - [What do you like about it?](Contents/Dataset.MD)
68 | - [What is problematic?](Contents/Dataset.MD)
69 | - [What do you want to do with it?](Contents/Dataset.MD)
70 | 
71 | # Used Tools
72 | - [Tools I used](Contents/Tools.MD)
73 | - [How do they work](Contents/Tools.MD)
74 | - [Why I chose them](Contents/Tools.MD)
75 | - [Tools Setup](Contents/Tools.MD)
76 | 
77 | # Pipelines
78 | - [Stream Processing](Contents/Pipelines.MD)
79 | - [Storing Data Stream](Contents/Pipelines.MD)
80 | - [Processing Data Stream](Contents/Pipelines.MD)
81 | - [Batch Processing](Contents/Pipelines.MD)
82 | - [Visualizations](Contents/Pipelines.MD)
83 | 
84 | # Demo
85 | - You could add a demo video here
86 | - Or link to your presentation video of the project
87 | 
88 | # Conclusion
89 | Write a comprehensive conclusion.
90 | - I created an end-to-end data pipeline to deliver real-time analytics, with data available to BI systems with latency in seconds.
91 | 
92 | 
93 | #### Things I learned
94 | I learnt about:
95 | - Different AWS services like Lambda, Redshift, DynamoDB and others
96 | - User roles and policies
97 | - How to encode and decode data while dealing with multiple AWS services
98 | - How to integrate services with Python using boto3
99 | - How the scaling of resources plays a major role in planning
100 | #### Challenges that I Faced
101 | 
102 | ##### NoSQL
103 | I picked DynamoDB as one of my storage options. Its data modelling is quite the opposite of traditional RDBMS modelling; Amazon re:Invent videos and blogs helped me understand DynamoDB.
104 | ##### Python Package Deployment
105 | To use any Python package other than boto3, the required libraries need to be packaged with the Lambda function. Figuring out the right way to deploy Python packages is a challenging task.
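One common way to package such dependencies is sketched below. This is not necessarily the exact method used in this project; `requests`, `lambda_function.py` and the function name are placeholders, and the commands assume the AWS CLI is configured. The idea is to install the dependencies into a local folder, zip them together with the handler at the top level of the archive, and upload that archive as the function code:

```console
(venv) $ pip install requests -t ./package          # install the dependency into a local folder
(venv) $ cp lambda_function.py ./package/           # add the handler code next to its dependencies
(venv) $ cd package && zip -r ../deployment.zip .   # zip everything at the root of the archive
(venv) $ aws lambda update-function-code --function-name my_lambda_function --zip-file fileb://../deployment.zip
```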
106 | 107 | # Follow Me On 108 | 109 | 110 | [My Linkedin profile](https://www.linkedin.com/in/kvbr/) 111 | 112 | # Appendix 113 | 114 | [Markdown Cheat Sheet](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) 115 | -------------------------------------------------------------------------------- /README.md.template.txt: -------------------------------------------------------------------------------- 1 | 2 | ------------------------------------------------------------------ 3 | !!! Rename this file to README.md and upload it to your GitHub !!! 4 | ------------------------------------------------------------------ 5 | 6 | # Title Of Your Project 7 | Add a catchy title to your project. Something that people immediately know what you are doing 8 | 9 | # Introduction & Goals 10 | - Introduce your project to the reader 11 | - Orient this section on the Table of contents 12 | - Write this like an executive summary 13 | - With what data are you working 14 | - What tools are you using 15 | - What are you doing with these tools 16 | - Once you are finished add the conclusion here as well 17 | 18 | # Contents 19 | 20 | - [The Data Set](#the-data-set) 21 | - [Used Tools](#used-tools) 22 | - [Connect](#connect) 23 | - [Buffer](#buffer) 24 | - [Processing](#processing) 25 | - [Storage](#storage) 26 | - [Visualization](#visualization) 27 | - [Pipelines](#pipelines) 28 | - [Stream Processing](#stream-processing) 29 | - [Storing Data Stream](#storing-data-stream) 30 | - [Processing Data Stream](#processing-data-stream) 31 | - [Batch Processing](#batch-processing) 32 | - [Visualizations](#visualizations) 33 | - [Demo](#demo) 34 | - [Conclusion](#conclusion) 35 | - [Follow Me On](#follow-me-on) 36 | - [Appendix](#appendix) 37 | 38 | 39 | # The Data Set 40 | - Explain the data set 41 | - Why did you choose it? 42 | - What do you like about it? 43 | - What is problematic? 44 | - What do you want to do with it? 45 | 46 | # Used Tools 47 | - Explain which tools do you use and why 48 | - How do they work (don't go too deep into details, but add links) 49 | - Why did you choose them 50 | - How did you set them up 51 | 52 | ## Connect 53 | ## Buffer 54 | ## Processing 55 | ## Storage 56 | ## Visualization 57 | 58 | # Pipelines 59 | - Explain the pipelines for processing that you are building 60 | - Go through your development and add your source code 61 | 62 | ## Stream Processing 63 | ### Storing Data Stream 64 | ### Processing Data Stream 65 | ## Batch Processing 66 | ## Visualizations 67 | 68 | # Demo 69 | - You could add a demo video here 70 | - Or link to your presentation video of the project 71 | 72 | # Conclusion 73 | Write a comprehensive conclusion. 
74 | - How did this project turn out 75 | - What major things have you learned 76 | - What were the biggest challenges 77 | 78 | # Follow Me On 79 | Add the link to your LinkedIn Profile 80 | 81 | # Appendix 82 | 83 | [Markdown Cheat Sheet](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) 84 | -------------------------------------------------------------------------------- /dynamo.py: -------------------------------------------------------------------------------- 1 | 2 | import boto3 3 | import json 4 | import base64 5 | 6 | # Accessing the resource for writing stream data 7 | s3 = boto3.resource( 8 | 's3', 9 | region_name='<>', 10 | aws_access_key_id='<>', 11 | aws_secret_access_key='<>' 12 | ) 13 | dynamodb = boto3.resource('dynamodb', 14 | region_name='<>', 15 | aws_access_key_id='<>', 16 | aws_secret_access_key='<>') 17 | 18 | table = dynamodb.Table('cc_complaint') 19 | ev_lst = [] 20 | payload = b'low' 21 | 22 | def lambda_handler(event, context): 23 | #print(event) 24 | for record in event['Records']: 25 | 26 | # Kinesis data is base64 encoded so decode here 27 | 28 | payload=base64.b64decode(record["kinesis"]["data"]) 29 | 30 | 31 | # Writing data to Dynamo 32 | 33 | main_lst = payload.decode().split('\r\n') 34 | 35 | # Assign first index value as keys 36 | keys = main_lst[0].split(',') 37 | #print(len(main_lst)) 38 | #print(main_lst) 39 | # Loop through the records in uploaded CSV file from index 1 40 | for j in range(1, len(main_lst)-1): 41 | 42 | # Split each list 43 | val = main_lst[j].split(',') 44 | arr ={} 45 | print("lenth of values:",len(val)) 46 | print("length ok keys:",len(keys)) 47 | # Loop through keys to create item 48 | for i in range(len(val)): 49 | key = keys[i] 50 | value = val[i] 51 | if value is None: 52 | value = "5" 53 | if key == 'Complain_ID': 54 | print('yes complaint id') 55 | arr[key] = int(value) 56 | else: 57 | arr[key] = value 58 | print(arr) 59 | table.put_item(Item = arr) 60 | #s3.Object('kotharv-target', 'test.txt').put(Body=payload) 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /kinesis_to_s3.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import base64 3 | 4 | # Accessing the resource for writing stream data 5 | s3 = boto3.resource( 6 | 's3', 7 | region_name='ap-southeast-2', 8 | aws_access_key_id=<>, 9 | aws_secret_access_key=<> 10 | ) 11 | ev_lst = [] 12 | def lambda_handler(event, context): 13 | 14 | for record in event['Records']: 15 | 16 | #Kinesis data is base64 encoded so decode here 17 | payload=base64.b64decode(record["kinesis"]["data"]) 18 | ev_lst.append(payload) 19 | 20 | # writing data to S3 21 | s3.Object('kotharv-target', 'test.txt').put(Body=str(ev_lst)) 22 | -------------------------------------------------------------------------------- /s3_to_kinesis.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import json 3 | #from urllib.parse import unquote_plus 4 | import time 5 | import csv 6 | 7 | s3 = boto3.client('s3') 8 | kinesis = boto3.client('kinesis') 9 | 10 | def lambda_handler(event, context): 11 | if event: 12 | # Read bucketname, filename from event record 13 | file_obj = event["Records"][0] 14 | bucketname = file_obj['s3']['bucket']['name'] 15 | filename = file_obj['s3']['object']['key'] 16 | 17 | # Get referenc to file on s3 bucket 18 | fileObj = s3.get_object(Bucket=bucketname, Key=filename) 19 | 20 | # Convert the data in file 21 | 
file_content = fileObj["Body"].read().decode('utf-8')
22 | 
23 |         # Put the record to Kinesis
24 | 
25 |         kinesis.put_record(Data=bytes(file_content, 'utf-8'), StreamName='my_fake_stream', PartitionKey='vjfirst')
26 | 
27 | 
28 |     return "thanks"
--------------------------------------------------------------------------------
/web_scrape_jobs.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | link = 'https://www.seek.com.au/data-engineer-jobs-in-information-communication-technology'
4 | h_class = '_3MPUOLE'
5 | l_class = '_2e4Pi2B'
6 | base_url = 'https://www.seek.com.au'
7 | 
8 | # Extracts data by parsing the given HTML class
9 | def scrape_data(URL, cls):
10 |     page = requests.get(URL)
11 |     soup = BeautifulSoup(page.content, 'html.parser')
12 |     results = soup.find(class_=cls)
13 |     return results
14 | 
15 | # Extract the skill list from a job ad
16 | def skill_list(URL, cls):
17 |     page = requests.get(URL)
18 |     soup = BeautifulSoup(page.content, 'html.parser')
19 |     results = soup.find(class_=cls)
20 |     # Loop over every <ul> block in the job ad and print its text
21 |     for key in results.find_all('ul'):
22 |         print(key.text)
23 | 
24 | # Extracts the job title, location, company and skills by passing the different HTML classes
25 | 
26 | def extract_from_data(cls1, cls2, cls3):
27 |     results = scrape_data(link, h_class)
28 |     for result in results:
29 |         # Each job_elem is a new BeautifulSoup object.
30 |         # You can use the same methods on it as you did before.
31 |         title_elem = result.find('a', class_=cls1)
32 |         location_elem = result.find('strong', class_=cls2)
33 |         company_elem = result.find('a', class_=cls3)
34 |         job_link = base_url + title_elem['href']
35 |         print(title_elem.text.strip())
36 |         print(location_elem.text.strip())
37 |         print(company_elem.text.strip())
38 |         skill_list(job_link, l_class)
39 |         print()
40 | 
41 | extract_from_data('_2iNL7wI', 'lwHBT6d', '_3AMdmRg')
--------------------------------------------------------------------------------