├── .idea ├── .gitignore ├── Python.iml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── 01 - Data Engineering Basics ├── Bigdata - Hadoop │ ├── 01 - Big data Introduction.md │ ├── 02 - Introduction to Hadoop.md │ ├── 03 - HDFS and Architecture.md │ ├── 04 - Big Data & HDFS Terms.md │ ├── 05 - Basic HDFS commands.md │ ├── 06 - What is Mapreduce.md │ └── README.md ├── Cloud Computing │ └── README.md ├── ETL-DWH-Datalake │ ├── 00 - Terms used in Data World.md │ ├── 01 - What is ETL Vs ELT.md │ ├── 02 - Database Vs Datawarehouse.md │ ├── 03 - Datawarehouse Vs Datalake.md │ ├── 04 - What is SCD.md │ ├── 05 - What is ACID properties.md │ ├── 06 - Difference between Snowflake and Databricks │ ├── 07 - CAP Theorem.md │ └── README.md ├── Hive │ ├── 01 - Hive Architecture.md │ ├── 02 - What is metastore.md │ ├── 03 - Hive Schema on Read.md │ ├── 04 - Hive Objects.md │ ├── 05 - Hive Datatypes.md │ ├── 06 - Hive Datamodel.md │ ├── 07 - Hive Commands Basics.md │ └── README.md ├── Python │ ├── 01_indentation.py │ ├── 02_userinput.py │ ├── 03_userinput.py │ ├── 04_userinput.py │ ├── 05_eval.py │ ├── 06_argument.py │ ├── 07_A_operators.py │ ├── 07_basic_datatypes.py │ ├── 08_sequence_datatypes.py │ ├── 09_other_datatypes.py │ ├── 10_Looping.py │ ├── 11_list_comprehension.py │ ├── 12_Iterator.py │ ├── 13_generator.py │ ├── 14_fizzbuzz.py │ ├── 15A_class_variables.py │ ├── 15B_class_varaibles.py │ ├── 15C_class_variables.py │ ├── 15D_class_methods.py │ ├── 15E_class_methods.py │ ├── 15F_class_methods.py │ ├── 15_class.py │ ├── 16_Inheritance.py │ ├── 17A_Polymorphism.py │ ├── 17_Polymorphism.py │ ├── 18_Encapsulation.py │ ├── 19_exception.py │ ├── 20A_pandas.py │ ├── 20_pandas_csv.py │ ├── 21A_semilog_plot.py │ ├── 21B_Barchart_plot.py │ ├── 21C_PieChart_plot.py │ ├── 21D_Piechart_explode.py │ ├── 21E_scatter_plot.py │ ├── 21F_Piechart_csv.py │ ├── 22_selenium.py │ ├── 23_DB.py │ ├── Calculator.py │ ├── input.csv │ └── operation.py ├── Shell Scripting │ ├── 01 - Linux commands.md │ └── README.md ├── Spark │ └── README.md ├── Sql │ └── README.md └── Sqoop │ ├── 01 - What is Sqoop.md │ ├── 02 - How Sqoop works.md │ ├── 03 - Sqoop import.md │ ├── 04 - Sqoop incremental load.md │ ├── 05 - Sqoop export.md │ ├── 06 - Sqoop compression.md │ ├── 07 - Sqoop job.md │ └── README.md ├── 02 - Data Engineering Intermediate ├── Airflow │ └── README.md ├── Hbase │ ├── 01 Hbase Datamodel .md │ ├── 02 Hbase commands.md │ ├── 03 Phoenix Overview.md │ ├── 04 Phoenix commands.md │ └── README.md ├── Hive │ └── README.md ├── Kafka │ ├── 01 What is messaging.md │ ├── 02 Kafka Overview.md │ ├── 03 Kafka Architecture.md │ └── README.md ├── Modelling │ └── README.md ├── Spark │ └── README.md ├── Tableau │ └── README.md └── Teradata │ └── README.md ├── 03 - Data Engineering Advanced ├── AWS │ ├── 00 AWS Important Services.md │ ├── 01 AWS Compute.md │ ├── 02 AWS Storage Services.md │ ├── 03 - AWS database services.md │ └── README.md ├── Azure │ └── README.md ├── CICD │ ├── Git │ │ └── README.md │ ├── Jenkins │ │ └── README.md │ └── README.md ├── DSA │ └── README.md ├── Databricks │ └── README.md ├── Design Principles │ └── README.md ├── GCP │ └── README.md ├── NOSQL │ ├── CosmoDB │ │ └── README.md │ ├── DynamoDB │ │ └── README.md │ └── README.md ├── Orchestration │ └── README.md ├── Performance Tuning │ └── README.md ├── Snowflake │ └── README.md └── Streaming │ └── README.md ├── 04 - Data Engineering Projects └── README.md ├── 05 - Interview Questions ├── 01 - Interview Questions.md ├── 
02- Interview questions.md └── README.md ├── 06 - Productivity Tips └── README.md ├── 07 - DE Best Linkedin Posts Links ├── 01 Best Linkedin Post for DE.md └── README.md ├── 08 - DE Best Youtube Channels Links ├── 01 - Youtube Links.md └── README.md ├── 09 - DE Best Books Lists & summary ├── 01 Best Books Lists for DE.md └── README.md ├── 10 - Data Engineering Certifications └── README.md ├── 11 - DE Best Articles Links ├── 01 AWS Reference.md └── README.md ├── 12 - DE Important Reference Documents ├── Azure data lake.pdf ├── DSA.pdf └── Database design concepts.pdf └── README.md /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /workspace.xml -------------------------------------------------------------------------------- /.idea/Python.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Bigdata - Hadoop/01 - Big data Introduction.md: -------------------------------------------------------------------------------- 1 | **Why BigData?** 2 | 3 | We can't process huge amount of data classified under above 4V'S(Volume, Velocity, Variety and Veracity) in traditional systems 4 | To process the data first we need to find a way to store them. 5 | Where do we store such huge amount of data? 6 | Can we store and process 1 TB of data if I have storage capacity of 500 GB in single machine? No 7 | But same 1 TB of can be stored on 10 machines(100 GB each) and process subsequently. This is nothing but distributed system. 8 | 9 | Before Big Data Era - It is storage and just process in a single machine/server 10 | After Big Data Era - It is distributed storage and distributed processing in cluster(group of machine) 11 | 12 | Traditional way of scaling is vertical scaling. Eg: Any Relational Database 13 | 14 | For the distributed storage and processing - horizontal scaling(True scaling) 15 | 16 | In horizontal scaling, the number of resources increased directly result in increasing the performance. 
17 | 18 | **Big Data Requirements:** 19 | 20 | Store --> Process --> Scale 21 | Store - store massive amounts of data 22 | Process - process it in a timely manner 23 | Scale - scale easily as data grows 24 | 25 | **Scalability:** 26 | There are two ways to build a system: 27 | Monolithic - A powerful system with a lot of resources 28 | Distributed - Many smaller systems (nodes) come together 29 | Each system is a node, and together they form a cluster 30 | 31 | Monolithic: 32 | A single powerful server 33 | Hard to add resources after a certain limit 34 | 35 | Resources means - 36 | RAM - 8 GB (Memory) 37 | Hard Disk - 1 TB (Storage) 38 | CPU - Quad core (Compute) 39 | 40 | Monolithic Architecture - Vertical Scaling (no true scaling) 41 | Distributed Architecture - Horizontal Scaling (true scaling) 42 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Bigdata - Hadoop/02 - Introduction to Hadoop.md: -------------------------------------------------------------------------------- 1 | **What is Hadoop?** 2 | 3 | Hadoop is an open-source software framework written in Java for distributed storage and distributed processing of very large datasets on a group of machines. 4 | It's a framework for solving big data problems. 5 | 6 | **Hadoop Versions:** 7 | 8 | 1.0 - Hadoop Distributed File System (HDFS), MapReduce (MR) 9 | 2.0 - HDFS, MapReduce, YARN (Yet Another Resource Negotiator) for resource management 10 | 3.0 - Current version, with the above components 11 | 12 | **Hadoop Basic components:** 13 | 14 | Storage --> Hadoop Distributed File System (HDFS) – a distributed file system that stores 15 | data on commodity machines, providing very high aggregate bandwidth across the cluster; 16 | Processing --> MapReduce – an implementation of the MapReduce programming 17 | model for large-scale data processing 18 | Resource Manager --> Hadoop YARN – a resource-management platform responsible for managing 19 | computing resources in clusters and using them for scheduling users' applications 20 | 21 | **Hadoop Ecosystem:** 22 | 23 | Hive - Data warehouse tool for data analysis. 24 | Useful if you don't know Java but know SQL: you write SQL, which is converted into MapReduce internally and submitted to the cluster. 25 | Sqoop - CLI that transfers data between relational databases (e.g., Oracle) and Hadoop 26 | Pig - Scripting language for data manipulation (cleaning data) and converting unstructured data into structured form. 27 | HBase - NoSQL database that runs on top of HDFS. 28 | Oozie - Scheduler to manage jobs.
29 | Spark - Distributed, general-purpose in-memory compute engine 30 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Bigdata - Hadoop/03 - HDFS and Architecture.md: -------------------------------------------------------------------------------- 1 | **HDFS Architecture** 2 | 3 | ![image](https://user-images.githubusercontent.com/42135673/232320089-dc36a589-4a02-4efd-babe-3786b81d5f06.png) 4 | 5 | Name Node - Master 6 | 7 | Data Nodes - Slaves where data is stored 8 | 9 | Blocks - Breaking down a file into multiple parts as per the block size 10 | 11 | Block Size - 128 MB (with this size, each file is broken down into multiple chunks, i.e. blocks) 12 | 13 | Meta Data - The Namenode stores information about each block's location on the data nodes 14 | 15 | Data Nodes are made of commodity hardware and Name Nodes are made of high-quality hardware 16 | 17 | Datanodes store the actual data in blocks and perform the actual processing 18 | 19 | Replication - Copying the individual blocks into different racks. 20 | 21 | Racks - Group of machines in the same network or geography 22 | 23 | Client (laptop) submits a job/request --> Request goes to the Namenode --> Namenode sends back metadata info --> Client reads data from the blocks on the data nodes based on the metadata 24 | 25 | 26 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Bigdata - Hadoop/04 - Big Data & HDFS Terms.md: -------------------------------------------------------------------------------- 1 | Semi-structured data - JSON 2 | 3 | Unstructured data - Logs, Images, Audio & Video 4 | 5 | Monolithic - A powerful system with a lot of resources. Hard to add resources after a certain limit 6 | 7 | Distributed - Many smaller systems grouped together. Each system is a node and the entire group is a cluster 8 | 9 | Horizontal Scaling - Adding more machines (nodes) to the cluster (true scaling) 10 | Vertical Scaling - Adding more resources to a single machine (no true scaling) 11 | 12 | Resources means - RAM - 16 GB (Memory), Hard Disk - 2 TB (Storage) & CPU - Quad core (Compute) 13 | 14 | Distributed Storage - HDFS 15 | Distributed Processing - Spark 16 | Resource Management - YARN 17 | 18 | Data Ingestion - Sqoop transfers data between relational databases and Hadoop 19 | 20 | Data Query/Analysis - Hive, a data warehouse tool for data analysis 21 | 22 | Name Node - Master 23 | 24 | Data Nodes - Slaves where data is stored 25 | 26 | Blocks - Breaking down a file into multiple parts as per the block size 27 | 28 | Block Size - 128 MB (with this size, each file is broken down into multiple chunks, i.e. blocks) 29 | 30 | Meta Data - The Namenode stores information about each block's location on the data nodes 31 | 32 | Parity blocks - used by erasure coding in Hadoop 3 to reduce the storage overhead of replication 33 | 34 | Data Nodes are made of commodity hardware 35 | 36 | However, Name Nodes are made of high-quality hardware 37 | 38 | Failure Management: 39 | 40 | Data Node: 41 | 42 | Fault Tolerance - Replication factor (default 3, can be changed). The Name node will create one more copy to maintain the replication factor 43 | when any data node goes down, with the help of the Heartbeat signal. 44 | 45 | NameNode: 46 | SPOF - Single Point of Failure.
In the current version of Hadoop, the Namenode is no longer a SPOF 47 | 48 | fsimage - snapshot of the in-memory file system metadata at a given time 49 | 50 | edit logs - all new changes made after the above snapshot was taken 51 | 52 | Checkpointing - Merging the fsimage and edit logs to keep the fsimage updated 53 | 54 | Who does checkpointing - the Secondary Name Node 55 | 56 | Client (laptop) submits a job/request --> Request goes to the Namenode --> Namenode sends back metadata info --> 57 | Client reads data from the blocks on the data nodes based on the metadata 58 | 59 | Rack Awareness Mechanisms: 60 | The balanced approach is to place replicas in two different racks. 61 | 62 | One replica in one rack and the other two in a different rack, or vice versa 63 | 64 | Block Report 65 | Each data node sends a block report to the name node at a fixed frequency, indicating if any blocks are corrupted 66 | 67 | Name Node High Availability: 68 | 69 | The active name node sends the edit logs to the journal nodes, and the journal nodes keep updating the passive name node with the latest edit logs. 70 | 71 | When the active name node goes down, the passive name node takes over. 72 | 73 | The journal node quorum can tolerate a failure of (N-1)/2 nodes. 74 | 75 | For example, with 5 nodes: (5-1)/2 = 2, so 2 nodes can fail 76 | 77 | In the latest version of Hadoop, we can have more than one name node. 78 | 79 | Name node Scalability: 80 | 81 | We previously discussed high availability with active/passive name nodes, but that is for availability and not for sharing the load. 82 | 83 | So, we can have multiple name nodes which distribute or share the load. 84 | 85 | Different name nodes can handle different namespaces 86 | 87 | Name Node Federation: 88 | 89 | This concept of scaling the name node by dividing the load among name nodes is called Name node federation 90 | 91 | Gateway/Edge node --> Where the user logs in and accesses the cluster 92 | 93 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Bigdata - Hadoop/05 - Basic HDFS commands.md: -------------------------------------------------------------------------------- 1 | 2 | List the files in a directory: 3 | 4 | hadoop fs -ls / 5 | 6 | Create a directory: 7 | 8 | hadoop fs -mkdir /user/jaga/ 9 | 10 | Create parent directories if they don't exist: 11 | 12 | hadoop fs -mkdir -p /user/jaga/data 13 | 14 | Delete a file: 15 | 16 | hadoop fs -rm /user/jaga/data/file1.txt 17 | 18 | Delete the files from a directory recursively: 19 | 20 | hadoop fs -rm -R /user/jaga/data 21 | 22 | Copy a file from the local system to HDFS: 23 | 24 | hadoop fs -copyFromLocal Desktop/file1.txt /user/jaga/data 25 | 26 | Copy a file from HDFS to the local system: 27 | 28 | hadoop fs -copyToLocal 29 | 30 | Copy a file from an HDFS source to a target directory: 31 | 32 | hadoop fs -cp 33 | 34 | Move a file from an HDFS source to a target directory: 35 | 36 | hadoop fs -mv 37 | 38 | Check the disk usage in HDFS: 39 | 40 | hadoop fs -df -h /user/cloudera --> disk free 41 | 42 | hadoop fs -du -h /user/cloudera --> disk usage 43 | 44 | hadoop fs -du -s -h /user/cloudera --> summary 45 | 46 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Bigdata - Hadoop/06 - What is Mapreduce.md: -------------------------------------------------------------------------------- 1 | **The MapReduce Framework** 2 | 3 | MapReduce processes large data files by translating the overall workload into a directed acyclic graph (DAG) and 4 | by distributing the processing code to the data nodes, which transform (map) and subsequently merge the result
data (reduce). 5 | 6 | 7 | ![image](https://user-images.githubusercontent.com/42135673/232807674-838a1253-d407-4b49-aa47-a7567b25ff22.png) 8 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Bigdata - Hadoop/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/01 - Data Engineering Basics/Bigdata - Hadoop/README.md -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Cloud Computing/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/01 - Data Engineering Basics/Cloud Computing/README.md -------------------------------------------------------------------------------- /01 - Data Engineering Basics/ETL-DWH-Datalake/00 - Terms used in Data World.md: -------------------------------------------------------------------------------- 1 | A Data Warehouse works as a central repository where information arrives from one or more data sources. Data flows into a data warehouse from the transactional system and other relational databases. 2 | 3 | Data may be: 4 | 5 | 1. Structured 6 | 2. Semi-structured 7 | 3. Unstructured 8 | 9 | The data is processed, transformed, and ingested so that users can access the processed data in the Data Warehouse through Business Intelligence tools, SQL clients, and spreadsheets. A data warehouse merges information coming from different sources into one comprehensive database. 10 | 11 | Types of Data Warehouse 12 | 13 | The three main types of Data Warehouses are: 14 | 15 | 1. Enterprise Data Warehouse: 16 | 17 | An Enterprise Data Warehouse is a centralized warehouse. It provides decision support services across the enterprise. It offers a unified approach for organizing and representing data. It also provides the ability to classify data according to subject and give access according to those divisions. 18 | 19 | 2. Operational Data Store: 20 | An Operational Data Store, also called an ODS, is a data store required when neither the Data warehouse nor the OLTP systems support an organization's reporting needs. An ODS is refreshed in real time. Hence, it is widely preferred for routine activities like storing records of employees. 21 | 22 | 3. Data Mart: 23 | A data mart is a subset of the data warehouse. It is specially designed for a particular line of business, such as sales or finance. In an independent data mart, data can be collected directly from the sources. 24 | 25 | General stages of Data Warehouse 26 | 27 | Earlier, organizations started with relatively simple use of data warehousing. However, over time, more sophisticated use of data warehousing began. 28 | 29 | The following are the general stages of data warehouse use: 30 | 31 | Offline Operational Database: 32 | In this stage, data is just copied from an operational system to another server. In this way, loading, processing, and reporting of the copied data do not impact the operational system's performance. 33 | 34 | Offline Data Warehouse: 35 | Data in the Data warehouse is regularly updated from the Operational Database. The data in the Data warehouse is mapped and transformed to meet the Data warehouse objectives.
36 | 37 | Real time Data Warehouse: 38 | In this stage, Data warehouses are updated whenever any transaction takes place in operational database. For example, Airline or railway booking system. 39 | 40 | Integrated Data Warehouse: 41 | In this stage, Data Warehouses are updated continuously when the operational system performs a transaction. The Datawarehouse then generates transactions which are passed back to the operational system. 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/ETL-DWH-Datalake/01 - What is ETL Vs ELT.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/ETL-DWH-Datalake/02 - Database Vs Datawarehouse.md: -------------------------------------------------------------------------------- 1 | ![image](https://user-images.githubusercontent.com/42135673/232536957-88e42c58-e95f-4f08-bed9-de73d144fc07.png) 2 | 3 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/ETL-DWH-Datalake/03 - Datawarehouse Vs Datalake.md: -------------------------------------------------------------------------------- 1 | ![image](https://user-images.githubusercontent.com/42135673/232546049-860a7448-266e-4658-ab83-105f1e60ce56.png) 2 | 3 | ![image](https://user-images.githubusercontent.com/42135673/232546467-8e95ab8a-0d39-4ca1-b2a1-81c3a54b38a8.png) 4 | 5 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/ETL-DWH-Datalake/04 - What is SCD.md: -------------------------------------------------------------------------------- 1 | What are slowly changing dimensions? 2 | 3 | When organising a datawarehouse into Kimball-style star schemas, you relate fact records to a specific dimension record with its related attributes. But what if the information in the dimension changes? Do you now associate all fact records with the new value? Do you ignore the change to keep historical accuracy? Or do you treat facts before the dimension change differently to those after? 4 | It is this decision that determines whether to make your dimension a slowly changing one. There are several different types of SCD depending on how you treat incoming change. 5 | 6 | We have a very simple ‘customer’ dimension, with just 2 attributes – Customer Name and Country: 7 | 8 | ![image](https://user-images.githubusercontent.com/42135673/232538232-81168a0f-bfc5-422d-ae4a-b438305f829e.png) 9 | 10 | 11 | • Type 0 – Fixed Dimension 12 | No changes allowed, dimension never changes 13 | • Type 1 – No History 14 | Update record directly, there is no record of historical values, only current state 15 | 16 | ![image](https://user-images.githubusercontent.com/42135673/232538298-9ffe306e-22c4-48be-951d-cabbaee18c4f.png) 17 | 18 | 19 | • Type 2 – Row Versioning 20 | Track changes as version records with current flag & active dates and other metadata 21 | In order to support type 2 changes, we need to add four columns to our table: 22 | · Surrogate Key – the original ID will no longer be sufficient to identify the specific record we require, we therefore need to create a new ID that the fact records can join to specifically. 
23 | · Current Flag – A quick method of returning only the current version of each record 24 | · Start Date – The date from which the specific historical version is active 25 | · End Date – The date to which the specific historical version record is active 26 | With these elements in place, our table will now look like: 27 | 28 | ![image](https://user-images.githubusercontent.com/42135673/232538387-42653281-b1ff-437a-b558-3fdd50f663b0.png) 29 | 30 | 31 | This method is very powerful – you maintain the history for the entire record and can easily perform change-over-time analysis. However, it also comes with more maintenance overhead, increased storage requirement and potential performance impacts if used on very large dimensions. 32 | Type 2 is the most common method of tracking change in data warehouses. 33 | 34 | • Type 3 – Previous Value column 35 | Track change to a specific attribute, add a column to show the previous value, which is updated as further changes occur 36 | Here, we add a new column called “Previous Country” to track what the last value for our attribute was. 37 | 38 | ![image](https://user-images.githubusercontent.com/42135673/232538443-2c7e8818-5e81-4fad-bb4e-b328149f4ee0.png) 39 | 40 | 41 | Note how this will only provide a single historical value for Country. If the customer changes his name, we will not be able to track it without adding a new column. Likewise, if Bob moved country again, we would either need to add further “Previous Previous Country” columns or lose the fact that he once lived in the United Kingdom. 42 | 43 | • Type 4 – History Table 44 | Show current value in dimension table but track all changes in separate table 45 | There is no change to our existing table here, we simply update the record as if a Type 1 change had occurred. However, we simultaneously maintain a history table to keep track of these changes: 46 | Our Dimension table reads: 47 | 48 | ![image](https://user-images.githubusercontent.com/42135673/232538506-7f5e5b8f-a208-479c-8ece-4d5b7c6a96de.png) 49 | 50 | Whilst our Type 4 historical table is created as: 51 | 52 | ![image](https://user-images.githubusercontent.com/42135673/232538558-2a01302e-57f0-4282-87a4-94759d627a5d.png) 53 | 54 | Depending on your requirements, you may place both ID and Surrogate Key onto the fact record so that you can optimise performance whilst maintaining functionality. 55 | Separating the historical data makes your dimensions smaller and therefore reduces complexity and improves performance if the majority of uses only need the current value. 56 | However, if you do require historical values, this structure adds complexity and data redundancy overheads. It is generally assumed that the system will use Type 1 or Type 2 rather than Type 4. 57 | 58 | • Type 6 – Hybrid SCD 59 | Utilise techniques from SCD Types 1, 2 and 3 to track change 60 | The ‘Hybrid’ method simply takes SCD types 1, 2 and 3 and applies all techniques. We would maintain a history of all changes whilst simultaneously updating a “current value” column on all records. 61 | 62 | ![image](https://user-images.githubusercontent.com/42135673/232538629-edff6039-d3d3-40c1-a229-e9089ab11e23.png) 63 | 64 | 65 | This gives you the ability to provide an element of change comparison without additional calculation, whilst still maintaining a full, detailed history of all changes in the system. 
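To make the row-versioning mechanics concrete, here is a minimal, hypothetical SQL sketch of how a Type 2 change could be applied when Bob moves country. The table name, surrogate key value, dates, and destination country are illustrative only and not taken from this repository:

```sql
-- Expire the currently active version of Bob's record, then insert a new version.
-- dim_customer is assumed to have the Type 2 columns described above
-- (surrogate key, current flag, start date, end date).

UPDATE dim_customer
SET    current_flag = 'N',
       end_date     = '2023-01-31'
WHERE  id = 1                 -- Bob's original ID (natural key)
  AND  current_flag = 'Y';

INSERT INTO dim_customer
    (surrogate_key, id, customer_name, country, current_flag, start_date, end_date)
VALUES
    (102, 1, 'Bob', 'United States', 'Y', '2023-02-01', '9999-12-31');
```

Fact records loaded after the change would join to the new surrogate key, while older facts keep pointing at the earlier version, which is what enables the change-over-time analysis described above.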
66 | Personally, if this hybrid requirement came up, I would avoid the data redundancy of the extra "current value" column and simply calculate the current value using the "LAST_VALUE()" window function at run-time, although this depends on your priorities between data storage and direct querying performance. 67 | 68 | 69 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/ETL-DWH-Datalake/05 - What is ACID properties.md: -------------------------------------------------------------------------------- 1 | **What are the ACID Properties of a DBMS?** 2 | 3 | A transaction is a sequence of steps performed on a database as a single logical unit of work. 4 | 5 | The ACID database transaction model ensures that a transaction is always processed reliably by ensuring: 6 | 7 | - **Atomicity** - Each transaction is either properly carried out or the database reverts back to the state before the transaction started 8 | - **Consistency** - The database must be in a consistent state before and after the transaction 9 | - **Isolation** - Multiple transactions occur independently without interference 10 | - **Durability** - Successful transactions are persisted even in the case of system failure 11 | 12 | ACID guarantees are provided by most relational databases, such as MySQL and PostgreSQL. 13 | 14 | NoSQL databases generally do not follow ACID; they use the BASE transaction model instead, which leads to eventual consistency. Eg: Cassandra, MongoDB 15 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/ETL-DWH-Datalake/06 - Difference between Snowflake and Databricks: -------------------------------------------------------------------------------- 1 | 2 | 3 | **Snowflake:** 4 | 5 | This is a cloud-based data warehouse as a service. 6 | It provides ELT support mainly through its COPY command and dedicated schema and file object definitions. 7 | It allows you to easily spin up multiple independent compute clusters that can operate on the data simultaneously from a single copy of the data. 8 | In terms of data engineering, it follows the ELT method. 9 | Nevertheless, it offers good support for 3rd-party ETL tools such as Fivetran, Talend, etc. You can even use dbt with it. 10 | 11 | **Databricks:** 12 | 13 | Processing power is the main function of Databricks. Spark's core functionality is integrated, and it is ideal for ETL loads. 14 | The storage it uses is called a data lakehouse, which is similar to a data lake but has relational database functionality. 15 | This is basically a data lake, but you can run SQL on it, which has become quite popular lately. 16 | 17 | If you have an existing ETL tool like Fivetran, Talend, Tibco, etc., you only need to worry about loading your data into Snowflake. 18 | All of your database infrastructure (partitioning, scalability, indexes, etc.) is handled for you. 19 | 20 | Consider Databricks if you don't have an existing ETL tool, your data requires intensive cleaning, and you have unpredictable data sources and schemas. 21 | Take advantage of the schema-on-read technique to scale your data.
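As a rough illustration of the COPY-based ELT loading mentioned in the Snowflake section above, a minimal load might look like the sketch below; the stage name, table name, and file-format options are hypothetical and not taken from this repository:

```sql
-- Load staged CSV files into a raw table; transformations then run as SQL inside Snowflake (ELT).
COPY INTO sales_raw
FROM @my_s3_stage/sales/
FILE_FORMAT = (TYPE = 'CSV' SKIP_HEADER = 1);
```

Downstream transformations would then be expressed in SQL (or orchestrated with a tool such as dbt), which is what makes this an ELT rather than an ETL pattern.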
22 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/ETL-DWH-Datalake/07 - CAP Theorem.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/ETL-DWH-Datalake/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/01 - Data Engineering Basics/ETL-DWH-Datalake/README.md -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Hive/01 - Hive Architecture.md: -------------------------------------------------------------------------------- 1 | ![image](https://user-images.githubusercontent.com/42135673/232808462-ca0e90ee-9ee5-49cb-a2e6-4352da7946f5.png) 2 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Hive/02 - What is metastore.md: -------------------------------------------------------------------------------- 1 | 2 | - The Metastore is the system catalog which contains metadata about the tables stored in Hive. 3 | 4 | - Holds table / namespace definitions (column types, physical layout) 5 | 6 | - Holds partitioning information 7 | 8 | - Can be stored in Derby, MySQL, and many other relational databases 9 | 10 | 11 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Hive/03 - Hive Schema on Read.md: -------------------------------------------------------------------------------- 1 | - Hive uses table schemata at read time thus enabling flexible data handling. 2 | 3 | - Hive doesn't verify the data when it is loaded, but rather when a query is issued. 4 | 5 | - Schema on read makes for a very fast initial load, since the data does not have to be read, parsed, and serialized to disk in the database's internal format. 6 | 7 | - The load operation is just a file copy or move. 8 | 9 | - It is more flexible, too: consider having two schemas for the same underlying data, depending on the analysis being performed. 10 | 11 | - Furthermore, there are many scenarios where the schema is not known at load time, so there are no indexes to apply, because the queries have not been formulated yet. 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Hive/04 - Hive Objects.md: -------------------------------------------------------------------------------- 1 | ![image](https://user-images.githubusercontent.com/42135673/232810358-d3d98eee-6d38-4a68-b190-c8f896f12a8e.png) 2 | 3 | 4 | **Databases** 5 | 6 | Namespaces that separate tables and other data units from naming conflicts. 7 | 8 | **Tables** 9 | 10 | Homogeneous units of data which have the same schema. 11 | 12 | Managed Tables: 13 | When you create a table in Hive, by default Hive will manage the data, which means that Hive moves the data into its warehouse directory. 14 | External Tables: 15 | Alternatively, you may create an external table, which tells Hive to refer to the data that is at an existing location outside the warehouse directory. 16 | 17 | **Partitions** 18 | 19 | Each Table can have one or more Partition keys which determine how the data is stored. 
20 | Partitions - apart from being storage units – can be used inside queries just like regular columns. 21 | 22 | **Buckets** 23 | 24 | Data in each partition may in turn be divided into Buckets based on the value of a hash function of some column of the Table. 25 | Buckets can be used to efficiently sample the data. 26 | Note that buckets do not require Partitions. 27 | 28 | **Files** 29 | 30 | The data itself can be stored in several different file formats: 31 | 32 | - TEXTFILE (default) 33 | - SEQUENCEFILE (binary) 34 | - RCFILE (column-oriented) 35 | - ORC (opt. column-oriented) 36 | - PARQUET (Apache) 37 | - AVRO (Apache) 38 | 39 | **Records/Rows** 40 | 41 | Single records respectively rows in tables are read/written using so-called SerDe’s (Serializer/De-serializer). 42 | They parse raw byte input into records/rows. 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Hive/05 - Hive Datatypes.md: -------------------------------------------------------------------------------- 1 | **Primitive Datatypes** 2 | 3 | - TINYINT, SMALLINT, INT, BIGINT - Integer number (1,2,4,8 byte length) 4 | - BOOLEAN - Boolean values (TRUE/FALSE) 5 | - FLOAT, DOUBLE - Floating point numbers (single / double precision) 6 | - DECIMAL(precision, scale) - Floating point numbers, 17 bytes precision, up to 38 digits 7 | - TIMESTAMP - Timestamp in format yyyy-mm-dd hh:mm:ss 8 | - DATE - Date in format YYYY-­MM-­DD 9 | - STRING - sequence of characters in a specified character set 10 | - VARCHAR - SQL VARCHAR, sequence of characters with variable length, maximum length in braces 11 | - CHAR - SQL CHAR, sequence of characters with fixed length + padding with spaces 12 | - BINARY - Array of bytes 13 | 14 | **Complex Datatypes** 15 | 16 | - ARRAY - The elements in the array have to be in the same type. Elements can be accessed using the [n] notation 17 | where n is an index (zero-based) into the array. 18 | - MAP - The elements are accessed using ['element name'] notation 19 | - STRUCT - The elements within the type can be accessed using the DOT (.) notation. 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Hive/06 - Hive Datamodel.md: -------------------------------------------------------------------------------- 1 | Most of the table structures in Hive are directly mapped to HDFS: 2 | 3 | - Warehouse directory in HDFS /apps/hive/warehouse/ 4 | 5 | - Tables stored in subdirectories of warehouse 6 | 7 | - Partitions form subdirectories of tables 8 | 9 | - Actual data stored in flat files 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Hive/07 - Hive Commands Basics.md: -------------------------------------------------------------------------------- 1 | Hadoop dfs commands: 2 | ================ 3 | dfs -ls /hivedemos 4 | 5 | Comments in Hive: 6 | ============== 7 | -- This is script 8 | 9 | Databases in hive: 10 | ================= 11 | 12 | Default database is default database. 
13 | 14 | To create database: 15 | =================== 16 | create database student; 17 | 18 | To suppress already existing database warnings: 19 | =================================== 20 | create database IF NOT EXISTS student; 21 | 22 | Note: You can also use the keyword SCHEMA instead of database. 23 | 24 | To see databases: 25 | =============== 26 | Show databases; 27 | 28 | Hive ==> create a directory for each database. 29 | 30 | Tables in that database will be stored in sub directories of the database directory. 31 | 32 | Note: The database directory is created under hive.metastore.warehouse.dir. 33 | 34 | default location: /user/hive/warehouse/student.db 35 | 36 | To add comment for database: 37 | ====================== 38 | create database student comment 'Holds all student tables'; 39 | 40 | To describe database: 41 | ==================== 42 | describe database student; 43 | 44 | Note: Also shows the directory location for the database. 45 | 46 | To associate key-value properties with the database: 47 | =================================================== 48 | create database student WITH DBPROPERTIES ('creator' = 'subhashini' , 'date' = '2015-1-27'); 49 | 50 | describe database EXTENDED student; 51 | 52 | Use command: 53 | =========== 54 | sets a database as your working database. 55 | 56 | use student; 57 | 58 | Note: there is no command to show you which databse is your current working database. 59 | 60 | To drop database: 61 | ================ 62 | drop database if exists student; 63 | 64 | 65 | To Alter Database: 66 | ============= 67 | Note: You can set key-value pairs in the DBPROPERTIES associated with a databse using Alter Database command. No other metadata about the database can be changed including its name and directory location. 68 | 69 | alter database student SET DBPROPERTIES ('edited-by' = 'Subha'); 70 | 71 | Note: There is no way to delete or unset a DBPROPERTY. 72 | 73 | Managing Tables in Hive: 74 | ======================= 75 | Two kinds of tables: 76 | 77 | 1. Managed Table 78 | 79 | 2. External Table 80 | 81 | ==================================================================================================================== 82 | 83 | Dataset: Student.tsv 84 | 85 | 1001 John 45.0 86 | 1002 James 85.0 87 | 1003 John 45.0 88 | 1004 James 85.0 89 | 1005 Smith 60.0 90 | 1006 Scott 70.0 91 | 1007 Shoba 80.0 92 | 1008 Taanu 90.0 93 | 1009 Anbu 95.0 94 | 1010 Aruna 85.0 95 | 96 | Managed Table: 97 | ============= 98 | 99 | create table IF NOT EXISTS student_int (studID INT COMMENT 'Student ID',studName STRING, gpa FLOAT) 100 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'; 101 | 102 | create table IF NOT EXISTS student(studID INT COMMENT 'Student ID',studName STRING, gpa FLOAT) 103 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'; 104 | 105 | Note: Stored in warehouse folder. When you drop an internal table, it drops the data, and it also drops the metadata. 106 | 107 | External Table: 108 | ============== 109 | 110 | create EXTERNAL table IF NOT EXISTS student_ext (studID INT COMMENT 'Student ID',studName STRING, gpa FLOAT) 111 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 112 | LINES TERMINATED BY '\n' 113 | STORED AS TEXTFILE 114 | LOCATION '/student_information'; 115 | 116 | Note: Create a new directory. 117 | 118 | Note: When you drop an external table, it only drops the meta data. That means hive is ignorant of that data now. It does not touch the data itself. 
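Note: Dropping the two tables defined above makes this difference visible (this follows directly from the notes above):

DROP TABLE IF EXISTS student_int;  -- managed table: removes the metadata AND the data files under the warehouse directory

DROP TABLE IF EXISTS student_ext;  -- external table: removes only the metadata; the files in /student_information are left untouched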
119 | 120 | Data Manipualtion in Hive: 121 | ========================= 122 | Loading Data to Hive Tables: 123 | =========================== 124 | LOAD DATA INPATH '/hivedemos/student.tsv' INTO TABLE student_int; 125 | 126 | LOAD DATA LOCAL INPATH '/home/vagrant/bigdata/pigdemos/student.tsv' INTO TABLE student_int; 127 | 128 | LOAD DATA LOCAL INPATH '/home/vagrant/bigdata/hivedemos/student.tsv' INTO TABLE student_ext; 129 | 130 | Complex Data Types: 131 | ================== 132 | student.csv 133 | ========== 134 | John Smith,80.0,Joshi:Jack,English!24:EVS!25:Hindi!23 135 | scott Tiger,90.0,James:Aruna,English!25:EVS!25:Hindi!22 136 | 137 | create table student_complex(name String,gpa FLOAT, classmates ARRAY,marks MAP) 138 | ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' 139 | COLLECTION ITEMS TERMINATED BY ':' 140 | MAP KEYS TERMINATED BY '!'; 141 | 142 | Describe student_complex; 143 | 144 | LOAD DATA LOCAL INPATH '/home/vagrant/bigdata/pigdemos/student.csv' INTO TABLE student_complex; 145 | 146 | select * from student_complex; 147 | 148 | select name, classmates from student_complex; 149 | 150 | select name, marks from student_complex; 151 | 152 | select name, marks['English'] from student_complex; 153 | 154 | Order by: 155 | ======== 156 | SELECT s.studID, s.studName,s.gpa 157 | FROM student s 158 | ORDER BY s.studName DESC; 159 | 160 | select * from metastore.TBLS join metastore.COLUMNS_V2 on TBLS.TBL_ID=COLUMNS_V2.CD_ID; 161 | 162 | 163 | Partition Table: 164 | =============== 165 | 166 | 1. Static Partition 167 | 2. Dynamic Partition 168 | 169 | Static Partition: 170 | ================ 171 | Static Partition (SP) columns:The columns whose values are known at COMPILE TIME (given by user). 172 | 173 | CREATE TABLE student_sp(studID INT, studName STRING COMMENT 'Student Name') 174 | PARTITIONED BY (gpa FLOAT); 175 | 176 | INSERT OVERWRITE TABLE student_sp 177 | PARTITION (gpa=45.0) 178 | select studID,studName from student 179 | where gpa=45.0; 180 | 181 | show partitions student_sp; 182 | 183 | alter table student_sp add partition(gpa=85.0); 184 | 185 | INSERT OVERWRITE TABLE student_sp 186 | PARTITION (gpa=85.0) 187 | select studID,studName from student where gpa=85.0; 188 | 189 | Dynamic Partition: 190 | ================= 191 | 192 | CREATE TABLE student_dp(studID INT, studName STRING COMMENT 'Student Name') 193 | PARTITIONED BY (gpa FLOAT); 194 | 195 | SET hive.exec.dynamic.partition = true; -- enable dynamic partition 196 | SET hive.exec.dynamic.partition.mode = nonstrict; - to disable static partition requirmnent for dp 197 | 198 | Note: Dynamic partition strict mode requires at least one static partition column. 
To turn this off set hive.exec.dynamic.partition.mode=nonstrict 199 | 200 | INSERT OVERWRITE TABLE student_dp 201 | PARTITION (gpa) 202 | select studID,studName,gpa from student; 203 | 204 | select * from student_dp where gpa=45.0; 205 | 206 | show partitions student_dp; 207 | 208 | select * from student_dp where gpa=45.0; 209 | 210 | Drop Table: 211 | ========== 212 | 213 | Drop table [IF EXISTS] student; 214 | 215 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Hive/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/01 - Data Engineering Basics/Hive/README.md -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/01_indentation.py: -------------------------------------------------------------------------------- 1 | #indentation 2 | a=1 3 | print(a) 4 | x=True 5 | if x: 6 | print('false') 7 | print('apple') 8 | print('suresh') 9 | print('true') 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/02_userinput.py: -------------------------------------------------------------------------------- 1 | #User Input 2 | x=input("Enter 1st number") 3 | y=input("Enter 2nd number") 4 | z=x+y 5 | print("sum of 2 number is: ",z) 6 | 7 | 8 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/03_userinput.py: -------------------------------------------------------------------------------- 1 | #user input 2 | x=int(input("Enter 1st number")) 3 | y=int(input("Enter 2nd number")) 4 | z=x+y 5 | print("sum of 2 number is: ",z) 6 | 7 | 8 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/04_userinput.py: -------------------------------------------------------------------------------- 1 | #user input 2 | ch=input('enter a char') 3 | print(ch) 4 | ch=input('enter a char')[0] 5 | print(ch) 6 | 7 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/05_eval.py: -------------------------------------------------------------------------------- 1 | #eval 2 | result=eval(input('enter an expression')) 3 | print(result) -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/06_argument.py: -------------------------------------------------------------------------------- 1 | #-- argument 2 | import sys 3 | x=int(sys.argv[1]) 4 | y=int(sys.argv[2]) 5 | z=x+y 6 | print(z) -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/07_A_operators.py: -------------------------------------------------------------------------------- 1 | import pdb 2 | 3 | #----------------Logical Operator example 1 4 | a = 10 5 | b = 10 6 | c = -10 7 | 8 | if a > 0 and b > 0: 9 | print("The numbers are greater than 0") 10 | 11 | if a > 0 and b > 0 and c > 0: 12 | print("The numbers are greater than 0") 13 | else: 14 | print("Atleast one number is not greater than 0") 15 | 16 | a = 10 17 | b = -10 18 | c = 0 19 | 20 | if a > 0 or b > 0: 21 | print("Either of the number is greater than 0") 22 | else: 23 | print("No number is greater than 0") 24 
| 25 | if b > 0 or c > 0: 26 | print("Either of the number is greater than 0") 27 | else: 28 | print("No number is greater than 0") 29 | 30 | #---------Assignment Operators 31 | pdb.set_trace() 32 | x=10 33 | x+=20 34 | print(x) 35 | 36 | #---------------Ternary Operators 37 | 38 | a=10 39 | b=20 40 | x=30 if ab) 36 | print(a500: 107 | print("To place this order insurance must be required") 108 | break 109 | print(item) 110 | 111 | #continue statement : To print odd numbers in the range 0 to 9 112 | for i in range(10): 113 | if i % 2 == 0: 114 | continue 115 | print(i) 116 | 117 | 118 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/11_list_comprehension.py: -------------------------------------------------------------------------------- 1 | #list comprehension 2 | import pdb 3 | # pdb.set_trace() 4 | #--------------------List comprehension 5 | # Using for loop 6 | my_list = [] 7 | for x in range(10): 8 | my_list.append(x * 2) 9 | print(my_list) 10 | 11 | 12 | #same using list comprehension 13 | comp_list = [x * 2 for x in range(10)] 14 | print(comp_list) 15 | 16 | #Another best example 17 | comp_list = [x ** 2 for x in range(7) if x % 2 == 0] 18 | print(comp_list) 19 | 20 | iter_string = "some text" 21 | comp_list = [x for x in iter_string if x !=" "] 22 | print(comp_list) 23 | 24 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/12_Iterator.py: -------------------------------------------------------------------------------- 1 | #Iterator 2 | import pdb 3 | pdb.set_trace() 4 | #--------------------Iterator 5 | x=[1,2,3] 6 | for i in x: 7 | print(i) 8 | 9 | x=[1,2,3] 10 | iter1=iter(x) 11 | print(next(iter1)) 12 | print(next(iter1)) 13 | 14 | x=[1,2,3] 15 | iter1=iter(x) 16 | while True: 17 | try: 18 | print(next(iter1)) 19 | except StopIteration: 20 | break 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/13_generator.py: -------------------------------------------------------------------------------- 1 | #generator 2 | x = [1, 2, 3] 3 | g = (i**2 for i in x) 4 | print(next(g)) 5 | print(next(g)) -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/14_fizzbuzz.py: -------------------------------------------------------------------------------- 1 | #fizzbuzz module 2 | def fizzBuzz(n): 3 | # Write your code here 4 | for fizzBuzz in range(1,n+1): 5 | if fizzBuzz % 15 == 0: 6 | print('FizzBuzz') 7 | continue 8 | elif fizzBuzz % 3 ==0: 9 | print('Fizz') 10 | continue 11 | elif fizzBuzz % 5 == 0: 12 | print('Buzz') 13 | continue 14 | print(fizzBuzz) 15 | 16 | if __name__ == '__main__': 17 | x = int(input("Enter number for range")) 18 | fizzBuzz(x) -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/15A_class_variables.py: -------------------------------------------------------------------------------- 1 | class Test: 2 | def __init__(self): 3 | self.a = 10 4 | self.b = 20 5 | 6 | def display(self): 7 | print(self.a) 8 | print(self.b) 9 | t=Test() 10 | t.display() 11 | print(t.a,t.b) 12 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/15B_class_varaibles.py: -------------------------------------------------------------------------------- 1 | class Test: 2 
| x=10 3 | def __init__(self): 4 | self.y=20 5 | t1=Test() 6 | print('t1:',t1.x,t1.y) 7 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/15C_class_variables.py: -------------------------------------------------------------------------------- 1 | class Test: 2 | def m1(self): 3 | a=1000 4 | print(a) 5 | def m2(self): 6 | b=2000 7 | print(b) 8 | t=Test() 9 | t.m1() 10 | t.m2() -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/15D_class_methods.py: -------------------------------------------------------------------------------- 1 | class Student: 2 | def __init__(self,name,marks): 3 | self.name=name 4 | self.marks=marks 5 | def display(self): 6 | print('Hi',self.name) 7 | print('Your Marks are:',self.marks) 8 | def grade(self): 9 | if self.marks>=60: 10 | print('You got First Grade') 11 | elif self.marks>=50: 12 | print('Yout got Second Grade') 13 | elif self.marks>=35: 14 | print('You got Third Grade') 15 | else: 16 | print('You are Failed') 17 | n=int(input('Enter number of students:')) 18 | for i in range(n): 19 | name=input('Enter Name:') 20 | marks=int(input('Enter Marks:')) 21 | s= Student(name,marks) 22 | s.display() 23 | s.grade() 24 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/15E_class_methods.py: -------------------------------------------------------------------------------- 1 | class Animal: 2 | lEgs=4 3 | @classmethod 4 | def walk(cls,name): 5 | print('{} walks with {} lEgs...'.format(name,cls.lEgs)) 6 | Animal.walk('Dog') 7 | Animal.walk('Cat') -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/15F_class_methods.py: -------------------------------------------------------------------------------- 1 | class TestMath: 2 | @staticmethod 3 | def add(x,y): 4 | print('The Sum:',x+y) 5 | @staticmethod 6 | def product(x,y): 7 | print('The Product:',x*y) 8 | @staticmethod 9 | def average(x,y): 10 | print('The average:',(x+y)/2) 11 | TestMath.add(10,20) 12 | TestMath.product(10,20) 13 | TestMath.average(10,20) -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/15_class.py: -------------------------------------------------------------------------------- 1 | #class 2 | class Person: 3 | def __init__(self, fname, lname): 4 | self.fname = fname 5 | self.lname = lname 6 | print(fname,lname) 7 | 8 | #if __name__ == '__main__': 9 | p1 = Person('George', 'Smith') 10 | 11 | 12 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/16_Inheritance.py: -------------------------------------------------------------------------------- 1 | #Inheritance 2 | class Person: 3 | def __init__(self, fname, lname): 4 | self.fname = fname 5 | self.lname = lname 6 | class Employee(Person): 7 | all_employees = [] 8 | def __init__(self, fname, lname, empid): 9 | Person.__init__(self, fname, lname) 10 | self.empid = empid 11 | Employee.all_employees.append(self) 12 | 13 | if __name__ == '__main__': 14 | p1 = Person('George', 'smith') 15 | print(p1.fname, '-', p1.lname) 16 | e1 = Employee('Jack', 'simmons', 456342) 17 | e2 = Employee('John', 'williams', 123656) 18 | print(e1.fname, '-', e1.empid) 19 | print(e2.fname, '-', e2.empid) 20 | 21 | 22 | # ---------------------------------------------------------------- 
23 | 24 | # there are four types of inheritance in Oops concept. Let's go through all of them. 25 | 26 | # Let's first understand simple inheritance 27 | class All_calculation: # it's called parent class 28 | def __init__(self, a, b): 29 | self.a = a 30 | self.b = b 31 | 32 | def add(self): 33 | print(f'addition of two given number is {self.a + self.b}') 34 | 35 | def substraction(self): 36 | return f'substraction of two given number is {self.a - self.b}' 37 | 38 | 39 | class Number(All_calculation): # it's called subclass or drived class or child class 40 | 41 | def multi(self): 42 | return f'multification of two given number is {self.a * self.b}' 43 | 44 | 45 | obj = Number(3, 2) 46 | obj.add() 47 | obj = All_calculation(1, 2) 48 | obj.substraction() 49 | 50 | 51 | # ----------------------------------------------------------------------------------------------------------------- 52 | 53 | # Now, let's understand Multi-level Inheritance 54 | 55 | class car: # Grand class 56 | def __init__(self, name, com): 57 | self.name = name 58 | self.com = com 59 | 60 | def color(self): 61 | print('This is black car') 62 | 63 | 64 | class speed(car): # Parent class 65 | def speed(self): 66 | print(f'if car name is {self.name} and company is {self.com}, speed is 100 km/h') 67 | 68 | def doors(self): 69 | print('this is two seaters car, So it may have two doors.') 70 | 71 | 72 | class shape(speed): # child class 73 | def shape(self): 74 | print(f'if speed is 100km/hour, car shape might be sedan') 75 | 76 | def Buyer(self): 77 | print(f'buyer must have down payment near 10 lacs.') 78 | 79 | 80 | obj = shape('Sedan', 'Mercedese') 81 | obj.Buyer() 82 | obj.speed() 83 | obj.color() 84 | 85 | 86 | # -------------------------------------------------------------------------------------------------------------------------- 87 | 88 | 89 | # Now, let's understand Multiple Inheritance 90 | 91 | class standard: # Parent class 92 | def __init__(self, name, std): 93 | self.name = name 94 | self.std = std 95 | 96 | def fees(self): 97 | print(f'standard of {self.std} is $7000') 98 | 99 | 100 | class Subject: # Parent class 101 | def sub(self): 102 | print('student has to learn total main three subjects') 103 | 104 | def sub_name(self): 105 | print('"Math", "Science", "English", "Sanskrit"') 106 | 107 | 108 | class Student(standard, Subject): # Child class, inherited from multiple class 109 | def Uniform(self): 110 | print(f'students have to wear white shirt and blue trouser') 111 | 112 | def Leave(self): 113 | print(f'Student can take leave three days off for sick') 114 | 115 | 116 | obj = Student('Bhavik', 7) 117 | print(obj.std) 118 | obj.fees() 119 | obj.sub_name() 120 | obj.Leave() 121 | 122 | # ------------------------------------------------------------------------------------------------------------------------- 123 | 124 | 125 | # Now let's understand the hierarchical inheritance 126 | 127 | class provide_email: # parent class 128 | def __init__(self, email, password): 129 | self.email = email 130 | self.password = password 131 | self.name = 'bhavik' 132 | 133 | class login(provide_email): # child class 134 | def login_detail(self): 135 | if self.email == 'gajerabhavik915': 136 | print('you are successfully logged in') 137 | else: 138 | print('please provide correct email') 139 | 140 | def check_balance(self): 141 | self.login_detail() 142 | if self.email == 'gajerabhavik915': 143 | print('your available balance is $200000') 144 | else: 145 | print('first log in, check your balance') 146 | 147 | 148 | class 
create_account(provide_email): # child class 149 | def create_acc(self): 150 | if self.email == 'gajerabhavik915': 151 | print('your account is successfully created') 152 | else: 153 | print('please provide bhavik\'s email') 154 | 155 | def check_balance(self): 156 | self.create_acc() 157 | if self.email == 'gajerabhavik915': 158 | print('your available balance is $200000') 159 | else: 160 | print('first log in, check your balance') 161 | 162 | 163 | # obj = create_account('gajerabhavik915', 7573073) 164 | # # obj.create_acc() 165 | # obj.check_balance() 166 | 167 | obj1 = login('gajerabhavik915', 7573073) 168 | obj1.login_detail() 169 | 170 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/17A_Polymorphism.py: -------------------------------------------------------------------------------- 1 | class Test: 2 | def m1(self): 3 | print('no-arg method') 4 | def m1(self,a): 5 | print('one-arg method') 6 | def m1(self,a,b): 7 | print('two-arg method') 8 | 9 | t=Test() 10 | #t.m1(10) 11 | t.m1(10,20) -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/17_Polymorphism.py: -------------------------------------------------------------------------------- 1 | #Polymorphism 2 | import pdb 3 | class Person: 4 | def __init__(self, fname, lname): 5 | self.fname = fname 6 | self.lname = lname 7 | 8 | class Employee(Person): 9 | all_employees = [] 10 | def __init__(self, fname, lname, empid): 11 | Person.__init__(self, fname, lname) 12 | self.empid = empid 13 | Employee.all_employees.append(self) 14 | def getSalary(self): 15 | return 'You get Monthly salary.' 16 | def getBonus(self): 17 | return 'You are eligible for Bonus.' 18 | 19 | class ContractEmployee(Employee): 20 | def getSalary(self): 21 | return 'You will not get Salary from Organization.' 22 | def getBonus(self): 23 | return 'You are not eligible for Bonus.' 
24 | 25 | if __name__ == '__main__': 26 | #pdb.set_trace() 27 | e1 = Employee('Jack', 'simmons', 456342) 28 | e2 = ContractEmployee('John', 'williams', 123656) 29 | print(e1.getBonus()) 30 | print(e2.getBonus()) -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/18_Encapsulation.py: -------------------------------------------------------------------------------- 1 | #Encapsulation 2 | class Person: 3 | def __init__(self, fname, lname): 4 | self.fname = fname 5 | self.lname = lname 6 | 7 | class Employee(Person): 8 | all_employees = [] 9 | def __init__(self, fname, lname, empid): 10 | Person.__init__(self, fname, lname) 11 | self.__empid = empid 12 | Employee.all_employees.append(self) 13 | def getEmpid(self): 14 | #print(e1.__empid) 15 | return self.__empid 16 | 17 | 18 | if __name__ == '__main__': 19 | e1 = Employee('Jack', 'simmons', 456342) 20 | print(e1.fname, e1.lname) 21 | print(e1.getEmpid()) 22 | print(e1.__empid) 23 | 24 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/19_exception.py: -------------------------------------------------------------------------------- 1 | #exception 2 | def divide(a,b): 3 | try: 4 | result = a / b 5 | return result 6 | except ZeroDivisionError: 7 | print("Dividing by Zero.") 8 | finally: 9 | print("In finally clause.") 10 | 11 | if __name__ == '__main__': 12 | try: 13 | a = pow(2, 4) 14 | print("Value of 'a' :", a) 15 | b = pow(2, 'hello') # results in exception 16 | print("Value of 'b' :", b) 17 | except TypeError as e: 18 | print('oops!!!') 19 | print('Out of try ... except.') 20 | 21 | try: 22 | a = 2; 23 | b = 'hello' 24 | if not (isinstance(a, int) 25 | and isinstance(b, int)): 26 | raise TypeError('Two inputs must be integers.') 27 | c = a ** b 28 | print(c) 29 | except TypeError as e: 30 | print(e) 31 | print('First call') 32 | print(divide(14, 7)) 33 | print('Second call') 34 | print(divide(14, 0)) -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/20A_pandas.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | data = [0,1,2,3,4,5] 4 | list = [['Alex',10],['Bob',12],['Clarke',13]] 5 | dict = {'Name':['Tom', 'Jack', 'Steve', 'Ricky'],'Age':[28,34,29,42]} 6 | df = pd.DataFrame(data) 7 | df_list = pd.DataFrame(list,columns=['Name','Age']) 8 | df_dict = pd.DataFrame(dict) 9 | df_series = pd.Series(np.random.randn(4)) 10 | print(df) 11 | print(df_list) 12 | print(df_dict) 13 | print(df_series) 14 | print(df_list.head(2)) 15 | print(df_dict.tail(2)) -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/20_pandas_csv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from matplotlib import pyplot as plt 3 | data = pd.read_csv('input.csv') 4 | print(type(data)) 5 | print (data) 6 | print(data[0:5]['salary']) 7 | print (data.loc[:,['salary','name']]) 8 | print (data.loc[[1,3,5],['salary','name']]) 9 | print (data.loc[2:6,['salary','name']]) 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/21A_semilog_plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plot 2 | years = [1900, 1910, 
1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2017] 3 | indexValues = [68, 81, 71, 244, 151, 200, 615, 809, 824, 2633, 10787, 11577, 20656] 4 | plot.grid(True, which="both") 5 | plot.semilogy(years, indexValues) 6 | plot.ylim([10, 21000]) 7 | plot.xlim([1900, 2020]) 8 | plot.title('Y axis in Semilog using Python Matplotlib') 9 | plot.xlabel('Year') 10 | plot.ylabel('Stock market index') 11 | plot.show() -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/21B_Barchart_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plotter 3 | numberOfYears = 5 4 | averageReturns = (13, 15, 15, 14, 12) 5 | categories = ('2012', '2013', '2014', '2015', '2016') 6 | barWidth = 0.4 7 | barOpacity = 0.5 8 | barChart = plotter.bar(categories,averageReturns,barWidth, 9 | alpha=barOpacity, 10 | color='blue', 11 | label=' Stock Returns') 12 | plotter.xlabel('Year') 13 | plotter.ylabel('Average returns from Stock Market %') 14 | plotter.title('Annual Stock Market Returns %') 15 | plotter.xticks(categories) 16 | plotter.legend() 17 | plotter.tight_layout() 18 | plotter.show() -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/21C_PieChart_plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plotter 2 | sectors_label = 'Finance', 'Technology', 'Oil&Gas', 'BasicMaterial', 'Industrials', 'ConsumerGoods','Healthcare','ConsumerService','Others' 3 | sectors_percent = [20.00, 11.5, 19.5, 7.25, 18.75, 6.75,6.75,9.25,0.25] 4 | figureObject, axesObject = plotter.subplots() 5 | axesObject.pie(sectors_percent, 6 | labels=sectors_label, 7 | autopct='%1.2f', 8 | startangle=90) 9 | axesObject.axis('equal') 10 | plotter.show() 11 | explodeTuple = (0.1, 0.0, 0.0, 0.0, 0.0, 0.0,0.0,0.0,0.0) 12 | axesObject.pie(sectors_percent,explode=explodeTuple, 13 | labels=sectors_label, 14 | autopct='%1.2f', 15 | startangle=90) 16 | plotter.show() -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/21D_Piechart_explode.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plotter 2 | sectors_label = 'Finance', 'Technology', 'Oil&Gas', 'BasicMaterial', 'Industrials', 'ConsumerGoods','Healthcare','ConsumerService','Others' 3 | sectors_percent = [20.00, 11.5, 19.5, 7.25, 18.75, 6.75,6.75,9.25,0.25] 4 | figureObject, axesObject = plotter.subplots() 5 | explodeTuple = (0.0, 0.0, 0.1, 0.0, 0.0, 0.0,0.0,0.0,0.0) 6 | axesObject.pie(sectors_percent,explode=explodeTuple, 7 | labels=sectors_label, 8 | autopct='%1.2f', 9 | startangle=90) 10 | axesObject.axis('equal') 11 | plotter.show() 12 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/21E_scatter_plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plot 2 | import numpy as np 3 | xData = np.random.random_integers(18, 50, 50) 4 | yData = np.random.random_integers(200, 800, 50) 5 | plot.scatter(xData, yData) 6 | plot.title('Hypothetical:Student age group and GMAT Score') 7 | plot.xlabel('Age') 8 | plot.ylabel('Score') 9 | plot.show() -------------------------------------------------------------------------------- /01 - Data Engineering 
Basics/Python/21F_Piechart_csv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from matplotlib import pyplot as plt 3 | data = pd.read_csv('input.csv') 4 | salary_data = data.groupby('dept')[['salary']].sum() 5 | print(salary_data) 6 | plt.pie(salary_data['salary'], labels = salary_data.index) 7 | plt.show() 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/22_selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | driver = webdriver.Chrome(executable_path=r"C:\chromedriver_win32\chromedriver.exe")  # raw string avoids invalid escape sequences; note that Selenium 4 replaces executable_path with a Service object and find_element_by_* with find_element(By..., ...) 3 | driver.implicitly_wait(100) 4 | driver.get("http://demo.guru99.com/test/newtours/") 5 | driver.find_element_by_xpath("//tbody/tr[2]/td[2]/input[1]").send_keys("mercury") 6 | driver.find_element_by_name("password").send_keys("mercury") 7 | driver.find_element_by_name("login").submit() 8 | driver.close() 9 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/23_DB.py: -------------------------------------------------------------------------------- 1 | import mysql.connector as mysql 2 | db = mysql.connect(host="localhost", user="jaga", passwd="Home@123", database="sql_practise") 3 | cursor = db.cursor()  # a cursor object is required before executing queries 4 | query = "SELECT * FROM Orders" 5 | cursor.execute(query) 6 | print(cursor.fetchall()) -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/Calculator.py: -------------------------------------------------------------------------------- 1 | # calculator 2 | import operation 3 | import math 4 | 5 | 6 | # def cls(): 7 | # print "\n"*100 8 | 9 | def main_menu_dis(): 10 | # cls() 11 | print('#' * 100) 12 | print('############ CALCULATOR ######################') 13 | print('#' * 100) 14 | print(' 1. Simple calc ') 15 | print(' 2. sci calc ') 16 | print(' 3. exit ') 17 | print("Enter your Choice") 18 | num = int(input()) 19 | menu_choice(num) 20 | 21 | 22 | def menu_choice(num): 23 | print(num) 24 | if(num == 1): 25 | print('simple') 26 | simple_calc_menu() 27 | num2 = int(input("Enter option for simple calculator")) 28 | sim_calc_menu_choice(num2) 29 | elif(num == 2): 30 | sci_calc_menu() 31 | num2 = int(input("Enter option for scientific calculator")) 32 | sci_calc_menu_choice(num2) 33 | else: 34 | print('invalid') 35 | 36 | 37 | def simple_calc_menu(): 38 | 39 | print(' Simple calc ') 40 | print(' 1. ADD ') 41 | print(' 2. SUB ') 42 | print(' 3. MUL ') 43 | print(' 4. DIV ') 44 | print(' 5. GO BACK ') 45 | print("Enter your Choice") 46 | 47 | 48 | def sci_calc_menu(): 49 | print(' Scientific Calculator ') 50 | print(' 1. sin ') 51 | print(' 2. cos ') 52 | print(' 3. power of ') 53 | print(' 4. square root ') 54 | print(' 5. 
GO BACK ') 55 | print("Enter your Choice") 56 | 57 | 58 | # ----------------------------------------------------------------------------------------------- 59 | 60 | def sim_calc_menu_choice(num2): 61 | if num2 == 1: 62 | print("Enter two Numbers to be added:") 63 | a = int(input()) 64 | b = int(input()) 65 | operation.my_add(a, b) 66 | print("Sum of two numbers is:", operation.my_add(a, b)) 67 | print("Press any key to continue") 68 | num3 = int(input()) 69 | simple_calc_menu() 70 | num2 = int(input()) 71 | sim_calc_menu_choice(num2) 72 | elif num2 == 2: 73 | print("Enter two Numbers to be subtracted:") 74 | a = int(input()) 75 | b = int(input()) 76 | operation.my_sub(a, b) 77 | print("difference of two numbers is:", operation.my_sub(a, b)) 78 | print("Press any key to continue") 79 | num3 = int(input()) 80 | simple_calc_menu() 81 | num2 = int(input()) 82 | sim_calc_menu_choice(num2) 83 | elif num2 == 3: 84 | print("Enter two Numbers to be multiplyed:") 85 | a = int(input()) 86 | b = int(input()) 87 | operation.my_mul(a, b) 88 | print("Result of multiplication of two numbers is:", operation.my_mul(a, b)) 89 | num3 = int(input()) 90 | simple_calc_menu() 91 | num2 = int(input()) 92 | sim_calc_menu_choice(num2) 93 | elif num2 == 4: 94 | print("Enter two Numbers to be divided:") 95 | a = int(input()) 96 | b = int(input()) 97 | operation.my_div(a, b) 98 | print("Result of division of two numbers is:", operation.my_div(a, b)) 99 | num3 = int(input()) 100 | simple_calc_menu() 101 | num2 = int(input()) 102 | sim_calc_menu_choice(num2) 103 | elif num2 == 5: 104 | main_menu_dis() 105 | else: 106 | print('Invalid Choice') 107 | 108 | 109 | def sci_calc_menu_choice(num2): 110 | if num2 == 1: 111 | print("Enter degree:") 112 | a = int(input()) 113 | operation.my_sin(a) 114 | print("Sin Value is :", operation.my_sin(a)) 115 | simple_calc_menu() 116 | num2 = int(input()) 117 | sim_calc_menu_choice(num2) 118 | elif num2 == 2: 119 | print("Enter degree:") 120 | a = int(input()) 121 | operation.my_cos(a) 122 | print("cos Value is :", operation.my_cos(a)) 123 | simple_calc_menu() 124 | num2 = int(input()) 125 | sim_calc_menu_choice(num2) 126 | elif num2 == 3: 127 | print("Enter two number to calculate power:") 128 | a = int(input()) 129 | b = int(input()) 130 | operation.my_pow(a, b) 131 | print("power of two Value is :", operation.my_pow(a, b)) 132 | simple_calc_menu() 133 | num2 = int(input()) 134 | sim_calc_menu_choice(num2) 135 | elif num2 == 4: 136 | print("Enter number to which square root needs to be calculated:") 137 | a = int(input()) 138 | operation.my_sqrt(a) 139 | print("power of two Value is :", operation.my_sqrt(a)) 140 | simple_calc_menu() 141 | num2 = int(input()) 142 | sim_calc_menu_choice(num2) 143 | elif num2 == 5: 144 | main_menu_dis() 145 | else: 146 | print('Invalid Choice') 147 | 148 | 149 | main_menu_dis() 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/input.csv: -------------------------------------------------------------------------------- 1 | id,name,salary,start_date,dept 2 | 1,Rick,623.3,2012-01-01,IT 3 | 2,Dan,515.2,2013-09-23,Operations 4 | 3,Tusar,611,2014-11-15,IT 5 | 4,Ryan,729,2014-05-11,HR 6 | 5,Gary,843.25,2015-03-27,Finance 7 | 6,Rasmi,578,2013-05-21,IT 8 | 7,Pranab,632.8,2013-07-30,Operations 9 | 8,Guru,722.5,2014-06-17,Finance -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Python/operation.py: 
-------------------------------------------------------------------------------- 1 | # operation 2 | import math 3 | 4 | def my_add(a, b): 5 | c = a + b 6 | return c 7 | 8 | def my_sub(a, b): 9 | c = a - b 10 | return c 11 | 12 | def my_mul(a, b): 13 | c = a * b 14 | return c 15 | 16 | def my_div(a, b): 17 | c = a / b 18 | return c 19 | 20 | def my_sin(a): 21 | d = (math.pi / 180) * a  # convert the given degrees to radians (was hard-coded to 90) 22 | c = math.sin(d) 23 | return c 24 | 25 | def my_cos(a): 26 | d = (math.pi / 180) * a  # convert the given degrees to radians (was hard-coded to 90) 27 | c = math.cos(d) 28 | return c 29 | 30 | def my_pow(a, b): 31 | c = math.pow(a, b) 32 | return c 33 | 34 | def my_sqrt(a): 35 | c = math.sqrt(a) 36 | return c 37 | 38 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Shell Scripting/01 - Linux commands.md: -------------------------------------------------------------------------------- 1 | Basic Linux commands: 2 | 3 | - Root directory **/** 4 | 5 | - Home directory **~** 6 | 7 | - Current directory **.** 8 | 9 | - Parent directory **..** 10 | 11 | - List the directories and files **ls** 12 | 13 | - List all the files including hidden files **ls -a** 14 | 15 | - List files sorted by modification date **ls -lt** 16 | 17 | - Create a new directory **mkdir** 18 | 19 | - Create 3 directories at one time **mkdir directory1 directory2 directory3** 20 | 21 | - Create a directory, creating its parent and subdirectories as needed **mkdir -p /parent/subdirec/file** 22 | 23 | - Delete/remove a directory; works only when the directory is empty **rmdir** 24 | 25 | - Remove a directory recursively **rm -r directory1** 26 | 27 | - Copy files or directories **cp** 28 | 29 | - Copy directory A recursively to directory B **cp -R A B** 30 | 31 | - Move a file from one folder to another, and also rename a file **mv** 32 | 33 | - Change directory **cd** 34 | 35 | - Copy files or directories between a local and a remote system **scp** 36 | 37 | - Display the contents of a file **cat file1** 38 | 39 | - General search command to find a pattern in text files **grep 'state' /user/jaga/file1.txt** 40 | 41 | - Display the first 10 lines of a file **head file1** 42 | 43 | - Display the last 10 lines of a file **tail file1** 44 | 45 | - Display the first 5 lines of a file **head -5 file1** 46 | 47 | - Display the last 5 lines of a file **tail -5 file1** 48 | 49 | - Display the number of lines in a file **wc -l /user/jaga/file1.txt** 50 | 51 | - Edit a file **vi /user/jaga/file1.txt** 52 | 53 | - Save a file and quit vi **:wq** 54 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Shell Scripting/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/01 - Data Engineering Basics/Shell Scripting/README.md -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Spark/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/01 - Data Engineering Basics/Spark/README.md -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Sql/README.md: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/01 - Data Engineering Basics/Sql/README.md -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Sqoop/01 - What is Sqoop.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Sqoop/02 - How Sqoop works.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Sqoop/03 - Sqoop import.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Sqoop/04 - Sqoop incremental load.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Sqoop/05 - Sqoop export.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Sqoop/06 - Sqoop compression.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Sqoop/07 - Sqoop job.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01 - Data Engineering Basics/Sqoop/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/01 - Data Engineering Basics/Sqoop/README.md -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Airflow/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/02 - Data Engineering Intermediate/Airflow/README.md -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Hbase/01 Hbase Datamodel .md: -------------------------------------------------------------------------------- 1 | ![image](https://user-images.githubusercontent.com/42135673/232831533-3ae1903b-da2b-4531-a4cc-d0408729a163.png) 2 | 3 | From HBase’s point of view, a row is a logical unit, not a physical one. 4 | 5 | In HBase, rows are subdivided along their column families. 6 | 7 | For the client and the user, however, a row appears as one logical unit. 8 | 9 | As data in a particular column family is stored together in one HFile, fetching a database row means to collect all cells (latest timestamp) 10 | in all column families that are referenced by the same row key. 11 | 12 | For writing to a particular cell, this has similar implications as only the HFile that holds the column family containing the cell will be touched. 
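To make the row-key / column-family addressing above concrete, here is a minimal client sketch. It uses the third-party `happybase` Python library purely for illustration; the Thrift host `localhost`, the table name `user`, and the column families `personal` and `contact` are assumptions for this example, not objects that exist in this repo.

```python
import happybase

# Connect to the HBase Thrift server (assumed to be running on localhost:9090).
connection = happybase.Connection('localhost')
table = connection.table('user')

# Each cell is addressed by (row key, column family:qualifier); the two
# families below may live in different HFiles on disk.
table.put(b'row-1', {b'personal:name': b'bhavik',
                     b'contact:email': b'bhavik@example.com'})

# Reading the logical row collects the latest cell from every column family.
print(table.row(b'row-1'))
connection.close()
```

A write only touches the store files of the column families involved, which is why column-family layout matters for write-heavy workloads.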
13 | 14 | In fact, a “row” is only one special way of viewing related data. The HBase reference states that HBase should rather be thought of 15 | as a “multi-dimensional hash map”, where the combination (namespace, table, column family, column qualifier, time stamp) are the key to a particular value. 16 | This how HBase relates to KV-based NoSQL solutions. 17 | 18 | Every data object stored in an HBase cell is interpreted as an array of bytes, even column qualifiers/names and row keys are considered as byte arrays. 19 | 20 | The mechanism can be used to easily store binary data (e.g. MOBs – medium sized objects, such as images, 10 KB – 10MB). 21 | 22 | While this gives a maximum of flexibility to the database developer, the validation of data types and correct value ranges must be done inside the application. 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Hbase/02 Hbase commands.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Hbase/03 Phoenix Overview.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Hbase/04 Phoenix commands.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Hbase/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/02 - Data Engineering Intermediate/Hbase/README.md -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Hive/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/02 - Data Engineering Intermediate/Hive/README.md -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Kafka/01 What is messaging.md: -------------------------------------------------------------------------------- 1 | 2 | Messaging is a key data integration strategy employed in many distributed environments such as the cloud. 3 | 4 | Messaging supports asynchronous operations, enabling you to decouple a process that consumes a service from the process that implements the service. 5 | 6 | ![image](https://user-images.githubusercontent.com/42135673/232835267-9521697a-55eb-4f0f-a101-4c45095313d5.png) 7 | 8 | Messaging connects multiple applications in an exchange of data. 9 | 10 | Messaging uses an encapsulated asynchronous approach to exchange data through a network. 11 | 12 | A traditional messaging system has two models of abstraction: 13 | 14 | Queue – a message channel where a single message is received exactly by one consumer in a point-to-point message-queue pattern. 15 | If there are no consumers available, the message is retained until a consumer processes the message. 
16 | 17 | Topic - a message feed that implements the publish-subscribe pattern and broadcasts messages to consumers that subscribe to that topic. 18 | 19 | Messaging Models: 20 | 21 | 1. Point to Point 22 | 2. Publish and Subscribe 23 | 24 | ![image](https://user-images.githubusercontent.com/42135673/232836338-409f9e17-9865-47e2-a67f-e9818d9c2e55.png) 25 | 26 | Kafka is an example of publish-and-subscribe messaging model 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Kafka/02 Kafka Overview.md: -------------------------------------------------------------------------------- 1 | ![image](https://user-images.githubusercontent.com/42135673/232836795-c3766164-6735-4e47-89a3-7dcfd103c88e.png) 2 | 3 | Kafka is a unique distributed publish-subscribe messaging system written in the Scala language with multi-language support and 4 | runs on the Java Virtual Machine (JVM). 5 | 6 | 7 | Kafka has high-throughput and is built to scale-out in a distributed model on multiple servers. 8 | 9 | Kafka persists messages on disk and can be used for batched consumption as well as real time applications. 10 | 11 | -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Kafka/03 Kafka Architecture.md: -------------------------------------------------------------------------------- 1 | ![image](https://user-images.githubusercontent.com/42135673/232838913-e37e4947-3ce5-4b62-bda1-6ce02a2f672e.png) 2 | 3 | 4 | Kafka maintains data feeds of messages in categories called topics. 5 | 6 | Messages are published to topics in Kafka by processes called producers. 7 | 8 | Messages are consumed from topics in Kafka by consuming processes called subscribers. 9 | 10 | Kafka is run in a distributed environment cluster comprising of one or more servers each of which is called a broker. 11 | 12 | Kafka generalizes (combines) the traditional messaging model of queuing (with a queue) and publish-subscribe (with a topic) 13 | using a single consumer abstraction called – a consumer group. 14 | 15 | Kafka message delivery is based on subscriber group name or identity. 16 | 17 | Two or more separate consumer subscriber instances that need to get messages at least once from Kafka will subscribe to Kafka based on 18 | using different consumer group name or identification. All messages will be broadcast to all consumers in a publish-subscribe pattern. 19 | 20 | If two or more consumer subscriber instances use the same consumer group name, then messages are load-balanced (evenly divided) across all instances 21 | of consumers with the same consumer group name and works like the traditional queue model. 
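As a rough illustration of the queue-versus-broadcast behaviour of consumer groups described above, here is a minimal sketch using the third-party `kafka-python` package. The broker address `localhost:9092`, topic name `orders`, and group id `billing-group` are assumptions made for this example only.

```python
from kafka import KafkaProducer, KafkaConsumer

# Producer: publish a few messages to the 'orders' topic.
producer = KafkaProducer(bootstrap_servers='localhost:9092')
for i in range(5):
    producer.send('orders', f'order-{i}'.encode('utf-8'))
producer.flush()

# Consumer: instances sharing the same group_id split the topic's partitions
# between them (queue semantics); instances with different group_ids each
# receive every message (publish-subscribe semantics).
consumer = KafkaConsumer(
    'orders',
    bootstrap_servers='localhost:9092',
    group_id='billing-group',
    auto_offset_reset='earliest',
)
for message in consumer:
    print(message.partition, message.offset, message.value)
```

Running a second copy of the consumer with the same `group_id` would rebalance the partitions across both processes, which is exactly the load-balanced behaviour noted above.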
22 | -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Kafka/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/02 - Data Engineering Intermediate/Kafka/README.md -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Modelling/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/02 - Data Engineering Intermediate/Modelling/README.md -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Spark/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/02 - Data Engineering Intermediate/Spark/README.md -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Tableau/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/02 - Data Engineering Intermediate/Tableau/README.md -------------------------------------------------------------------------------- /02 - Data Engineering Intermediate/Teradata/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/02 - Data Engineering Intermediate/Teradata/README.md -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/AWS/00 AWS Important Services.md: -------------------------------------------------------------------------------- 1 | **Analytics** 2 | - Athena 3 | - Amazon Elasticsearch Service (Amazon ES) 4 | - Amazon EMR 5 | - AWS Glue 6 | - Amazon Kinesis 7 | - Amazon QuickSight 8 | 9 | **Storage** 10 | - Amazon Elastic Block Store (Amazon EBS) 11 | - Amazon Elastic File System (Amazon EFS) 12 | - Amazon FSx 13 | - Amazon S3 14 | - Amazon S3 Glacier 15 | - AWS Storage Gateway 16 | 17 | **Database** 18 | - Amazon Aurora 19 | - Amazon DynamoDB 20 | - Amazon ElastiCache 21 | - Amazon RDS 22 | - Amazon Redshift 23 | 24 | 25 | **Compute** 26 | - Amazon EC2 27 | - AWS Elastic Beanstalk 28 | - Amazon Elastic Container Service (Amazon ECS) 29 | - Amazon Elastic Kubernetes Service (Amazon EKS) 30 | - Elastic Load Balancing 31 | - AWS Fargate 32 | - AWS Lambda 33 | 34 | **Management and Governance** 35 | - AWS Auto Scaling 36 | - AWS Backup 37 | - AWS CloudFormation 38 | - AWS CloudTrail 39 | - Amazon CloudWatch 40 | - AWS Config 41 | - Amazon EventBridge (Amazon CloudWatch Events) 42 | - AWS Organizations 43 | - AWS Resource Access Manager 44 | - AWS Systems Manager 45 | - AWS Trusted Advisor 46 | 47 | 48 | 49 | **Migration and Transfer** 50 | - AWS Database Migration Service (AWS DMS) 51 | - AWS DataSync 52 | - AWS Migration Hub 53 | - AWS Server Migration Service (AWS SMS) 54 | - AWS Snowball 55 | - AWS Transfer Family 56 | 57 | **Application Integration** 58 | - Amazon Simple 
Notification Service (Amazon SNS) 59 | - Amazon Simple Queue Service 60 | 61 | 62 | **Networking and Content Delivery** 63 | - Amazon API Gateway 64 | - Amazon CloudFront 65 | - AWS Direct Connect 66 | - AWS Global Accelerator 67 | - Amazon Route 53 68 | - AWS Transit Gateway 69 | - Amazon VPC 70 | 71 | **Security, Identity, and Compliance** 72 | - AWS Certificate Manager (ACM) 73 | - AWS Directory Service 74 | - Amazon GuardDuty 75 | - AWS Identity and Access Management (IAM) 76 | - Amazon Inspector 77 | - AWS Key Management Service (AWS KMS) 78 | - Amazon Macie 79 | - AWS Secrets Manager 80 | - AWS Shield 81 | - AWS Single Sign-On 82 | - AWS WAF 83 | 84 | **AWS Billing and Cost Management** 85 | - AWS Budgets 86 | - Cost Explorer 87 | 88 | 89 | -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/AWS/01 AWS Compute.md: -------------------------------------------------------------------------------- 1 | Compute forms the nucleus of creating and executing business. 2 | 3 | AWS provides several compute products that allows to deploy, run, and scale applications as virtual servers, code, or containers. 4 | 5 | Compute Covers 6 | 7 | • Simple websites and applications on one or a few servers 8 | • Control and manage cluster or server level functions such as deployment and scaling 9 | • Manage stateful or stateless applications packaged as Docker containers 10 | • Stateless, event-initiated applications that require fast response times 11 | 12 | Different Compute services offered by AWS. 13 | 14 | • Amazon EC2 - Virtual Servers in the Cloud 15 | • Amazon EC2 Container Service - Run and Manage Docker Containers 16 | • AWS Lambda - Run Code in Response to Events 17 | • Amazon EC2 Container Registry - Store and Retrieve Docker Images 18 | • Amazon Lightsail - Launch and Manage Virtual Private Servers 19 | • Amazon VPC - Isolated Cloud Resources 20 | • AWS Batch - Run Batch Jobs at Any Scale 21 | • AWS Elastic Beanstalk - Run and Manage Web Apps 22 | • Auto Scaling - Automatic Elasticity 23 | 24 | **Amazon EC2** 25 | offers resizable cloud-based compute capability taking shape as virtual servers. 26 | There are a broad range of instance types that are easily manageable and exhibit different combinations of networking capacity, storage size, amount of memory, and CPU power. 27 | 28 | Features 29 | 30 | • Removes the necessity of upfront investment on computer hardwares. 31 | • Commission numerous instances simultaneously 32 | • Pay only for the used quantity 33 | • Change web-scale cloud computing easy 34 | 35 | Applications 36 | 37 | • Big data - e.g. Hadoop 38 | • Database software - e.g. Aurora, DynamoDB 39 | • Enterprise applications - e.g. SAP, Oracle 40 | • Migrations from on-premises environments 41 | • Open-source cluster management 42 | 43 | **Amazon EC2 Container Service (ECS)** 44 | 45 | Amazon ECS is a scalable, performance container management service to include Docker containers. 46 | It enables you to run applications at ease on a managed cluster of Amazon EC2 instances. 47 | Amazon ECS removes the necessity to install, run, and scale cluster management infrastructure. 48 | 49 | Applications 50 | • Web applications 51 | • Microservices 52 | • Batch jobs 53 | • Docker workloads 54 | 55 | **AWS Lambda** 56 | 57 | AWS Lambda aims to run code without managing or provisioning servers. 58 | 59 | • Lambda is the nucleus of serverless computing. So build and run services and applications without bothering about servers. 
60 | • Run code for virtually any type of backend service or application - with no administration. 61 | • Upload the code, and Lambda handles everything needed to run and scale it with high availability. 62 | • Pay only for the compute time that you use. 63 | • Create code that is triggered automatically by other AWS services, or call it directly from a mobile or web app. 64 | 65 | Applications 66 | 67 | • Web applications 68 | • Mobile backends 69 | • IoT backends 70 | • Stream processing and file processing workloads 71 | -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/AWS/02 AWS Storage Services.md: -------------------------------------------------------------------------------- 1 | Storage in AWS Cloud 2 | 3 | Cloud storage is a vital component of cloud computing, holding the information used by applications. 4 | 5 | • Cloud storage is more secure, scalable, and reliable than conventional on-premises storage systems. 6 | • AWS provides a complete set of cloud storage services to support archival and application compliance needs. 7 | • Backup and archive applications, databases, Internet of Things workloads, data warehouses, and big data analytics all depend on data storage architecture. 8 | 9 | **Amazon S3** 10 | can be described as object storage that stores and retrieves any quantity of data from anywhere on the internet. 11 | Amazon S3 features a simple web service interface. 12 | 13 | • Delivers very high durability, 14 | • Scales past trillions of objects globally, 15 | • Once data is saved in S3, it can be tiered automatically into low-cost, long-term cloud storage classes such as Amazon Glacier and S3 Standard - Infrequent Access for archiving. 16 | • Serves as a target for backup, recovery, and disaster recovery; a "data lake" for big data analytics; a tier in an active archive; and a bulk repository for user-generated content. 17 | 18 | **Amazon Glacier** 19 | 20 | Amazon Glacier is a durable, secure and economical cloud storage service for long-term backup and data archiving. 21 | 22 | • Customers can safely store any data for as low as $0.004 per gigabyte per month. 23 | • Considerable savings compared to on-premises solutions. 24 | • Amazon Glacier offers three retrieval options, with access times ranging from a few minutes to several hours. 25 | 26 | **Amazon Elastic File System** 27 | 28 | Amazon EFS offers simple, scalable file storage that can be used with Amazon EC2 instances in the AWS Cloud. 29 | • Amazon EFS is easy to use and provides a simple interface that lets you create and configure file systems quickly. 30 | • Storage capacity is elastic, i.e. it grows or shrinks automatically. 31 | • Several Amazon EC2 instances can access an Amazon EFS file system simultaneously, letting Amazon EFS offer a common data source for workloads and applications running on more than one Amazon EC2 instance. 32 | Application 33 | Amazon EFS offers high durability and availability for big data and analytics applications, container storage, web and content serving, media processing workflows, and enterprise applications. 34 | 35 | -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/AWS/03 - AWS database services.md: -------------------------------------------------------------------------------- 1 | Database in AWS Cloud 2 | 3 | AWS provides a broad range of database services to suit all application needs.
These database services can be started quickly and are fully managed. 4 | 5 | AWS database services include: 6 | 7 | • Amazon Relational Database Service (Amazon RDS) - supporting six widely used database engines 8 | • Amazon DynamoDB - a fast and flexible NoSQL database service 9 | • Amazon Aurora - a MySQL-compatible relational database that delivers up to five times the performance of standard MySQL 10 | • Amazon ElastiCache - an in-memory cache service with Memcached and Redis support. 11 | • Amazon Redshift - a petabyte-scale data warehouse service. 12 | 13 | AWS also provides the AWS Database Migration Service - a service that makes it easy and economical to migrate databases to the AWS cloud. 14 | 15 | **Amazon RDS** 16 | 17 | Amazon RDS makes it easy to set up, run, and scale a relational database in the cloud. 18 | 19 | • Offers resizable, low-cost capacity while handling time-consuming database administration tasks. 20 | • Amazon RDS offers six familiar database engines to select from, including Microsoft SQL Server, Oracle, MariaDB, Amazon Aurora, MySQL, and PostgreSQL. 21 | 22 | **DynamoDB** 23 | 24 | Amazon DynamoDB is a fast and flexible NoSQL database service for all applications that require consistent, single-digit millisecond latency at any scale. 25 | 26 | • A fully managed cloud database, supporting both document and key-value store models. 27 | • Its reliable performance and flexible data model make it suitable for many applications such as IoT, mobile, ad tech, web, and gaming. 28 | 29 | According to Forrester, Amazon DynamoDB is the most popular NoSQL cloud database. 30 | -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/AWS/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/AWS/README.md -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/Azure/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/Azure/README.md -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/CICD/Git/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/CICD/Git/README.md -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/CICD/Jenkins/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/CICD/Jenkins/README.md -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/CICD/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/CICD/README.md
-------------------------------------------------------------------------------- /03 - Data Engineering Advanced/DSA/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/DSA/README.md -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/Databricks/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/Databricks/README.md -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/Design Principles/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/Design Principles/README.md -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/GCP/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/GCP/README.md -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/NOSQL/CosmoDB/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/NOSQL/CosmoDB/README.md -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/NOSQL/DynamoDB/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/NOSQL/DynamoDB/README.md -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/NOSQL/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/NOSQL/README.md -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/Orchestration/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/Orchestration/README.md -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/Performance Tuning/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/Performance Tuning/README.md 
-------------------------------------------------------------------------------- /03 - Data Engineering Advanced/Snowflake/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/Snowflake/README.md -------------------------------------------------------------------------------- /03 - Data Engineering Advanced/Streaming/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/03 - Data Engineering Advanced/Streaming/README.md -------------------------------------------------------------------------------- /04 - Data Engineering Projects/README.md: -------------------------------------------------------------------------------- 1 | # python_session -------------------------------------------------------------------------------- /05 - Interview Questions/01 - Interview Questions.md: -------------------------------------------------------------------------------- 1 | 1. Kindly explain your project architecture? 2 | 2. Day to Day activity in office 3 | 3. What is the size of data you deal with on daily basis? 4 | 4. What is Repartition and Coalesce? 5 | 5. What optimisation techniques have you used in your project? 6 | 6. What is your role in your project? 7 | 7. What is the most challenging problem you have solved in your big data project? 8 | 8. Can you explain what happens internally when we submit a Spark job using Spark-Submit? 9 | 9. What is a catalyst optimiser? 10 | 10. What is the size of your Spark cluster and the configuration of each node? 11 | 11. How to tune a spark job? Please explain the techniques we can try. 12 | 12. What is Repartition and Coalesce? 13 | 13. What is Spark Context vs Spark Session? 14 | 14. Role in your current project 15 | 15. Difference between dataset & dataframe 16 | 16. Difference between broadcast variable & accumulator 17 | 17. What is broadcast Join? 18 | 18. Types of transformation and difference 19 | 19. What are operations of data frames 20 | 20. Explain Spark on Yarn Architecture 21 | 21. Why reduce is action & reduceByKey transformation 22 | 22. What is cache & persist in Spark 23 | 23. Difference between RDD & Dataframes 24 | 24. What are the challenges you face in spark? 25 | 25. How spark is better than Hive? 26 | 26. How to enforce schema on a data frame? 27 | 27. What is difference between reduceByKey & groupByKey? 28 | 28. How do we submit jar files in Spark? 29 | -------------------------------------------------------------------------------- /05 - Interview Questions/02- Interview questions.md: -------------------------------------------------------------------------------- 1 | 1. What is the mechanism by which you can resubmit failed spark job automatically? 2 | 2. What is Left Semi Join, Left Anti Join in spark? 3 | 3. How Spark ensures fault tolerance? 4 | 4. What is the difference between GroupByKey and ReduceByKey? 5 | 5. What are wide and narrow transformations?/ when does stage boundary occur? 6 | 6. What is broadcast variable and accumulator? 7 | 7. How to broadcast a small data frame? 8 | 8. What is RDD? 9 | 9. How to create RDD? 10 | 10. Have you done any Optimization in spark? 11 | 11. How to read XML files using MapReduce? 12 | 12. What is the difference between HiveQL and SparkSQL? 13 | 13. 
What is the difference between SQL and HiveQL? 14 | 14. What is DAG and what is its use of it? 15 | 15. Given, we have a transformation,. actions, transformations again and one more action, how many DAGS will be there? / What is the output of transformation and actions? 16 | 16. What are the options that can be specified along with the spark-submit script for memory allocation?/ How do we specify class while submitting an application /How will you specify parallelism in spark? 17 | 17. What are various deploy modes in spark-submit? 18 | 18. Difference between yarn-client and yarn-cluster mode? 19 | 19. What is your understanding of specifying “total-executor cores “ while submitting the spark job 20 | 20. How to dynamically control the number of executors? 21 | 21. What is the difference between partition and partitioner? 22 | 22. How to find the spark version 23 | 23. If on the cluster, you are getting a memory error, how will you resolve it? 24 | 24. What is transformation and what is the action? What difference between them? 25 | 25. What transformations you have used? 26 | 26. What actions you have used? 27 | 27. What is the Difference between apache spark and apache storm? 28 | 28. What is Lazy Evaluation in Apache Spark? 29 | 29. What is the difference between cache and persist? 30 | -------------------------------------------------------------------------------- /05 - Interview Questions/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/05 - Interview Questions/README.md -------------------------------------------------------------------------------- /06 - Productivity Tips/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/06 - Productivity Tips/README.md -------------------------------------------------------------------------------- /07 - DE Best Linkedin Posts Links/01 Best Linkedin Post for DE.md: -------------------------------------------------------------------------------- 1 | - **Spark Optimization** 2 | 3 | https://www.linkedin.com/feed/update/urn:li:activity:7054036305281748992/ 4 | https://www.linkedin.com/feed/update/urn:li:activity:7008294500754165760/?updateEntityUrn=urn%3Ali%3Afs_feedUpdate%3A%28V2%2Curn%3Ali%3Aactivity%3A7008294500754165760%29 5 | 6 | - **Why Lakeshouse?** 7 | 8 | https://www.linkedin.com/feed/update/urn:li:activity:7053974734761340929/ 9 | 10 | 11 | - **Object Storage Vs File Storage** 12 | 13 | https://www.linkedin.com/feed/update/urn:li:activity:7054027530697138176/ 14 | 15 | 16 | - **Data Skew in APache Spark** 17 | 18 | https://www.linkedin.com/feed/update/urn:li:activity:7053416679162200064/ 19 | 20 | - **Spark COnfigurations** 21 | 22 | https://www.linkedin.com/feed/update/urn:li:activity:7051796102588579840/ 23 | 24 | - **Normalization Vs Denormalization** 25 | 26 | https://www.linkedin.com/posts/bigdatabysumit_dataanalysis-dataengineering-dbms-activity-6988859629183123456-dR3C/?utm_source=share&utm_medium=member_desktop 27 | 28 | - **6 Delta Engine Optimizations in Databricks** 29 | 30 | https://www.linkedin.com/posts/bigdatabysumit_bigdata-dataengineering-apachespark-activity-6988841755009830912-edyM/?utm_source=share&utm_medium=member_desktop 31 | 32 | 33 | - **Complete Data Warehousing concepts** 34 | 35 | 
https://www.linkedin.com/feed/update/urn:li:activity:7053600934710448128/ 36 | 37 | - **End to End Pipeline flow** 38 | 39 | https://www.linkedin.com/posts/bigdatabysumit_bigdata-dataengineering-apachespark-activity-6986697225535971329-qNwa/?utm_source=share&utm_medium=member_desktop 40 | 41 | - **Azure Data Engineeering roadmap** 42 | 43 | https://www.linkedin.com/feed/update/urn:li:activity:7053616549487661056/ 44 | 45 | - **Azure Data Engineer Preparation guide** 46 | https://www.linkedin.com/feed/update/urn:li:activity:7051809515054252032/ 47 | 48 | - **Data Engineering Projects** 49 | 50 | https://www.linkedin.com/feed/update/urn:li:activity:7052512734155853824/ 51 | https://www.linkedin.com/feed/update/urn:li:activity:7018184970011615233/ 52 | https://www.linkedin.com/feed/update/urn:li:activity:7048326060605599745/ 53 | https://www.linkedin.com/feed/update/urn:li:activity:7053781085008101377/ 54 | https://www.linkedin.com/feed/update/urn:li:activity:7011543533010509824/ 55 | 56 | - **AWS Data Engineer** 57 | https://www.linkedin.com/feed/update/urn:li:activity:7051422119175352320/ 58 | https://www.linkedin.com/feed/update/urn:li:activity:7008326785633312768/ 59 | 60 | 61 | - **Apache Spark Series** 62 | https://www.linkedin.com/feed/update/urn:li:activity:7048627816623476736/ 63 | 64 | - **Hive Optimization** 65 | 66 | https://www.linkedin.com/feed/update/urn:li:activity:7045261720075857920/?updateEntityUrn=urn%3Ali%3Afs_feedUpdate%3A%28V2%2Curn%3Ali%3Aactivity%3A7045261720075857920%29 67 | 68 | - **Databricks DE Associate** 69 | https://www.linkedin.com/feed/update/urn:li:activity:7015225769610149888/ 70 | https://www.linkedin.com/feed/update/urn:li:activity:7034379774881906688/?updateEntityUrn=urn%3Ali%3Afs_feedUpdate%3A%28V2%2Curn%3Ali%3Aactivity%3A7034379774881906688%29 71 | 72 | - **Design modern datawarehouse with data lake** 73 | https://www.linkedin.com/feed/update/urn:li:activity:7045233067963101184/ 74 | 75 | - **ETL Vs ELT** 76 | https://www.linkedin.com/feed/update/urn:li:activity:7006970102575239168/?updateEntityUrn=urn%3Ali%3Afs_feedUpdate%3A%28V2%2Curn%3Ali%3Aactivity%3A7006970102575239168%29 77 | -------------------------------------------------------------------------------- /07 - DE Best Linkedin Posts Links/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/07 - DE Best Linkedin Posts Links/README.md -------------------------------------------------------------------------------- /08 - DE Best Youtube Channels Links/01 - Youtube Links.md: -------------------------------------------------------------------------------- 1 | **SQL Playlist - To learn SQL** 2 | 3 | https://www.youtube.com/watch?v=zAOUpVM6R6I&list=PLtgiThe4j67rAoPmnCQmcgLS4iIc5ungg 4 | 5 | **Snowflake Playlist - For beginners** 6 | 7 | https://www.youtube.com/watch?v=AR88dZG-hwo&list=PLba2xJ7yxHB73xHFsyu0YViu3Hi6Ckxzj 8 | 9 | **Azure Fundamentals** 10 | 11 | https://www.youtube.com/watch?v=hgon34zYJIo&list=PLL_oDTv-DAl9Eh9fY-J4MHpoOyRl9Kbj5 12 | 13 | **Devops** 14 | 15 | https://www.youtube.com/watch?v=414LePbveVQ&list=PLdpzxOOAlwvIKMhk8WhzN1pYoJ1YU8Csa 16 | 17 | **DE Projects** 18 | 19 | https://www.youtube.com/@DarshilParmar/playlists 20 | 21 | **AWS DE Projects** 22 | 23 | https://www.youtube.com/@DataEngUncomplicated/playlists 24 | 25 | **SQL Advanced** 26 | 27 | https://www.youtube.com/@ankitbansal6/playlists 28 | 29 | **Datawarehousing 
concepts** 30 | 31 | https://www.youtube.com/watch?v=CzCrfE6ofZM 32 | 33 | **Learn DE in Tamil** 34 | 35 | https://www.youtube.com/@dataengineeringvideos/playlists 36 | 37 | **Booster**: 38 | 39 | https://www.youtube.com/@SeattleDataGuy/playlists 40 | https://www.youtube.com/@DatawithZach 41 | 42 | -------------------------------------------------------------------------------- /08 - DE Best Youtube Channels Links/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/08 - DE Best Youtube Channels Links/README.md -------------------------------------------------------------------------------- /09 - DE Best Books Lists & summary/01 Best Books Lists for DE.md: -------------------------------------------------------------------------------- 1 | - *'Fundamentals of Data Engineering' by Joe Reis & Matthew Housley* 2 | 3 | - *'The Data Warehouse Toolkit' by Ralph Kimball & Margy Ross* 4 | 5 | - *'Designing Data-Intensive Applications' by Martin Kleppmann* 6 | -------------------------------------------------------------------------------- /09 - DE Best Books Lists & summary/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/09 - DE Best Books Lists & summary/README.md -------------------------------------------------------------------------------- /10 - Data Engineering Certifications/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/10 - Data Engineering Certifications/README.md -------------------------------------------------------------------------------- /11 - DE Best Articles Links/01 AWS Reference.md: -------------------------------------------------------------------------------- 1 | **AWS S3 Event Triggering** 2 | 3 | https://awstip.com/aws-s3-event-triggering-shell-script-used-by-netflix-airbnb-adobe-expedia-and-others-53b6822e2192 4 | https://github.com/writetoritika/shell-scripting-projects/tree/main/aws-event-triggering 5 | -------------------------------------------------------------------------------- /11 - DE Best Articles Links/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/11 - DE Best Articles Links/README.md -------------------------------------------------------------------------------- /12 - DE Important Reference Documents/Azure data lake.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/12 - DE Important Reference Documents/Azure data lake.pdf -------------------------------------------------------------------------------- /12 - DE Important Reference Documents/DSA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/12 - DE Important Reference Documents/DSA.pdf -------------------------------------------------------------------------------- 
/12 - DE Important Reference Documents/Database design concepts.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JagadeeshwaranM/Data_Engineering_Simplified/f81b64e1ec690014211fe8a53d1a7ae13fc8ee83/12 - DE Important Reference Documents/Database design concepts.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **Data Engineering Roadmap** 2 | 3 | 1. Learn SQL... 4 | Aggregations with GROUP BY 5 | Joins (INNER, LEFT, FULL OUTER) 6 | Window functions 7 | Common table expressions etc. 8 | 9 | You can learn from https://www.w3schools.com/ 10 | 11 | 2. Learn Python/Scala..... 12 | Learn the basics: for/while loops, if/else, 13 | functional programming, abstract methods, traits 14 | Learn libraries like NumPy, pandas, scikit-learn etc. 15 | 16 | You can learn from https://lnkd.in/gSz45km5 17 | 18 | 3. Learn distributed computing... 19 | Hadoop versions/Hadoop architecture 20 | Fault tolerance in Hadoop 21 | Read about and understand MapReduce processing. 22 | Learn the optimizations used in MapReduce etc. 23 | 24 | 4. Learn data ingestion tools... 25 | Learn Sqoop/Kafka/NiFi 26 | Understand their functionality and how their jobs run. 27 | 28 | 5. Learn data processing/NoSQL.... 29 | Spark architecture/RDDs/DataFrames/Datasets. 30 | Lazy evaluation, DAGs/lineage graph/optimization techniques 31 | YARN utilization/Spark Streaming etc. 32 | 33 | 6. Learn data warehousing..... 34 | Understand how Hive stores and processes data 35 | Different file formats/compression techniques. 36 | Partitioning/bucketing. 37 | Different UDFs available in Hive. 38 | SCD concepts. 39 | E.g. HBase, Cassandra 40 | 41 | 7. Learn job orchestration... 42 | Learn Airflow/Oozie 43 | Learn about workflows/CRON etc. 44 | 45 | 8. Learn cloud computing.... 46 | Learn Azure/AWS/GCP. 47 | Understand the significance of the cloud in #dataengineering 48 | Learn Azure Synapse/Redshift/BigQuery 49 | Learn ingestion/pipeline tools like ADF etc. 50 | 51 | 9. Learn the basics of CI/CD and Linux commands.... 52 | Read about Kubernetes/Docker and how crucial they are in data engineering. 53 | --------------------------------------------------------------------------------
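To make roadmap step 5 (Spark DataFrames and lazy evaluation) a little more concrete, here is a minimal PySpark sketch; it is only an illustration, assumes a local `pyspark` installation, and uses made-up sample rows in the spirit of the repo's input.csv.

```python
from pyspark.sql import SparkSession

# Start a local Spark session.
spark = SparkSession.builder.master("local[1]").appName("roadmap-demo").getOrCreate()

data = [("Rick", "IT", 623.3), ("Dan", "Operations", 515.2), ("Ryan", "HR", 729.0)]
df = spark.createDataFrame(data, ["name", "dept", "salary"])

# filter/groupBy/avg are transformations, so they are evaluated lazily;
# Spark only builds the execution plan (DAG) here.
summary = df.filter(df.salary > 600).groupBy("dept").avg("salary")

# show() is an action, so only now does Spark execute the plan.
summary.show()
spark.stop()
```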