├── Chapter-2
│   ├── main.py
│   ├── requirements.txt
│   ├── push_to_database.py
│   ├── push_to_blob.py
│   └── scrape.py
├── LICENSE
└── README.md
/Chapter-2/main.py:
--------------------------------------------------------------------------------
import os

# Run the two pipeline steps in order: upload the Parquet files to blob
# storage first, then load the DataFrames into the database.
os.system('python push_to_blob.py')
os.system('python push_to_database.py')
--------------------------------------------------------------------------------
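main.py shells out to the two scripts in sequence with `os.system`, which ignores failures. A minimal alternative sketch (not part of the book's code) stops the pipeline if the blob upload fails, using only the standard library:

```python
# Hypothetical variant of main.py, not from the repository: check=True makes
# subprocess.run raise CalledProcessError if a step exits non-zero, so the
# database load only runs after a successful blob upload.
import subprocess
import sys

for script in ('push_to_blob.py', 'push_to_database.py'):
    subprocess.run([sys.executable, script], check=True)
```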
/Chapter-2/requirements.txt:
--------------------------------------------------------------------------------
azure-storage-blob==12.17.0
beautifulsoup4==4.12.2
numpy==1.25.2
pandas==2.0.3
psycopg2==2.9.7
pyarrow==12.0.1
Requests==2.31.0
SQLAlchemy==2.0.20
python-dotenv==1.0.0
--------------------------------------------------------------------------------
/Chapter-2/push_to_database.py:
--------------------------------------------------------------------------------
from scrape import (league_table, top_scorers, detail_top, player_table,
                    all_time_table, all_time_winner_club, top_scorers_seasons,
                    goals_per_season)
from sqlalchemy import create_engine
import psycopg2  # PostgreSQL driver; imported so a missing driver fails fast
from dotenv import load_dotenv
import os

load_dotenv()

functions = [league_table, top_scorers, detail_top, player_table, all_time_table,
             all_time_winner_club, top_scorers_seasons, goals_per_season]

# SQLAlchemy database URL, e.g. postgresql+psycopg2://user:password@host:5432/dbname
conn_string = os.getenv('CONN_STRING')

db = create_engine(conn_string)

for fun in functions:
    function_name = fun.__name__
    result_df = fun()  # Call the function to get the DataFrame
    # Pass the engine so pandas manages the connection and commits each write
    result_df.to_sql(function_name, con=db, if_exists='replace', index=False)
    print(f'Pushed data for {function_name}')

# Release the engine's connection pool
db.dispose()
--------------------------------------------------------------------------------
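push_to_database.py reads `CONN_STRING` from the environment (typically a `.env` file loaded by `python-dotenv`). The value should be a SQLAlchemy database URL for the pinned psycopg2 driver; here is a sketch with placeholder credentials (none of these values come from the book):

```python
# Sketch only: every credential below is a placeholder.
from sqlalchemy import URL, create_engine

conn_url = URL.create(
    "postgresql+psycopg2",  # dialect+driver matching the pinned psycopg2
    username="user",
    password="password",
    host="localhost",
    port=5432,
    database="football",
)
engine = create_engine(conn_url)
```

The equivalent string form, `postgresql+psycopg2://user:password@localhost:5432/football`, is what `CONN_STRING` would hold.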
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Packt

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/Chapter-2/push_to_blob.py:
--------------------------------------------------------------------------------
from scrape import (league_table, top_scorers, detail_top, player_table,
                    all_time_table, all_time_winner_club, top_scorers_seasons,
                    goals_per_season)
import pyarrow as pa
import pyarrow.parquet as pq
from io import BytesIO
from azure.storage.blob import BlobServiceClient
#from dotenv import load_dotenv
import os

#load_dotenv()

functions = [league_table, top_scorers, detail_top, player_table, all_time_table,
             all_time_winner_club, top_scorers_seasons, goals_per_season]

def to_blob(func):
    '''
    Converts the output of a given function to Parquet format and uploads it to Azure Blob Storage.

    Args:
        func (function): The function that retrieves data to be processed and uploaded.

    Returns:
        None

    This function takes a provided function, calls it to obtain data, and then converts the data into
    an Arrow Table. The Arrow Table is serialized into Parquet format and uploaded to an Azure Blob
    Storage container specified in the function. The function's name is used as the blob name.

    Example:
        Consider the function "top_scorers". Calling "to_blob(top_scorers)" will process the output
        of "top_scorers", convert it to Parquet format, and upload it to Azure Blob Storage.
    '''
    file_name = func.__name__
    df = func()  # Call the function to get the DataFrame

    # Convert the DataFrame to an Arrow Table and serialize it to an
    # in-memory Parquet buffer
    table = pa.Table.from_pandas(df)
    parquet_buffer = BytesIO()
    pq.write_table(table, parquet_buffer)

    connection_string = 'Insert your blob storage connection key here'
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)

    container_name = "testtech"
    blob_name = f"{file_name}.parquet"
    container_client = blob_service_client.get_container_client(container_name)

    blob_client = container_client.get_blob_client(blob_name)
    blob_client.upload_blob(parquet_buffer.getvalue(), overwrite=True)
    print(f"{blob_name} successfully uploaded")


for func in functions:
    to_blob(func)
--------------------------------------------------------------------------------
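push_to_blob.py ships with a hard-coded placeholder for the connection string and the dotenv lines commented out. A possible variant re-enables them and reads the secret from the environment instead; the variable name `AZURE_STORAGE_CONNECTION_STRING` is our assumption, not from the book:

```python
# Hypothetical replacement for the hard-coded placeholder; the env var name
# AZURE_STORAGE_CONNECTION_STRING is an assumption, not from the book.
import os
from dotenv import load_dotenv
from azure.storage.blob import BlobServiceClient

load_dotenv()
connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
```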
/README.md:
--------------------------------------------------------------------------------
# Cracking the Data Engineering Interview

This is the code repository for [Cracking the Data Engineering Interview](https://www.packtpub.com/product/cracking-the-data-engineering-interview/9781837630776), published by Packt.

**Land your dream job with the help of resume-building tips, over 100 mock questions, and a unique portfolio**

## What is this book about?
Preparing for a data engineering interview can often get overwhelming due to the abundance of tools and technologies, leaving you struggling to prioritize which ones to focus on. This hands-on guide provides you with the essential foundational and advanced knowledge needed to simplify your learning journey.

This book covers the following exciting features:
* Create maintainable and scalable code for unit testing
* Understand the fundamental concepts of core data engineering tasks
* Prepare with over 100 behavioral and technical interview questions
* Discover data engineer archetypes and how they can help you prepare for the interview
* Apply the essential concepts of Python and SQL in data engineering
* Build your personal brand to noticeably stand out as a candidate

If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1837630771) today!

## Instructions and Navigations
All of the code is organized into folders.

The code will look like the following:
```
from scrape import *
import pandas as pd
from sqlalchemy import create_engine
import psycopg2
```
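To run the Chapter 2 pipeline, install the dependencies with `pip install -r requirements.txt` from inside the `Chapter-2` folder, provide the database and Azure Blob Storage credentials the scripts expect (the `CONN_STRING` environment variable and the blob connection string), and then run `python main.py`. These commands assume a standard Python 3 environment; the full setup walkthrough is in the book.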
**Following is what you need for this book:**
If you’re an aspiring data engineer looking for guidance on how to land, prepare for, and excel in data engineering interviews, this book is for you. Familiarity with the fundamentals of data engineering, such as data modeling, cloud warehouses, programming (Python and SQL), building data pipelines, scheduling your workflows (Airflow), and APIs, is a prerequisite.

With the following software and hardware list you can run all code files present in the book (Chapters 1-16).

### Software and Hardware List

| Chapter | Software required   | OS required                        |
| ------- | ------------------- | ---------------------------------- |
| 2       | Microsoft Azure     | Windows, Mac OS X, and Linux (Any) |
| 2       | Amazon Web Services | Windows, Mac OS X, and Linux (Any) |

### Related products
* Data Wrangling with SQL [[Packt]](https://www.packtpub.com/product/data-wrangling-with-sql/9781837630028) [[Amazon]](https://www.amazon.com/dp/183763002X)

* SQL Query Design Patterns and Best Practices [[Packt]](https://www.packtpub.com/product/sql-query-design-patterns-and-best-practices/9781837633289) [[Amazon]](https://www.amazon.com/dp/1837633282)

## Get to Know the Authors
**Kedeisha Bryan** is a data professional with experience in data analytics, science, and engineering. She has prior experience combining Six Sigma and analytics to provide data solutions that have impacted policy changes and leadership decisions. She is fluent in tools such as SQL, Python, and Tableau.
She is the founder and leader of the Data in Motion Academy, providing personalized skill development, resources, and training at scale to aspiring data professionals across the globe. Her other work includes a second Packt book, currently in progress, and an SQL course for LinkedIn Learning.

**Taamir Ransome** is a data scientist and software engineer. He has experience building machine learning and artificial intelligence solutions for the US Army. He is also the founder of the Vet Dev Institute, where he currently provides cloud-based data solutions for clients. He holds a master’s degree in analytics from Western Governors University.
--------------------------------------------------------------------------------
/Chapter-2/scrape.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

def league_table():
    # Scrape the current Premier League table from the BBC
    url = 'https://www.bbc.com/sport/football/premier-league/table'
    headers = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_="ssrcss-14j0ip6-Table e3bga5w5")

    for i in table.find_all('th'):
        title = i.text
        headers.append(title)
    league_table = pd.DataFrame(columns=headers)
    for j in table.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(league_table)
        league_table.loc[length] = row
    # Drop the form column, whose header is scraped as one long string
    league_table.drop(["Form, Last 6 games, Oldest first"], axis=1, inplace=True)
    return league_table

def top_scorers():
    # Scrape the Premier League top-scorers table from the BBC
    url = 'https://www.bbc.com/sport/football/premier-league/top-scorers'
    headers = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_="gs-o-table")

    for i in table.find_all('th'):
        title = i.text
        headers.append(title)
    top_scorers = pd.DataFrame(columns=headers)
    for j in table.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(top_scorers)
        top_scorers.loc[length] = row

    # The Name cell concatenates the player's name with their club; insert a
    # space before each capital letter, then drop repeated words
    top_scorers.Name = top_scorers.Name.replace(r'([A-Z])', r' \1', regex=True).str.split()
    top_scorers.Name = top_scorers.Name.apply(lambda x: ' '.join(dict.fromkeys(x).keys()))

    # Split the combined string into Name (first two words) and Club (the rest)
    top_scorers['Club'] = top_scorers.Name.str.split().str[2:].str.join(' ')
    top_scorers.Name = top_scorers.Name.str.split().str[:2].str.join(' ')
    col = top_scorers.pop("Club")
    top_scorers.insert(2, 'Club', col)
    # Normalize multi-word club names that the split above mangles
    top_scorers.Club = top_scorers.Club.apply(lambda x: 'Manchester City' if 'Manchester City' in x else x)
    top_scorers.Club = top_scorers.Club.apply(lambda x: 'Manchester United' if 'Manchester United' in x else x)
    top_scorers.Club = top_scorers.Club.apply(lambda x: 'Brighton & Hove Albion' if 'Brighton & Hove Albion' in x else x)

    return top_scorers

def detail_top():
    # Scrape a more detailed top-scorers table from worldfootball.net
    url = 'https://www.worldfootball.net/goalgetter/eng-premier-league-2023-2024/'
    headers = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_="standard_tabelle")

    for i in table.find_all('th'):
        title = i.text
        headers.append(title)
    detail_top_scorer = pd.DataFrame(columns=headers)
    for j in table.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(detail_top_scorer)
        detail_top_scorer.loc[length] = row

    detail_top_scorer = detail_top_scorer.drop([''], axis=1)
    detail_top_scorer.Team = detail_top_scorer.Team.str.replace('\n\n', '')
    # Split "Goals (Penalty)" into separate Goals and Penalty columns
    detail_top_scorer['Penalty'] = detail_top_scorer['Goals (Penalty)'].str.split().str[-1:].str.join(' ')
    detail_top_scorer['Penalty'] = detail_top_scorer['Penalty'].str.replace('(', '')
    detail_top_scorer['Penalty'] = detail_top_scorer['Penalty'].str.replace(')', '')
    detail_top_scorer['Goals (Penalty)'] = detail_top_scorer['Goals (Penalty)'].str.split().str[0]
    detail_top_scorer.rename(columns={'Goals (Penalty)': 'Goals'}, inplace=True)
    detail_top_scorer = detail_top_scorer.drop(['#'], axis=1)
    return detail_top_scorer

def player_table():
    # Scrape the season's player list, which is paginated across 11 pages
    url = [f'https://www.worldfootball.net/players_list/eng-premier-league-2023-2024/nach-name/{i:d}' for i in range(1, 12)]
    header = ['Player', '', 'Team', 'born', 'Height', 'Position']
    df = pd.DataFrame(columns=header)

    def player(ev):
        # Scrape a single page of the player list
        url = ev
        headers = []
        page = requests.get(url)
        soup = BeautifulSoup(page.text, "html.parser")
        table = soup.find("table", class_="standard_tabelle")

        for i in table.find_all('th'):
            title = i.text
            headers.append(title)
        players = pd.DataFrame(columns=headers)
        for j in table.find_all('tr')[1:]:
            row_data = j.find_all('td')
            row = [i.text for i in row_data]
            length = len(players)
            players.loc[length] = row
        return players

    for i in url:
        a = player(i)
        df = pd.concat([df, a], axis=0).reset_index(drop=True)

    df = df.drop([''], axis=1)
    return df

def all_time_table():
    # Scrape the all-time Premier League table
    url = 'https://www.worldfootball.net/alltime_table/eng-premier-league/pl-only/'
    headers = ['pos', '#', 'Team', 'Matches', 'wins', 'Draws', 'Losses', 'Goals', 'Dif', 'Points']
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_="standard_tabelle")

    alltime_table = pd.DataFrame(columns=headers)
    for j in table.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(alltime_table)
        alltime_table.loc[length] = row

    alltime_table = alltime_table.drop(['#'], axis=1)
    alltime_table.Team = alltime_table.Team.str.replace('\n', '')
    return alltime_table

def all_time_winner_club():
    # Scrape the list of league winners by year
    url = 'https://www.worldfootball.net/winner/eng-premier-league/'
    headers = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_="standard_tabelle")

    for i in table.find_all('th'):
        title = i.text
        headers.append(title)
    winners = pd.DataFrame(columns=headers)
    for j in table.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(winners)
        winners.loc[length] = row

    winners = winners.drop([''], axis=1)
    winners['Year'] = winners['Year'].str.replace('\n', '')
    return winners


def top_scorers_seasons():
    # Scrape the top scorer for each season
    url = 'https://www.worldfootball.net/top_scorer/eng-premier-league/'
    headers = ['Season', '#', 'Top scorer', '#', 'Team', 'goals']
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_="standard_tabelle")
    winners = pd.DataFrame(columns=headers)
    for j in table.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(winners)
        winners.loc[length] = row

    winners = winners.drop(['#'], axis=1)
    winners = winners.replace('\\n', '', regex=True).astype(str)
    # Season appears only on the first row of each group; forward-fill the gaps
    winners['Season'] = winners['Season'].replace('', np.nan).ffill()
    return winners

def goals_per_season():
    # Scrape total and average goals per season
    url = 'https://www.worldfootball.net/stats/eng-premier-league/1/'
    headers = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_="standard_tabelle")

    for i in table.find_all('th'):
        title = i.text
        headers.append(title)
    goals_per_season = pd.DataFrame(columns=headers)
    for j in table.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(goals_per_season)
        goals_per_season.loc[length] = row
    # Drop the last row of the scraped table
    goals_per_season.drop(goals_per_season.index[-1], inplace=True)

    goals_per_season = goals_per_season.drop(['#'], axis=1)
    goals_per_season.rename(columns={'goals': 'Goals', 'Ø goals': 'Average Goals'}, inplace=True)

    return goals_per_season
--------------------------------------------------------------------------------
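Every scraper in scrape.py repeats the same pattern: fetch the page, find the table by CSS class, read the `th` cells as headers and the `td` cells as rows. A possible shared helper (a refactoring sketch, not part of the repository) could factor that loop out:

```python
# Refactoring sketch, not in the book's code: the header/row loop that each
# scrape.py function repeats, factored into one reusable helper.
import requests
import pandas as pd
from bs4 import BeautifulSoup

def scrape_table(url: str, table_class: str) -> pd.DataFrame:
    """Return the first <table> with the given CSS class as a DataFrame."""
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_=table_class)
    headers = [th.text for th in table.find_all('th')]
    rows = [[td.text for td in tr.find_all('td')] for tr in table.find_all('tr')[1:]]
    return pd.DataFrame(rows, columns=headers)
```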