├── Chapter-2
│   ├── main.py
│   ├── requirements.txt
│   ├── push_to_database.py
│   ├── push_to_blob.py
│   └── scrape.py
├── LICENSE
└── README.md

/Chapter-2/main.py:
--------------------------------------------------------------------------------
import os

# Run both pipeline steps: upload the scraped tables to blob storage,
# then load them into the database.
os.system('python push_to_blob.py')
os.system('python push_to_database.py')

--------------------------------------------------------------------------------
/Chapter-2/requirements.txt:
--------------------------------------------------------------------------------
azure-storage-blob==12.17.0
beautifulsoup4==4.12.2
numpy==1.25.2
pandas==2.0.3
psycopg2==2.9.7
pyarrow==12.0.1
Requests==2.31.0
SQLAlchemy==2.0.20
python-dotenv==1.0.0

--------------------------------------------------------------------------------
/Chapter-2/push_to_database.py:
--------------------------------------------------------------------------------
from scrape import *
import pandas as pd
from sqlalchemy import create_engine
import psycopg2
from dotenv import load_dotenv
import os

load_dotenv()

functions = [league_table, top_scorers, detail_top, player_table, all_time_table,
             all_time_winner_club, top_scorers_seasons, goals_per_season]

# The Postgres connection string is read from the environment (CONN_STRING)
conn_string = os.getenv('CONN_STRING')

db = create_engine(conn_string)
conn = db.connect()

for fun in functions:
    function_name = fun.__name__
    result_df = fun()  # Call the function to get the DataFrame
    result_df.to_sql(function_name, con=conn, if_exists='replace', index=False)
    print(f'Pushed data for {function_name}')

# Close the database connection
conn.close()

--------------------------------------------------------------------------------
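push_to_database.py reads its Postgres connection string from a `CONN_STRING` environment variable loaded via python-dotenv. The snippet below is a minimal, hypothetical smoke test for that setup before running the script; the URL format in the comment is only an illustrative assumption, not a value supplied by the chapter.

```
# Hypothetical check that CONN_STRING is set and the database is reachable
# before running push_to_database.py.
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

load_dotenv()

# Example format only: postgresql+psycopg2://user:password@host:5432/dbname
conn_string = os.getenv('CONN_STRING')
assert conn_string, 'CONN_STRING is not set'

engine = create_engine(conn_string)
with engine.connect() as conn:
    print(conn.execute(text('SELECT 1')).scalar())
```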
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Packt

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/Chapter-2/push_to_blob.py:
--------------------------------------------------------------------------------
from scrape import *
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from io import BytesIO
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
#from dotenv import load_dotenv
import os

#load_dotenv()

functions = [league_table, top_scorers, detail_top, player_table, all_time_table,
             all_time_winner_club, top_scorers_seasons, goals_per_season]

def to_blob(func):
    '''
    Converts the output of a given function to Parquet format and uploads it to Azure Blob Storage.

    Args:
        func (function): The function that retrieves data to be processed and uploaded.

    Returns:
        None

    This function takes a provided function, calls it to obtain data, and then converts the data into
    an Arrow Table. The Arrow Table is serialized into Parquet format and uploaded to an Azure Blob
    Storage container specified in the function. The function's name is used as the blob name.

    Example:
        Consider the function "top_scorers". Calling "to_blob(top_scorers)" will process the output
        of "top_scorers", convert it to Parquet format, and upload it to Azure Blob Storage.
    '''
    file_name = func.__name__
    result_df = func()  # Call the function to get the DataFrame

    # Convert DataFrame to Arrow Table
    table = pa.Table.from_pandas(result_df)

    parquet_buffer = BytesIO()
    pq.write_table(table, parquet_buffer)

    connection_string = 'Insert your blob storage connection key here'
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)

    container_name = "testtech"
    blob_name = f"{file_name}.parquet"
    container_client = blob_service_client.get_container_client(container_name)

    blob_client = container_client.get_blob_client(blob_name)
    blob_client.upload_blob(parquet_buffer.getvalue(), overwrite=True)
    print(f"{blob_name} successfully updated")


for item in functions:
    to_blob(item)

--------------------------------------------------------------------------------
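push_to_blob.py ships with the connection string hardcoded as a placeholder and the python-dotenv lines commented out. A hedged alternative sketch is to load the key from the environment instead; the variable name `AZURE_STORAGE_CONNECTION_STRING` is an assumption for illustration, not something defined by the chapter.

```
# Hypothetical variant: read the Azure Storage connection string from a .env file
# instead of hardcoding it in push_to_blob.py.
import os
from dotenv import load_dotenv
from azure.storage.blob import BlobServiceClient

load_dotenv()
connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
```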
/README.md:
--------------------------------------------------------------------------------
# Cracking the Data Engineering Interview

This is the code repository for [Cracking the Data Engineering Interview](https://www.packtpub.com/product/cracking-the-data-engineering-interview/9781837630776), published by Packt.

**Land your dream job with the help of resume-building tips, over 100 mock questions, and a unique portfolio**

## What is this book about?
Preparing for a data engineering interview can often get overwhelming due to the abundance of tools and technologies, leaving you struggling to prioritize which ones to focus on. This hands-on guide provides you with the essential foundational and advanced knowledge needed to simplify your learning journey.

This book covers the following exciting features:
* Create maintainable and scalable code for unit testing
* Understand the fundamental concepts of core data engineering tasks
* Prepare with over 100 behavioral and technical interview questions
* Discover data engineer archetypes and how they can help you prepare for the interview
* Apply the essential concepts of Python and SQL in data engineering
* Build your personal brand to noticeably stand out as a candidate

If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1837630771) today!

https://www.packtpub.com/

## Instructions and Navigations
All of the code is organized into folders.

The code will look like the following:
```
from scrape import *
import pandas as pd
from sqlalchemy import create_engine
import psycopg2
```

**Following is what you need for this book:**
If you’re an aspiring data engineer looking for guidance on how to land, prepare for, and excel in data engineering interviews, this book is for you. Familiarity with the fundamentals of data engineering, such as data modeling, cloud warehouses, programming (Python and SQL), building data pipelines, scheduling your workflows (Airflow), and APIs, is a prerequisite.

With the following software and hardware list, you can run all the code files present in the book (Chapters 1-16).

### Software and Hardware List

| Chapter | Software required   | OS required                        |
| ------- | ------------------- | ---------------------------------- |
| 2       | Microsoft Azure     | Windows, Mac OS X, and Linux (Any) |
| 2       | Amazon Web Services | Windows, Mac OS X, and Linux (Any) |

### Related products
* Data Wrangling with SQL [[Packt]](https://www.packtpub.com/product/data-wrangling-with-sql/9781837630028) [[Amazon]](https://www.amazon.com/dp/183763002X)

* SQL Query Design Patterns and Best Practices [[Packt]](https://www.packtpub.com/product/sql-query-design-patterns-and-best-practices/9781837633289) [[Amazon]](https://www.amazon.com/dp/1837633282)

## Get to Know the Authors
**Kedeisha Bryan** is a data professional with experience in data analytics, science, and engineering. She has prior experience combining Six Sigma and analytics to provide data solutions that have impacted policy changes and leadership decisions. She is fluent in tools such as SQL, Python, and Tableau.
She is the founder and leader of the Data in Motion Academy, providing personalized skill development, resources, and training at scale to aspiring data professionals across the globe. Her other work includes a second Packt book currently in progress and an SQL course for LinkedIn Learning.

**Taamir Ransome** is a Data Scientist and Software Engineer. He has experience in building machine learning and artificial intelligence solutions for the US Army. He is also the founder of the Vet Dev Institute, where he currently provides cloud-based data solutions for clients. He holds a master’s degree in Analytics from Western Governors University.
--------------------------------------------------------------------------------
/Chapter-2/scrape.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

def league_table():
    # Scrape the current Premier League table from the BBC Sport site
    url = 'https://www.bbc.com/sport/football/premier-league/table'
    headers = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_="ssrcss-14j0ip6-Table e3bga5w5")

    for i in table.find_all('th'):
        title = i.text
        headers.append(title)
    league_table = pd.DataFrame(columns=headers)
    for j in table.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(league_table)
        league_table.loc[length] = row
    league_table.drop(["Form, Last 6 games, Oldest first"], axis=1, inplace=True)
    return league_table

def top_scorers():
    # Scrape the current Premier League top scorers from the BBC Sport site
    url = 'https://www.bbc.com/sport/football/premier-league/top-scorers'
    headers = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_="gs-o-table")

    for i in table.find_all('th'):
        title = i.text
        headers.append(title)
    top_scorers = pd.DataFrame(columns=headers)
    for j in table.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(top_scorers)
        top_scorers.loc[length] = row

    # Split the scraped "Name" text into a player name and a club column
    top_scorers.Name = top_scorers.Name.replace(r'([A-Z])', r' \1', regex=True).str.split()
    top_scorers.Name = top_scorers.Name.apply(lambda x: ' '.join(dict.fromkeys(x).keys()))

    top_scorers['Club'] = top_scorers.Name.str.split().str[2:].str.join(' ')
    top_scorers.Name = top_scorers.Name.str.split().str[:2].str.join(' ')
    col = top_scorers.pop("Club")
    top_scorers.insert(2, 'Club', col)
    top_scorers.Club = top_scorers.Club.apply(lambda x: 'Manchester City' if 'Manchester City' in x else x)
    top_scorers.Club = top_scorers.Club.apply(lambda x: 'Manchester United' if 'Manchester United' in x else x)
    top_scorers.Club = top_scorers.Club.apply(lambda x: 'Brighton & Hove Albion' if 'Brighton & Hove Albion' in x else x)

    return top_scorers

def detail_top():
    # Scrape detailed top-scorer statistics from worldfootball.net
    url = 'https://www.worldfootball.net/goalgetter/eng-premier-league-2023-2024/'
    headers = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_="standard_tabelle")

    for i in table.find_all('th'):
        title = i.text
        headers.append(title)
    detail_top_scorer = pd.DataFrame(columns=headers)
    for j in table.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(detail_top_scorer)
        detail_top_scorer.loc[length] = row

    detail_top_scorer = detail_top_scorer.drop([''], axis=1)
    detail_top_scorer.Team = detail_top_scorer.Team.str.replace('\n\n', '')
    # Split "Goals (Penalty)" into separate Goals and Penalty columns
    detail_top_scorer['Penalty'] = detail_top_scorer['Goals (Penalty)'].str.split().str[-1:].str.join(' ')
    detail_top_scorer['Penalty'] = detail_top_scorer['Penalty'].str.replace('(', '')
    detail_top_scorer['Penalty'] = detail_top_scorer['Penalty'].str.replace(')', '')
    detail_top_scorer['Goals (Penalty)'] = detail_top_scorer['Goals (Penalty)'].str.split().str[0].str.join('')
    detail_top_scorer.rename(columns={'Goals (Penalty)': 'Goals'}, inplace=True)
    detail_top_scorer = detail_top_scorer.drop(['#'], axis=1)
    return detail_top_scorer

def player_table():
    # Scrape the full player list (11 paginated pages) from worldfootball.net
    url = [f'https://www.worldfootball.net/players_list/eng-premier-league-2023-2024/nach-name/{i:d}' for i in range(1, 12)]
    header = ['Player', '', 'Team', 'born', 'Height', 'Position']
    df = pd.DataFrame(columns=header)

    def player(ev):
        url = ev
        headers = []
        page = requests.get(url)
        soup = BeautifulSoup(page.text, "html.parser")
        table = soup.find("table", class_="standard_tabelle")

        for i in table.find_all('th'):
            title = i.text
            headers.append(title)
        players = pd.DataFrame(columns=headers)
        for j in table.find_all('tr')[1:]:
            row_data = j.find_all('td')
            row = [i.text for i in row_data]
            length = len(players)
            players.loc[length] = row
        return players

    for i in url:
        a = player(i)
        df = pd.concat([df, a], axis=0).reset_index(drop=True)

    df = df.drop([''], axis=1)
    return df

def all_time_table():
    # Scrape the all-time Premier League table from worldfootball.net
    url = 'https://www.worldfootball.net/alltime_table/eng-premier-league/pl-only/'
    headers = ['pos', '#', 'Team', 'Matches', 'wins', 'Draws', 'Losses', 'Goals', 'Dif', 'Points']
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_="standard_tabelle")

    alltime_table = pd.DataFrame(columns=headers)
    for j in table.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(alltime_table)
        alltime_table.loc[length] = row

    alltime_table = alltime_table.drop(['#'], axis=1)
    alltime_table.Team = alltime_table.Team.str.replace('\n', '')
    return alltime_table

def all_time_winner_club():
    # Scrape the list of Premier League winners by season from worldfootball.net
    url = 'https://www.worldfootball.net/winner/eng-premier-league/'
    headers = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_="standard_tabelle")

    for i in table.find_all('th'):
        title = i.text
        headers.append(title)
    winners = pd.DataFrame(columns=headers)
    for j in table.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(winners)
        winners.loc[length] = row

    winners = winners.drop([''], axis=1)
    winners['Year'] = winners['Year'].str.replace('\n', '')
    return winners


def top_scorers_seasons():
    # Scrape the top scorer for each Premier League season from worldfootball.net
    url = 'https://www.worldfootball.net/top_scorer/eng-premier-league/'
    headers = ['Season', '#', 'Top scorer', '#', 'Team', 'goals']
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_="standard_tabelle")
    winners = pd.DataFrame(columns=headers)
    for j in table.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(winners)
        winners.loc[length] = row

    winners = winners.drop(['#'], axis=1)
    winners = winners.replace('\\n', '', regex=True).astype(str)
    winners['Season'] = winners['Season'].replace('', np.nan).ffill()
    return winners

def goals_per_season():
    # Scrape goals-per-season statistics from worldfootball.net
    url = 'https://www.worldfootball.net/stats/eng-premier-league/1/'
    headers = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    table = soup.find("table", class_="standard_tabelle")
class_="standard_tabelle") 177 | 178 | for i in table.find_all('th'): 179 | title = i.text 180 | headers.append(title) 181 | goals_per_season = pd.DataFrame(columns = headers) 182 | for j in table.find_all('tr')[1:]: 183 | row_data = j.find_all('td') 184 | row = [i.text for i in row_data] 185 | length = len(goals_per_season) 186 | goals_per_season.loc[length] = row 187 | goals_per_season.drop(goals_per_season.index[-1],inplace=True) 188 | 189 | goals_per_season = goals_per_season.drop(['#'], axis=1) 190 | goals_per_season.rename(columns = {'goals':'Goals','Ø goals':'Average Goals'}, inplace = True) 191 | 192 | return goals_per_season 193 | --------------------------------------------------------------------------------