├── __init__.py ├── .tool-versions ├── .gitignore ├── assets ├── cs.png ├── cs2.png └── repl.png ├── cleaning_functions.py ├── 4-data-quality-questions.py ├── tests ├── test_cleaning_functions-questions.py └── test_cleaning_functions_solutions.py ├── requirements.txt ├── 4-data-quality-solutions.py ├── README.md ├── 2-data-extract-load-questions.py ├── 3-data-transform-questions.py ├── 1-basics-questions.py ├── data ├── sample_data.csv └── customers.csv ├── setup_db.py ├── 2-data-extract-load-solutions.py ├── 1-basics-solutions.py └── 3-data-transform-solutions.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.tool-versions: -------------------------------------------------------------------------------- 1 | python 3.11.1 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | duckdb.db 2 | tpch.db 3 | myenv/ 4 | 5 | __pycache__/ 6 | -------------------------------------------------------------------------------- /assets/cs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/python_essentials_for_data_engineers/HEAD/assets/cs.png -------------------------------------------------------------------------------- /assets/cs2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/python_essentials_for_data_engineers/HEAD/assets/cs2.png -------------------------------------------------------------------------------- /assets/repl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephmachado/python_essentials_for_data_engineers/HEAD/assets/repl.png -------------------------------------------------------------------------------- /cleaning_functions.py: -------------------------------------------------------------------------------- 1 | # Simple function to remove duplicates 2 | 3 | 4 | def remove_duplicates(data, unique_key): 5 | data_unique = [] 6 | unique_key_set = set() 7 | 8 | for row in data: 9 | if row[unique_key] not in unique_key_set: 10 | data_unique.append(row) 11 | unique_key_set.add(row[unique_key]) 12 | else: 13 | print(f"duplicate customer id") 14 | 15 | return data_unique 16 | -------------------------------------------------------------------------------- /4-data-quality-questions.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from cuallee import Check, CheckLevel 3 | 4 | # Read CSV file into Polars DataFrame 5 | df = pl.read_csv("./data/sample_data.csv") 6 | 7 | # Question: Check for Nulls on column Id and that Customer_ID column is unique 8 | # check docs at https://canimus.github.io/cuallee/polars/ on how to define a check and run it. 9 | # you will end up with a dataframe of results, check that the `status` column does not have any "FAIL" in it 10 | 11 | -------------------------------------------------------------------------------- /tests/test_cleaning_functions-questions.py: -------------------------------------------------------------------------------- 1 | from cleaning_functions import remove_duplicates 2 | 3 | 4 | def test_remove_duplicates(): 5 | # Define sample data and unique_key 6 | data = [] 7 | unique_key = "" 8 | 9 | # Call the function 10 | unique_data = remove_duplicates(data, unique_key) 11 | 12 | # Assert that duplicates were removed 13 | # assert len(unique_data) == some number based on your input 14 | # Assert the actual values 15 | expected_data = [ 16 | ] 17 | assert unique_data == expected_data 18 | 19 | 20 | # Run this with the command python -m pytest ./tests 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | awswrangler==3.7.3 2 | black==24.4.2 3 | boto3==1.34.117 4 | botocore==1.34.117 5 | certifi==2024.6.2 6 | charset-normalizer==3.3.2 7 | click==8.1.7 8 | cuallee==0.10.3 9 | duckdb==1.0.0 10 | idna==3.7 11 | iniconfig==2.0.0 12 | isort==5.13.2 13 | jmespath==1.0.1 14 | mypy-extensions==1.0.0 15 | numpy==1.26.4 16 | packaging==24.0 17 | pandas==2.2.2 18 | pathspec==0.12.1 19 | platformdirs==4.2.2 20 | pluggy==1.5.0 21 | polars==0.20.31 22 | pyarrow==16.1.0 23 | pytest==8.2.1 24 | python-dateutil==2.9.0.post0 25 | pytz==2024.1 26 | requests==2.32.3 27 | s3transfer==0.10.1 28 | six==1.16.0 29 | toolz==0.12.1 30 | typing_extensions==4.12.1 31 | tzdata==2024.1 32 | urllib3==2.2.1 33 | -------------------------------------------------------------------------------- /4-data-quality-solutions.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from cuallee import Check, CheckLevel 3 | 4 | # Read CSV file into Polars DataFrame 5 | df = pl.read_csv("./data/sample_data.csv") 6 | 7 | # Question: Check for Nulls on column Id and that Customer_ID column is unique 8 | # check docs at https://canimus.github.io/cuallee/polars/ 9 | # you will end up with a dataframe of results, check that the `status` column does not have any "FAIL" in it 10 | 11 | check = Check(CheckLevel.ERROR, "Completeness") 12 | validation_results_df = ( 13 | check.is_complete("Customer_ID").is_unique("Customer_ID").validate(df) 14 | ) 15 | print(validation_results_df) 16 | 17 | results = validation_results_df["status"].to_list() 18 | assert "FAIL" not in results == True 19 | -------------------------------------------------------------------------------- /tests/test_cleaning_functions_solutions.py: -------------------------------------------------------------------------------- 1 | from cleaning_functions import remove_duplicates 2 | 3 | 4 | def test_remove_duplicates(): 5 | # Define sample data and unique key 6 | data = [ 7 | {"Customer_ID": 1, "Name": "Alice"}, 8 | {"Customer_ID": 2, "Name": "Bob"}, 9 | {"Customer_ID": 1, "Name": "Alice"}, # Duplicate 10 | {"Customer_ID": 3, "Name": "Charlie"}, 11 | {"Customer_ID": 2, "Name": "Bob"}, # Duplicate 12 | ] 13 | unique_key = "Customer_ID" 14 | 15 | # Call the function 16 | unique_data = remove_duplicates(data, unique_key) 17 | 18 | # Assert that duplicates were removed 19 | assert len(unique_data) == 3 20 | # Assert the actual values 21 | expected_data = [ 22 | {"Customer_ID": 1, "Name": "Alice"}, 23 | {"Customer_ID": 2, "Name": "Bob"}, 24 | {"Customer_ID": 3, "Name": "Charlie"}, 25 | ] 26 | assert unique_data == expected_data 27 | 28 | 29 | # Run this with the command python -m pytest ./tests 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | * [Python Essentials for Data Engineers](#python-essentials-for-data-engineers) 4 | * [Run on Codespaces](#run-on-codespaces) 5 | * [Running on your laptop](#running-on-your-laptop) 6 | * [Using python REPL](#using-python-repl) 7 | 8 | Code for Blog at: [Python Essentials for Data Engineers](https://www.startdataengineering.com/post/python-for-de/). 9 | 10 | # Python Essentials for Data Engineers 11 | 12 | ## Run on Codespaces 13 | 14 | Open codespaces and wait for codespaces to setup. The process of opening codespaces and waiting for completion is shown below. 15 | 16 | **NOTE**: Make sure to turn off codespaces, you only have limited free usage per month. 17 | 18 | ![Open codespace](./assets/cs.png) 19 | ![Wait for codespace to setup](./assets/cs2.png) 20 | 21 | ## Running on your laptop 22 | 23 | Clone the repo, cd into it and setup the virtual environment as shown below. 24 | 25 | ```bash 26 | git clone https://github.com/josephmachado/python_essentials_for_data_engineers.git 27 | cd python_essentials_for_data_engineers 28 | 29 | python -m venv myenv 30 | source myenv/bin/activate 31 | pip install -r requirements.txt 32 | 33 | # open python REPL with 34 | python 35 | ``` 36 | 37 | ## Using python REPL 38 | 39 | ![REPL](./assets/repl.png) 40 | 41 | In the Python REPL you can try out the commands and do the exercises. 42 | 43 | To run pytest (under ./tests folder) you will need to run the `python -m pytest ./tests` command. 44 | 45 | The questions are files with the prefix `-questions.py`, use these as starting points to practice python for data engineering. While the workbooks have solutions, there are multiple ways to do the same thing, and as long as you get the correct answer, you should be good. 46 | -------------------------------------------------------------------------------- /2-data-extract-load-questions.py: -------------------------------------------------------------------------------- 1 | # Extract: Process to pull data from Source system 2 | # Load: Process to write data to a destination system 3 | 4 | # Common upstream & downstream systems 5 | # OLTP Databases: Postgres, MySQL, sqlite3, etc 6 | # OLAP Databases: Snowflake, BigQuery, Clickhouse, DuckDB, etc 7 | # Cloud data storage: AWS S3, GCP Cloud Store, Minio, etc 8 | # Queue systems: Kafka, Redpanda, etc 9 | # API 10 | # Local disk: csv, excel, json, xml files 11 | # SFTP\FTP server 12 | 13 | # Databases: When reading or writing to a database we use a database driver. Database drivers are libraries that we can use to read or write to a database. 14 | # Question: How do you read data from a sqlite3 database and write to a DuckDB database? 15 | # Hint: Look at importing the database libraries for sqlite3 and duckdb and create connections to talk to the respective databases 16 | 17 | # Fetch data from the SQLite Customer table 18 | 19 | # Insert data into the DuckDB Customer table 20 | 21 | # Hint: Look for Commit and close the connections 22 | # Commit tells the DB connection to send the data to the database and commit it, if you don't commit the data will not be inserted 23 | 24 | # We should close the connection, as DB connections are expensive 25 | 26 | # Cloud storage 27 | # Question: How do you read data from the S3 location given below and write the data to a DuckDB database? 28 | # Data source: https://docs.opendata.aws/noaa-ghcn-pds/readme.html station data at path "csv.gz/by_station/ASN00002022.csv.gz" 29 | # Hint: Use boto3 client with UNSIGNED config to access the S3 bucket 30 | # Hint: The data will be zipped you have to unzip it and decode it to utf-8 31 | 32 | # AWS S3 bucket and file details 33 | bucket_name = "noaa-ghcn-pds" 34 | file_key = "csv.gz/by_station/ASN00002022.csv.gz" 35 | # Create a boto3 client with anonymous access 36 | 37 | # Download the CSV file from S3 38 | # Decompress the gzip data 39 | # Read the CSV file using csv.reader 40 | # Connect to the DuckDB database (assume WeatherData table exists) 41 | 42 | # Insert data into the DuckDB WeatherData table 43 | 44 | # API 45 | # Question: How do you read data from the CoinCap API given below and write the data to a DuckDB database? 46 | # URL: "https://api.coincap.io/v2/exchanges" 47 | # Hint: use requests library 48 | 49 | # Define the API endpoint 50 | url = "https://api.coincap.io/v2/exchanges" 51 | 52 | # Fetch data from the CoinCap API 53 | # Connect to the DuckDB database 54 | 55 | # Insert data into the DuckDB Exchanges table 56 | # Prepare data for insertion 57 | # Hint: Ensure that the data types of the data to be inserted is compatible with DuckDBs data column types in ./setup_db.py 58 | 59 | 60 | # Local disk 61 | # Question: How do you read a CSV file from local disk and write it to a database? 62 | # Look up open function with csvreader for python 63 | 64 | # Web scraping 65 | # Questions: Use beatiful soup to scrape the below website and print all the links in that website 66 | # URL of the website to scrape 67 | url = 'https://example.com' 68 | -------------------------------------------------------------------------------- /3-data-transform-questions.py: -------------------------------------------------------------------------------- 1 | print( 2 | "################################################################################" 3 | ) 4 | print("Use standard python libraries to do the transformations") 5 | print( 6 | "################################################################################" 7 | ) 8 | 9 | # Question: How do you read data from a CSV file at ./data/sample_data.csv into a list of dictionaries? 10 | 11 | # Question: How do you remove duplicate rows based on customer ID? 12 | 13 | # Question: How do you handle missing values by replacing them with 0? 14 | 15 | # Question: How do you remove outliers such as age > 100 or purchase amount > 1000? 16 | 17 | # Question: How do you convert the Gender column to a binary format (0 for Female, 1 for Male)? 18 | 19 | # Question: How do you split the Customer_Name column into separate First_Name and Last_Name columns? 20 | 21 | # Question: How do you calculate the total purchase amount by Gender? 22 | 23 | # Question: How do you calculate the average purchase amount by Age group? 24 | # assume age_groups is the grouping we want 25 | # hint: Why do we convert to float? 26 | age_groups = {"18-30": [], "31-40": [], "41-50": [], "51-60": [], "61-70": []} 27 | 28 | # Question: How do you print the results for total purchase amount by Gender and average purchase amount by Age group? 29 | your_total_purchase_amount_by_gender = {} # your results should be assigned to this variable 30 | average_purchase_by_age_group = {} # your results should be assigned to this variable 31 | 32 | print(f"Total purchase amount by Gender: {your_total_purchase_amount_by_gender}") 33 | print(f"Average purchase amount by Age group: {average_purchase_by_age_group}") 34 | 35 | print( 36 | "################################################################################" 37 | ) 38 | print("Use DuckDB to do the transformations") 39 | print( 40 | "################################################################################" 41 | ) 42 | 43 | # Question: How do you connect to DuckDB and load data from a CSV file into a DuckDB table? 44 | # Connect to DuckDB and load data 45 | 46 | # Read data from CSV file into DuckDB table 47 | 48 | # Question: How do you remove duplicate rows based on customer ID in DuckDB? 49 | 50 | # Question: How do you handle missing values by replacing them with 0 in DuckDB? 51 | 52 | # Question: How do you remove outliers (e.g., age > 100 or purchase amount > 1000) in DuckDB? 53 | 54 | # Question: How do you convert the Gender column to a binary format (0 for Female, 1 for Male) in DuckDB? 55 | 56 | # Question: How do you split the Customer_Name column into separate First_Name and Last_Name columns in DuckDB? 57 | 58 | # Question: How do you calculate the total purchase amount by Gender in DuckDB? 59 | 60 | # Question: How do you calculate the average purchase amount by Age group in DuckDB? 61 | 62 | # Question: How do you print the results for total purchase amount by Gender and average purchase amount by Age group in DuckDB? 63 | print("====================== Results ======================") 64 | print("Total purchase amount by Gender:") 65 | print("Average purchase amount by Age group:") 66 | -------------------------------------------------------------------------------- /1-basics-questions.py: -------------------------------------------------------------------------------- 1 | 2 | # Variable: A storage location identified by its name, containing some value. 3 | # Question: Assign a value of 10 to variable a and 20 to variable b 4 | # Question: Store the result of a + b in a variable c and print it. What is the result of a + b? 5 | 6 | s = ' Some string ' 7 | # Question: How do you remove the empty spaces in front of and behind the string s? 8 | print(s.strip()) 9 | 10 | # Data Structures are ways of representing data, each has its own pros and cons and places that they are the right fit. 11 | ## List: A collection of elements that can be accessed by knowing the location (aka index) of the element 12 | l = [1, 2, 3, 4] 13 | 14 | # Question: How do you access the elements in index 0 and 3? Print the results. 15 | ## NOTE: lists retain the order of elements in it but dictionary doesn't 16 | 17 | ## Dictionary: A collection of key-value pairs, where each key is mapped to a value using a hash function. Provides fast data retrieval based on keys. 18 | d = {'a': 1, 'b': 2} 19 | 20 | # Question: How do you access the values associated with keys 'a' and 'b'? 21 | ## NOTE: The dictionary cannot have duplicate keys 22 | 23 | ## Set: A collection of unique elements that do not allow duplicates 24 | my_set = set() 25 | my_set.add(10) 26 | my_set.add(10) 27 | my_set.add(10) 28 | 29 | # Question: What will be the output of my_set? 30 | 31 | ## Tuple: A collection of immutable (non-changeable) elements, tuples retain their order once created. 32 | my_tuple = (1, 'hello', 3.14) 33 | 34 | # Question: What is the value of my_tuple? 35 | 36 | # Accessing elements by index 37 | 38 | # Question: How do you access the elements in index 0 and 1 of my_tuple? 39 | 40 | # Counting occurrences of an element 41 | count_tuple = (1, 2, 3, 1, 1, 2) 42 | 43 | # Question: How many times does the number 1 appear in count_tuple? 44 | 45 | # Finding the index of an element 46 | # Question: What is the index of the first occurrence of the number 2 in count_tuple? 47 | 48 | # Loop allows a specific chunk of code to be repeated a certain number of times 49 | # Example: We can use a loop to print numbers 0 through 10 50 | for i in range(11): 51 | print(i) 52 | 53 | # We can loop through our data structures as shown below 54 | # Question: How do you loop through a list and print its elements? 55 | 56 | # Dictionary loop 57 | # Question: How do you loop through a dictionary and print its keys and values? 58 | 59 | # Comprehension is a shorthand way of writing a loop 60 | # Question: Multiply every element in list l with 2 and print the result 61 | 62 | # Functions: A block of code that can be re-used as needed. This allows for us to have logic defined in one place, making it easy to maintain and use. 63 | ## For example, let's create a simple function that takes a list as an input and returns another list whose values are greater than 3 64 | 65 | def gt_three(input_list): 66 | return [elt for elt in input_list if elt > 3] 67 | ## NOTE: we use list comprehension with filtering in the above function 68 | 69 | list_1 = [1, 2, 3, 4, 5, 6] 70 | # Question: How do you use the gt_three function to filter elements greater than 3 from list_1? 71 | 72 | list_2 = [1, 2, 3, 1, 1, 1] 73 | # Question: What will be the output of gt_three(list_2)? 74 | 75 | # Classes and Objects 76 | # Think of a class as a blueprint and objects as things created based on that blueprint 77 | # You can define classes in Python as shown below 78 | class DataExtractor: 79 | 80 | def __init__(self, some_value): 81 | self.some_value = some_value 82 | 83 | def get_connection(self): 84 | # Some logic 85 | # some_value is accessible using self.some_value 86 | pass 87 | 88 | def close_connection(self): 89 | # Some logic 90 | # some_value is accessible using self.some_value 91 | pass 92 | 93 | # Question: How do you create a DataExtractor object and print its some_value attribute? 94 | 95 | # Libraries are code that can be reused. 96 | 97 | # Python comes with some standard libraries to do common operations, 98 | # such as the datetime library to work with time (although there are better libraries) 99 | from datetime import datetime # You can import library or your code from another file with the import statement 100 | 101 | # Question: How do you print the current date in the format 'YYYY MM DD'? Hint: Google strftime 102 | 103 | # Exception handling: When an error occurs, we need our code to gracefully handle it without just stopping. 104 | # Here is how we can handle errors when the program is running 105 | try: 106 | # Code that might raise an exception 107 | pass 108 | except Exception as e: 109 | # Code that runs if the exception occurs 110 | pass 111 | else: 112 | # Code that runs if no exception occurs 113 | pass 114 | finally: 115 | # Code that always runs, regardless of exceptions 116 | pass 117 | 118 | # For example, let's consider exception handling on accessing an element that is not present in a list l 119 | l = [1, 2, 3, 4, 5] 120 | 121 | # Question: How do you handle an IndexError when accessing an invalid index in a list? 122 | # NOTE: in the except block its preferred to specify the exact erro/exception that you want to handle 123 | -------------------------------------------------------------------------------- /data/sample_data.csv: -------------------------------------------------------------------------------- 1 | Customer_ID,Customer_Name,Age,Gender,Purchase_Amount,Purchase_Date 2 | 1,Henry Jones,32,Male,1080000.66,2023-08-15 3 | 2,Emma Rodriguez,24,Male,62.4,2024-04-16 4 | 3,Frank Martinez,20,Female,443.47,2024-05-16 5 | 4,Alice Rodriguez,62,Female,729.69,2024-01-05 6 | 5,Frank Miller,33,Female,651.2,2024-05-23 7 | 6,Emma Garcia,22,Female,477.56,2023-09-02 8 | 7,Grace Jones,53,Female,29.72,2023-10-01 9 | 8,Henry Smith,41,Male,291.28,2024-02-01 10 | 9,Emma Jones,50,Female,575.69,2023-12-11 11 | 10,Ivy Martinez,44,Female,30.96,2024-05-11 12 | 11,Grace Miller,29,Male,321.42,2023-10-25 13 | 12,Ivy Martinez,68,Male,470.45,2023-06-05 14 | 13,Charlie Jones,24,Female,330.87,2023-11-11 15 | 14,Frank Rodriguez,51,Female,483.61,2024-05-06 16 | 15,Alice Jones,53,Male,647.13,2023-12-07 17 | 16,Grace Miller,36,Female,691.85,2023-09-25 18 | 17,Charlie Miller,27,Male,840.67,2023-07-09 19 | 18,Jack Martinez,64,Female,831.8,2023-11-07 20 | 19,Emma Martinez,64,Female,399.76,2024-02-04 21 | 20,Grace Miller,56,Male,917.93,2023-12-24 22 | 21,Grace Rodriguez,21,Female,389.69,2023-07-19 23 | 22,Ivy Brown,34,Male,697.0,2023-07-15 24 | 23,Jack Miller,55,Male,838.01,2023-10-22 25 | 24,Ivy Martinez,42,Male,620.28,2024-01-19 26 | 25,David Johnson,41,Female,964.04,2023-12-30 27 | 26,David Smith,36,Female,231.88,2024-05-27 28 | 27,Ivy Martinez,22,Female,115.68,2023-08-24 29 | 28,Henry Martinez,62,Male,735.66,2023-12-07 30 | 29,Charlie Garcia,42,Female,696.19,2023-07-26 31 | 30,Grace Davis,30,Female,566.13,2023-07-18 32 | 31,Grace Smith,19,Male,567.3,2024-04-13 33 | 32,Emma Jones,58,Male,299.82,2024-03-21 34 | 33,Grace Rodriguez,43,Male,366.42,2023-07-30 35 | 34,Alice Garcia,54,Male,874.08,2024-04-17 36 | 35,Grace Williams,58,Male,139.01,2024-04-23 37 | 36,Frank Johnson,54,Female,637.05,2023-08-06 38 | 37,Ivy Miller,37,Male,787.06,2024-05-04 39 | 38,Charlie Rodriguez,52,Female,853.47,2023-11-26 40 | 39,Grace Garcia,23,Male,901.82,2023-12-09 41 | 40,Emma Martinez,61,Male,180.92,2023-10-02 42 | 41,Alice Jones,23,Male,373.03,2024-03-16 43 | 42,Charlie Smith,66,Male,392.27,2023-11-03 44 | 43,Bob Williams,35,Female,319.58,2024-03-29 45 | 44,Frank Garcia,24,Male,917.63,2023-07-26 46 | 45,Frank Martinez,38,Male,313.45,2024-01-14 47 | 46,Ivy Brown,47,Female,349.88,2023-07-05 48 | 47,Grace Rodriguez,23,Female,933.63,2024-03-18 49 | 48,Alice Smith,23,Female,762.86,2024-03-04 50 | 49,Alice Brown,29,Female,920.49,2024-04-30 51 | 50,Bob Smith,38,Female,104.47,2023-12-31 52 | 51,Frank Brown,46,Male,621.47,2023-12-30 53 | 52,Frank Johnson,54,Female,281.35,2023-07-02 54 | 53,Alice Miller,31,Female,396.19,2023-09-17 55 | 54,Emma Martinez,39,Female,778.26,2024-05-08 56 | 55,Grace Brown,50,Male,388.92,2023-12-20 57 | 56,Henry Miller,56,Male,85.85,2024-02-26 58 | 57,Charlie Martinez,40,Female,268.41,2023-08-29 59 | 58,Alice Jones,50,Male,397.89,2023-10-04 60 | 59,Grace Martinez,37,Male,588.02,2024-03-13 61 | 60,David Davis,42,Male,698.82,2023-11-21 62 | 61,Bob Garcia,18,Male,194.19,2023-12-23 63 | 62,Alice Davis,50,Male,376.8,2024-03-20 64 | 63,David Rodriguez,48,Male,447.09,2023-09-05 65 | 64,Henry Brown,35,Male,823.0,2023-10-08 66 | 65,Bob Garcia,32,Female,726.87,2024-01-06 67 | 66,Alice Davis,68,Male,377.11,2024-02-22 68 | 67,Frank Johnson,60,Female,335.99,2023-10-03 69 | 68,Frank Miller,34,Female,873.36,2024-01-27 70 | 69,Bob Martinez,50,Female,574.54,2024-05-09 71 | 70,Bob Williams,48,Female,711.75,2024-04-11 72 | 71,Henry Williams,41,Female,323.66,2023-08-20 73 | 72,Frank Miller,66,Female,932.36,2024-02-19 74 | 73,Jack Rodriguez,70,Male,114.88,2023-06-21 75 | 74,Emma Johnson,21,Female,529.71,2023-08-11 76 | 75,Ivy Williams,24,Male,779.41,2023-12-04 77 | 76,David Rodriguez,23,Male,212.29,2024-03-16 78 | 77,Frank Rodriguez,69,Female,586.67,2024-04-25 79 | 78,Charlie Miller,24,Male,510.7,2024-02-04 80 | 79,Bob Brown,27,Female,761.49,2024-05-20 81 | 80,Charlie Jones,38,Female,282.35,2024-04-14 82 | 81,Bob Rodriguez,54,Female,439.0,2024-03-09 83 | 82,Jack Miller,64,Female,293.95,2023-10-25 84 | 83,Alice Martinez,41,Female,185.83,2024-02-20 85 | 84,David Jones,25,Male,578.27,2023-07-14 86 | 85,Emma Jones,69,Female,480.21,2024-05-17 87 | 86,Bob Garcia,34,Male,906.17,2024-01-27 88 | 87,Henry Johnson,27,Female,586.82,2023-08-02 89 | 88,Jack Rodriguez,30,Male,840.43,2023-07-27 90 | 89,Alice Smith,19,Female,602.63,2024-04-01 91 | 90,Ivy Davis,55,Female,405.7,2024-04-10 92 | 91,Alice Jones,64,Female,960.03,2023-12-26 93 | 92,Ivy Brown,19,Female,676.08,2023-12-18 94 | 93,Bob Rodriguez,26,Male,891.46,2024-01-22 95 | 94,Ivy Brown,56,Male,489.22,2024-05-03 96 | 95,Jack Brown,51,Female,649.88,2024-05-13 97 | 96,Grace Miller,42,Male,461.16,2023-12-03 98 | 97,Ivy Brown,21,Male,465.45,2023-09-06 99 | 98,Grace Davis,42,Male,796.25,2023-07-07 100 | 84,David Jones,25,Male,578.27,2023-07-14 101 | 85,Emma Jones,69,Female,480.21,2024-05-17 102 | 86,Bob Garcia,34,Male,906.17,2024-01-27 103 | 87,Henry Johnson,27,Female,586.82,2023-08-02 104 | 88,Jack Rodriguez,30,Male,840.43,2023-07-27 105 | 89,Alice Smith,19,Female,602.63,2024-04-01 106 | 90,Ivy Davis,55,Female,405.7,2024-04-10 107 | 91,Alice Jones,64,Female,960.03,2023-12-26 108 | 99,Alice Johnson,,Female,781.83,2023-06-04 109 | 100,Jack Garcia,,Female,269.64,2024-03-08 110 | -------------------------------------------------------------------------------- /setup_db.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import sqlite3 4 | 5 | 6 | def del_existing_db(db_path): 7 | # Delete file if it exists 8 | if os.path.exists(db_path): 9 | os.remove(db_path) 10 | print(f"{db_path} has been deleted.") 11 | else: 12 | print(f"{db_path} does not exist.") 13 | 14 | 15 | del_existing_db("tpch.db") 16 | del_existing_db("duckdb.db") 17 | 18 | # Connect to SQLite database (or create it if it doesn't exist) 19 | conn = sqlite3.connect("tpch.db") 20 | cursor = conn.cursor() 21 | 22 | # Create Customer table 23 | cursor.execute("DROP TABLE IF EXISTS Customer") 24 | cursor.execute( 25 | """ 26 | CREATE TABLE IF NOT EXISTS Customer ( 27 | customer_id INTEGER PRIMARY KEY, 28 | zipcode TEXT, 29 | city TEXT, 30 | state_code TEXT, 31 | datetime_created TEXT, 32 | datetime_updated TEXT 33 | ) 34 | """ 35 | ) 36 | 37 | 38 | # Function to read CSV and insert data into the table 39 | def insert_data_from_csv(csv_file): 40 | with open(csv_file, "r") as file: 41 | reader = csv.DictReader(file) 42 | for row in reader: 43 | cursor.execute( 44 | """ 45 | INSERT INTO Customer (customer_id, zipcode, city, state_code, datetime_created, datetime_updated) 46 | VALUES (?, ?, ?, ?, ?, ?) 47 | """, 48 | ( 49 | row["customer_id"], 50 | row["zipcode"], 51 | row["city"], 52 | row["state_code"], 53 | row["datetime_created"], 54 | row["datetime_updated"], 55 | ), 56 | ) 57 | conn.commit() 58 | 59 | 60 | # Insert data from CSV file 61 | insert_data_from_csv("./data/customers.csv") 62 | 63 | # Close the database connection 64 | conn.close() 65 | 66 | print("Data inserted successfully!") 67 | 68 | 69 | import duckdb 70 | 71 | # Connect to the DuckDB database (or create it if it doesn't exist) 72 | duckdb_conn = duckdb.connect("duckdb.db") 73 | 74 | # Create the Customer table in DuckDB 75 | duckdb_conn.execute("DROP TABLE IF EXISTS Customer") 76 | duckdb_conn.execute( 77 | """ 78 | CREATE TABLE IF NOT EXISTS Customer ( 79 | customer_id INTEGER, 80 | zipcode TEXT, 81 | city TEXT, 82 | state_code TEXT, 83 | datetime_created TIMESTAMP, 84 | datetime_updated TIMESTAMP 85 | ) 86 | """ 87 | ) 88 | 89 | duckdb_conn.execute("DROP TABLE IF EXISTS WeatherData") 90 | duckdb_conn.execute( 91 | """ 92 | CREATE TABLE IF NOT EXISTS WeatherData ( 93 | id TEXT, 94 | date TEXT, 95 | element TEXT, 96 | value INTEGER, 97 | m_flag TEXT, 98 | q_flag TEXT, 99 | s_flag TEXT, 100 | obs_time TEXT 101 | ) 102 | """ 103 | ) 104 | 105 | duckdb_conn.execute("DROP TABLE IF EXISTS Exchanges") 106 | duckdb_conn.execute( 107 | """ 108 | CREATE TABLE IF NOT EXISTS Exchanges ( 109 | id TEXT, 110 | name TEXT, 111 | rank INTEGER, 112 | percentTotalVolume FLOAT, 113 | volumeUsd FLOAT, 114 | tradingPairs TEXT, 115 | socket BOOLEAN, 116 | exchangeUrl TEXT, 117 | updated BIGINT 118 | ) 119 | """ 120 | ) 121 | 122 | # Commit and close the connection 123 | duckdb_conn.commit() 124 | duckdb_conn.close() 125 | 126 | print("Customer table created successfully!") 127 | 128 | import csv 129 | import datetime 130 | import random 131 | 132 | 133 | # Generate random customer names 134 | def generate_name(): 135 | first_names = [ 136 | "Alice", 137 | "Bob", 138 | "Charlie", 139 | "David", 140 | "Emma", 141 | "Frank", 142 | "Grace", 143 | "Henry", 144 | "Ivy", 145 | "Jack", 146 | ] 147 | last_names = [ 148 | "Smith", 149 | "Johnson", 150 | "Williams", 151 | "Brown", 152 | "Jones", 153 | "Garcia", 154 | "Miller", 155 | "Davis", 156 | "Rodriguez", 157 | "Martinez", 158 | ] 159 | return random.choice(first_names) + " " + random.choice(last_names) 160 | 161 | 162 | # Generate random age between 18 and 70 163 | def generate_age(): 164 | return random.randint(18, 70) 165 | 166 | 167 | # Generate random gender 168 | def generate_gender(): 169 | return random.choice(["Male", "Female"]) 170 | 171 | 172 | # Generate random purchase amount between 10 and 1000 173 | def generate_purchase_amount(): 174 | return round(random.uniform(10, 1000), 2) 175 | 176 | 177 | # Generate random purchase date within the last year 178 | def generate_purchase_date(): 179 | end_date = datetime.date.today() 180 | start_date = end_date - datetime.timedelta(days=365) 181 | random_date = start_date + datetime.timedelta(days=random.randint(0, 365)) 182 | return random_date.strftime("%Y-%m-%d") 183 | 184 | 185 | # Generate CSV data 186 | with open("./data/sample_data.csv", "w", newline="") as csvfile: 187 | fieldnames = [ 188 | "Customer_ID", 189 | "Customer_Name", 190 | "Age", 191 | "Gender", 192 | "Purchase_Amount", 193 | "Purchase_Date", 194 | ] 195 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 196 | 197 | writer.writeheader() 198 | for i in range(100): 199 | writer.writerow( 200 | { 201 | "Customer_ID": i + 1, 202 | "Customer_Name": generate_name(), 203 | "Age": generate_age(), 204 | "Gender": generate_gender(), 205 | "Purchase_Amount": generate_purchase_amount(), 206 | "Purchase_Date": generate_purchase_date(), 207 | } 208 | ) 209 | 210 | print("CSV file generated successfully!") 211 | -------------------------------------------------------------------------------- /2-data-extract-load-solutions.py: -------------------------------------------------------------------------------- 1 | # Extract: Process to pull data from Source system 2 | # Load: Process to write data to a destination system 3 | 4 | # Common upstream & downstream systems 5 | # OLTP Databases: Postgres, MySQL, sqlite3, etc 6 | # OLAP Databases: Snowflake, BigQuery, Clickhouse, DuckDB, etc 7 | # Cloud data storage: AWS S3, GCP Cloud Store, Minio, etc 8 | # Queue systems: Kafka, Redpanda, etc 9 | # API 10 | # Local disk: csv, excel, json, xml files 11 | # SFTP\FTP server 12 | 13 | # Databases: When reading or writing to a database we use a database driver. Database drivers are libraries that we can use to read or write to a database. 14 | # Question: How do you read data from a sqlite3 database and write to a DuckDB database? 15 | import sqlite3 # we import the sqlite3 database driver 16 | 17 | # Connect to the SQLite database 18 | sqlite_conn = sqlite3.connect( 19 | "tpch.db" 20 | ) # Typically this will involve a connection string, sqlite3 db is stored as a file 21 | 22 | # Fetch data from the SQLite Customer table using conn.execute 23 | customers = sqlite_conn.execute( 24 | "SELECT * FROM Customer" 25 | ).fetchall() # Fetch data from the SQLite Customer table 26 | 27 | import duckdb # duckdb database driver 28 | 29 | duckdb_conn = duckdb.connect("duckdb.db") # Duckdb connection string 30 | # Insert data into the DuckDB Customer table 31 | insert_query = f""" 32 | INSERT INTO Customer (customer_id, zipcode, city, state_code, datetime_created, datetime_updated) 33 | VALUES (?, ?, ?, ?, ?, ?) 34 | """ # Insert into query 35 | 36 | duckdb_conn.executemany(insert_query, customers) 37 | 38 | # Commit and close the connections 39 | # Commit tells the DB connection to send the data to the database and commit it, if you don't commit the data will not be inserted 40 | duckdb_conn.commit() 41 | 42 | # We should close the connection, as DB connections are expensive 43 | sqlite_conn.close() 44 | duckdb_conn.close() 45 | 46 | # Cloud storage 47 | # Question: How do you read data from the S3 location given below and write the data to a DuckDB database? 48 | # Data source: https://docs.opendata.aws/noaa-ghcn-pds/readme.html station data at path "csv.gz/by_station/ASN00002022.csv.gz" 49 | # Hint: Use boto3 client with UNSIGNED config to access the S3 bucket 50 | # Hint: The data will be zipped you have to unzip it 51 | 52 | import csv 53 | import gzip 54 | from io import StringIO 55 | 56 | import boto3 57 | import duckdb 58 | from botocore import UNSIGNED 59 | from botocore.client import Config 60 | 61 | # AWS S3 bucket and file details 62 | bucket_name = "noaa-ghcn-pds" 63 | file_key = "csv.gz/by_station/ASN00002022.csv.gz" 64 | # Create a boto3 client with anonymous access 65 | s3_client = boto3.client("s3", config=Config(signature_version=UNSIGNED)) 66 | 67 | # Download the CSV file from S3 68 | response = s3_client.get_object(Bucket=bucket_name, Key=file_key) 69 | compressed_data = response["Body"].read() 70 | 71 | # Decompress the gzip data 72 | csv_data = gzip.decompress(compressed_data).decode("utf-8") 73 | 74 | # Read the CSV file using csv.reader 75 | csv_reader = csv.reader(StringIO(csv_data)) 76 | data = list(csv_reader) 77 | # Connect to the DuckDB database (assume WeatherData table exists) 78 | duckdb_conn = duckdb.connect("duckdb.db") 79 | 80 | # Insert data into the DuckDB WeatherData table 81 | insert_query = """ 82 | INSERT INTO WeatherData (id, date, element, value, m_flag, q_flag, s_flag, obs_time) 83 | VALUES (?, ?, ?, ?, ?, ?, ?, ?) 84 | """ 85 | 86 | duckdb_conn.executemany(insert_query, data[:100000]) 87 | 88 | # Commit and close the connection 89 | duckdb_conn.commit() 90 | duckdb_conn.close() 91 | 92 | # API 93 | # Question: How do you read data from the CoinCap API given below and write the data to a DuckDB database? 94 | # URL: "https://api.coincap.io/v2/exchanges" 95 | # Hint: use requests library 96 | 97 | import duckdb 98 | import requests 99 | 100 | # Define the API endpoint 101 | url = "https://api.coincap.io/v2/exchanges" 102 | 103 | # Fetch data from the CoinCap API 104 | response = requests.get(url) 105 | data = response.json()["data"] 106 | 107 | # Connect to the DuckDB database 108 | duckdb_conn = duckdb.connect("duckdb.db") 109 | 110 | # Insert data into the DuckDB Exchanges table 111 | insert_query = """ 112 | INSERT INTO Exchanges (id, name, rank, percentTotalVolume, volumeUsd, tradingPairs, socket, exchangeUrl, updated) 113 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 114 | """ 115 | # Prepare data for insertion 116 | # Hint: Why are we changing the data type? 117 | insert_data = [ 118 | ( 119 | exchange["exchangeId"], 120 | exchange["name"], 121 | int(exchange["rank"]), 122 | ( 123 | float(exchange["percentTotalVolume"]) 124 | if exchange["percentTotalVolume"] 125 | else None 126 | ), 127 | float(exchange["volumeUsd"]) if exchange["volumeUsd"] else None, 128 | exchange["tradingPairs"], 129 | exchange["socket"], 130 | exchange["exchangeUrl"], 131 | int(exchange["updated"]), 132 | ) 133 | for exchange in data 134 | ] 135 | 136 | duckdb_conn.executemany(insert_query, insert_data) 137 | 138 | # Commit and close the connection 139 | duckdb_conn.commit() 140 | duckdb_conn.close() 141 | 142 | # Local disk 143 | # Question: How do you read a CSV file from local disk and write it to a database? 144 | # Look up open function with csvreader for python 145 | 146 | import csv 147 | 148 | data_location = "./data/customers.csv" 149 | with open(data_location, "r", newline="") as csvfile: 150 | csvreader = csv.reader(csvfile) 151 | next(csvreader) # Skip header row 152 | for row in csvreader: 153 | print(row) 154 | 155 | # Web scraping 156 | # Questions: Use beatiful soup to scrape the below website and print all the links in that website 157 | # URL of the website to scrape 158 | 159 | import requests 160 | from bs4 import BeautifulSoup 161 | 162 | # URL of the website to scrape 163 | url = 'https://example.com' 164 | 165 | # Send a GET request to the website 166 | response = requests.get(url) 167 | 168 | # Parse the HTML content of the webpage 169 | soup = BeautifulSoup(response.text, 'html.parser') 170 | 171 | # Example: Find and print all the links on the webpage 172 | for link in soup.find_all('a'): 173 | print(link.get('href')) 174 | 175 | -------------------------------------------------------------------------------- /data/customers.csv: -------------------------------------------------------------------------------- 1 | customer_id,zipcode,city,state_code,datetime_created,datetime_updated 2 | 1,14409,franca,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 3 | 2,09790,sao bernardo do campo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 4 | 3,01151,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 5 | 4,08775,mogi das cruzes,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 6 | 5,13056,campinas,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 7 | 6,89254,jaragua do sul,SC,2017-10-18 00:00:00,2017-10-18 00:00:00 8 | 7,04534,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 9 | 8,35182,timoteo,MG,2017-10-18 00:00:00,2017-10-18 00:00:00 10 | 9,81560,curitiba,PR,2017-10-18 00:00:00,2017-10-18 00:00:00 11 | 10,30575,belo horizonte,MG,2017-10-18 00:00:00,2017-10-18 00:00:00 12 | 11,39400,montes claros,MG,2017-10-18 00:00:00,2017-10-18 00:00:00 13 | 12,20231,rio de janeiro,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00 14 | 13,18682,lencois paulista,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 15 | 14,05704,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 16 | 15,95110,caxias do sul,RS,2017-10-18 00:00:00,2017-10-18 00:00:00 17 | 16,13412,piracicaba,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 18 | 17,22750,rio de janeiro,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00 19 | 18,07124,guarulhos,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 20 | 19,05416,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 21 | 20,68485,pacaja,PA,2017-10-18 00:00:00,2017-10-18 00:00:00 22 | 21,88034,florianopolis,SC,2017-10-18 00:00:00,2017-10-18 00:00:00 23 | 22,74914,aparecida de goiania,GO,2017-10-18 00:00:00,2017-10-18 00:00:00 24 | 23,05713,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 25 | 24,82820,curitiba,PR,2017-10-18 00:00:00,2017-10-18 00:00:00 26 | 25,08225,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 27 | 26,09121,santo andre,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 28 | 27,74310,goiania,GO,2017-10-18 00:00:00,2017-10-18 00:00:00 29 | 28,04571,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 30 | 29,29311,cachoeiro de itapemirim,ES,2017-10-18 00:00:00,2017-10-18 00:00:00 31 | 30,05528,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 32 | 31,12235,sao jose dos campos,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 33 | 32,18130,sao roque,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 34 | 33,42800,camacari,BA,2017-10-18 00:00:00,2017-10-18 00:00:00 35 | 34,27525,resende,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00 36 | 35,81750,curitiba,PR,2017-10-18 00:00:00,2017-10-18 00:00:00 37 | 36,13175,sumare,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 38 | 37,07170,guarulhos,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 39 | 38,93415,novo hamburgo,RS,2017-10-18 00:00:00,2017-10-18 00:00:00 40 | 39,65075,sao luis,MA,2017-10-18 00:00:00,2017-10-18 00:00:00 41 | 40,88104,sao jose,SC,2017-10-18 00:00:00,2017-10-18 00:00:00 42 | 41,07176,guarulhos,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 43 | 42,35960,santa barbara,MG,2017-10-18 00:00:00,2017-10-18 00:00:00 44 | 43,05727,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 45 | 44,07053,guarulhos,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 46 | 45,14026,ribeirao preto,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 47 | 46,30320,belo horizonte,MG,2017-10-18 00:00:00,2017-10-18 00:00:00 48 | 47,38300,ituiutaba,MG,2017-10-18 00:00:00,2017-10-18 00:00:00 49 | 48,18740,taquarituba,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 50 | 49,83085,sao jose dos pinhais,PR,2017-10-18 00:00:00,2017-10-18 00:00:00 51 | 50,89254,jaragua do sul,SC,2017-10-18 00:00:00,2017-10-18 00:00:00 52 | 51,05351,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 53 | 52,39406,montes claros,MG,2017-10-18 00:00:00,2017-10-18 00:00:00 54 | 53,14860,barrinha,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 55 | 54,21310,rio de janeiro,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00 56 | 55,23970,parati,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00 57 | 56,79804,dourados,MS,2017-10-18 00:00:00,2017-10-18 00:00:00 58 | 57,05017,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 59 | 58,75388,trindade,GO,2017-10-18 00:00:00,2017-10-18 00:00:00 60 | 59,85808,cascavel,PR,2017-10-18 00:00:00,2017-10-18 00:00:00 61 | 60,60140,fortaleza,CE,2017-10-18 00:00:00,2017-10-18 00:00:00 62 | 61,72270,brasilia,DF,2017-10-18 00:00:00,2017-10-18 00:00:00 63 | 62,02075,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 64 | 63,96015,pelotas,RS,2017-10-18 00:00:00,2017-10-18 00:00:00 65 | 64,90010,porto alegre,RS,2017-10-18 00:00:00,2017-10-18 00:00:00 66 | 65,22440,rio de janeiro,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00 67 | 66,13323,salto,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 68 | 67,30190,belo horizonte,MG,2017-10-18 00:00:00,2017-10-18 00:00:00 69 | 68,13212,jundiai,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 70 | 69,29307,cachoeiro de itapemirim,ES,2017-10-18 00:00:00,2017-10-18 00:00:00 71 | 70,12280,cacapava,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 72 | 71,60336,fortaleza,CE,2017-10-18 00:00:00,2017-10-18 00:00:00 73 | 72,11310,sao vicente,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 74 | 73,38408,uberlandia,MG,2017-10-18 00:00:00,2017-10-18 00:00:00 75 | 74,37720,botelhos,MG,2017-10-18 00:00:00,2017-10-18 00:00:00 76 | 75,24431,sao goncalo,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00 77 | 76,05890,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 78 | 77,03733,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 79 | 78,83709,araucaria,PR,2017-10-18 00:00:00,2017-10-18 00:00:00 80 | 79,11347,sao vicente,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 81 | 80,26272,nova iguacu,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00 82 | 81,05415,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 83 | 82,59655,areia branca,RN,2017-10-18 00:00:00,2017-10-18 00:00:00 84 | 83,04548,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 85 | 84,28010,campos dos goytacazes,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00 86 | 85,13573,sao carlos,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 87 | 86,02175,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 88 | 87,37500,itajuba,MG,2017-10-18 00:00:00,2017-10-18 00:00:00 89 | 88,90670,porto alegre,RS,2017-10-18 00:00:00,2017-10-18 00:00:00 90 | 89,09890,sao bernardo do campo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 91 | 90,13321,salto,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 92 | 91,44380,cruz das almas,BA,2017-10-18 00:00:00,2017-10-18 00:00:00 93 | 92,27700,vassouras,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00 94 | 93,44033,feira de santana,BA,2017-10-18 00:00:00,2017-10-18 00:00:00 95 | 94,04537,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 96 | 95,71540,brasilia,DF,2017-10-18 00:00:00,2017-10-18 00:00:00 97 | 96,13569,sao carlos,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 98 | 97,05565,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 99 | 98,03636,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00 100 | 99,24120,niteroi,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00 101 | 100,24120,niteroi,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00 102 | -------------------------------------------------------------------------------- /1-basics-solutions.py: -------------------------------------------------------------------------------- 1 | 2 | # Variable: A storage location identified by its name, containing some value. 3 | 4 | # Value of 10 is assigned to variable a and 20 to variable b 5 | a = 10 6 | b = 20 7 | 8 | # We can do any operation (arithmetic for numbers, string transformation for text) on variables 9 | 10 | # Question: What is the result of a + b? 11 | c = a + b 12 | print(c) # Will print 30 13 | 14 | s = ' Some string ' 15 | # We can perform an operation on this string, for example, let's remove the empty spaces in front of and behind the string 16 | 17 | # Question: How do you remove the empty spaces in front of and behind the string s? 18 | print(s.strip()) 19 | 20 | # Data Structures are ways of representing data, each has its own pros and cons and places that they are the right fit. 21 | 22 | ## List: A collection of elements that can be accessed by knowing the location (aka index) of the element 23 | l = [1, 2, 3, 4] 24 | 25 | # Question: How do you access the elements in index 0 and 3? 26 | print(l[0]) # Will print 1 27 | print(l[3]) # Will print 4 28 | ## NOTE: lists retain the order of elements in it but dictionary doesn't 29 | 30 | ## Dictionary: A collection of key-value pairs, where each key is mapped to a value using a hash function. Provides fast data retrieval based on keys. 31 | d = {'a': 1, 'b': 2} 32 | 33 | # Question: How do you access the values associated with keys 'a' and 'b'? 34 | print(d.get('a')) # Will print 1 35 | print(d.get('b')) # Will print 2 36 | ## NOTE: The dictionary cannot have duplicate keys 37 | 38 | ## Set: A collection of unique elements that do not allow duplicates 39 | my_set = set() 40 | my_set.add(10) 41 | my_set.add(10) 42 | my_set.add(10) 43 | 44 | # Question: What will be the output of my_set? 45 | print(my_set) # This will only show 10, since the set only keeps unique values 46 | 47 | ## Tuple: A collection of immutable (non-changeable) elements, tuples retain their order once created. 48 | my_tuple = (1, 'hello', 3.14) 49 | 50 | # Question: What is the value of my_tuple? 51 | print(my_tuple) # Output: (1, 'hello', 3.14) 52 | 53 | ## Accessing elements by index 54 | 55 | # Question: How do you access the elements in index 0 and 1 of my_tuple? 56 | print(my_tuple[0]) # Output: 1 57 | print(my_tuple[1]) # Output: 'hello' 58 | 59 | ## Counting occurrences of an element 60 | count_tuple = (1, 2, 3, 1, 1, 2) 61 | 62 | # Question: How many times does the number 1 appear in count_tuple? 63 | print(count_tuple.count(1)) # Output: 3 64 | 65 | ## Finding the index of an element 66 | 67 | # Question: What is the index of the first occurrence of the number 2 in count_tuple? 68 | print(count_tuple.index(2)) # Output: 1 69 | 70 | # Loop allows a specific chunk of code to be repeated a certain number of times 71 | # Example: We can use a loop to print numbers 0 through 10 72 | for i in range(11): 73 | print(i) 74 | 75 | ## We can loop through our data structures as shown below 76 | # Question: How do you loop through a list and print its elements? 77 | for elt in l: 78 | print(elt) # Or any operation you may want to do 79 | ## We can do a similar loop for tuples and sets 80 | 81 | ## Dictionary loop 82 | # Question: How do you loop through a dictionary and print its keys and values? 83 | for k, v in d.items(): 84 | print(f'Key: {k}, Value: {v}') # Print key and values in dictionary 85 | 86 | ## Comprehension is a shorthand way of writing a loop 87 | ## For example, we can use the below to multiply every element in list l with 2 88 | print([elt*2 for elt in l]) 89 | 90 | # Functions: A block of code that can be re-used as needed. This allows for us to have logic defined in one place, making it easy to maintain and use. 91 | ## For example, let's create a simple function that takes a list as an input and returns another list whose values are greater than 3 92 | 93 | def gt_three(input_list): 94 | return [elt for elt in input_list if elt > 3] 95 | ## NOTE: we use list comprehension with filtering in the above function 96 | 97 | list_1 = [1, 2, 3, 4, 5, 6] 98 | # Question: How do you use the gt_three function to filter elements greater than 3 from list_1? 99 | print(gt_three(list_1)) # Will print [4, 5, 6] 100 | 101 | list_2 = [1, 2, 3, 1, 1, 1] 102 | # Question: What will be the output of gt_three(list_2)? 103 | print(gt_three(list_2)) # Will print [] 104 | 105 | # Classes and Objects 106 | # Think of a class as a blueprint and objects as things created based on that blueprint 107 | # You can define classes in Python as shown below 108 | 109 | class DataExtractor: 110 | 111 | def __init__(self, some_value): 112 | self.some_value = some_value 113 | 114 | def get_connection(self): 115 | # Some logic 116 | # some_value is accessible using self.some_value 117 | pass 118 | 119 | def close_connection(self): 120 | # Some logic 121 | # some_value is accessible using self.some_value 122 | pass 123 | 124 | # Question: How do you create a DataExtractor object and print its some_value attribute? 125 | de_object = DataExtractor(10) 126 | print(de_object.some_value) # Will print 10 127 | 128 | # Libraries are code that can be reused. 129 | 130 | # Python comes with some standard libraries to do common operations, 131 | # such as the datetime library to work with time (although there are better libraries) 132 | from datetime import datetime # You can import library or your code from another file with the import statement 133 | 134 | # Question: How do you print the current date in the format 'YYYY MM DD'? 135 | print(datetime.now().strftime('%Y %m %d')) # We can use multiple such methods 136 | 137 | # Exception handling: When an error occurs, we need our code to gracefully handle it without just stopping. 138 | ## Here is how we can handle errors when the program is running 139 | try: 140 | # Code that might raise an exception 141 | pass 142 | except Exception as e: 143 | # Code that runs if the exception occurs 144 | pass 145 | else: 146 | # Code that runs if no exception occurs 147 | pass 148 | finally: 149 | # Code that always runs, regardless of exceptions 150 | pass 151 | 152 | ## For example, let's consider exception handling on accessing an element that is not present in a list l 153 | l = [1, 2, 3, 4, 5] 154 | 155 | # Question: How do you handle an IndexError when accessing an invalid index in a list? 156 | index = 10 157 | try: 158 | # Attempt to access an element at an invalid index 159 | element = l[index] 160 | print(f"Element at index {index} is {element}") 161 | except IndexError: 162 | print(f"Error: Index {index} is out of range for the list.") 163 | finally: 164 | print("Execution completed.") 165 | # NOTE: in the except block its preferred to specify the exact erro/exception that you want to handle 166 | -------------------------------------------------------------------------------- /3-data-transform-solutions.py: -------------------------------------------------------------------------------- 1 | print( 2 | "################################################################################" 3 | ) 4 | print("Use standard python libraries to do the transformations") 5 | print( 6 | "################################################################################" 7 | ) 8 | import csv 9 | 10 | # Question: How do you read data from a CSV file into a list of dictionaries? 11 | data = [] 12 | with open("./data/sample_data.csv", "r", newline="") as csvfile: 13 | reader = csv.DictReader(csvfile) 14 | for row in reader: 15 | data.append(row) 16 | 17 | # Question: How do you remove duplicate rows based on customer ID? 18 | data_unique = [] 19 | customer_ids_seen = set() 20 | for row in data: 21 | if row["Customer_ID"] not in customer_ids_seen: 22 | data_unique.append(row) 23 | customer_ids_seen.add(row["Customer_ID"]) 24 | else: 25 | print(f'duplicate customer id {row["Customer_ID"]}') 26 | 27 | # Question: How do you handle missing values by replacing them with 0? 28 | for row in data_unique: 29 | if not row["Age"]: 30 | print(f'Customer {row["Customer_Name"]} does not have Age value') 31 | row["Age"] = 0 32 | if not row["Purchase_Amount"]: 33 | row["Purchase_Amount"] = 0.0 34 | 35 | # Question: How do you remove outliers such as age > 100 or purchase amount > 1000? 36 | data_cleaned = [ 37 | row 38 | for row in data_unique 39 | if int(row["Age"]) <= 100 and float(row["Purchase_Amount"]) <= 1000 40 | ] 41 | 42 | # Question: How do you convert the Gender column to a binary format (0 for Female, 1 for Male)? 43 | for row in data_cleaned: 44 | if row["Gender"] == "Female": 45 | row["Gender"] = 0 46 | elif row["Gender"] == "Male": 47 | row["Gender"] = 1 48 | 49 | # Question: How do you split the Customer_Name column into separate First_Name and Last_Name columns? 50 | for row in data_cleaned: 51 | first_name, last_name = row["Customer_Name"].split(" ", 1) 52 | row["First_Name"] = first_name 53 | row["Last_Name"] = last_name 54 | del row["Customer_Name"] 55 | 56 | # Question: How do you calculate the total purchase amount by Gender? 57 | total_purchase_by_gender = {} 58 | for row in data_cleaned: 59 | total_purchase_by_gender[row["Gender"]] += float(row["Purchase_Amount"]) 60 | 61 | # Question: How do you calculate the average purchase amount by Age group? 62 | # assume age_groups is the grouping we want 63 | # hint: Why do we convert to float? 64 | age_groups = {"18-30": [], "31-40": [], "41-50": [], "51-60": [], "61-70": []} 65 | for row in data_cleaned: 66 | age = int(row["Age"]) 67 | if age <= 30: 68 | age_groups["18-30"].append(float(row["Purchase_Amount"])) 69 | elif age <= 40: 70 | age_groups["31-40"].append(float(row["Purchase_Amount"])) 71 | elif age <= 50: 72 | age_groups["41-50"].append(float(row["Purchase_Amount"])) 73 | elif age <= 60: 74 | age_groups["51-60"].append(float(row["Purchase_Amount"])) 75 | else: 76 | age_groups["61-70"].append(float(row["Purchase_Amount"])) 77 | 78 | average_purchase_by_age_group = { 79 | group: sum(amounts) / len(amounts) for group, amounts in age_groups.items() 80 | } 81 | 82 | # Question: How do you print the results for total purchase amount by Gender and average purchase amount by Age group? 83 | print("Total purchase amount by Gender:", total_purchase_by_gender) 84 | print("Average purchase amount by Age group:", average_purchase_by_age_group) 85 | 86 | print( 87 | "################################################################################" 88 | ) 89 | print("Use DuckDB to do the transformations") 90 | print( 91 | "################################################################################" 92 | ) 93 | 94 | import duckdb 95 | 96 | # Question: How do you connect to DuckDB and load data from a CSV file into a DuckDB table? 97 | # Connect to DuckDB and load data 98 | con = duckdb.connect(database=":memory:", read_only=False) 99 | con.execute( 100 | "CREATE TABLE data (Customer_ID INTEGER, Customer_Name VARCHAR, Age INTEGER, Gender VARCHAR, Purchase_Amount FLOAT, Purchase_Date DATE)" 101 | ) 102 | 103 | # Read data from CSV file into DuckDB table 104 | con.execute("COPY data FROM './data/sample_data.csv' WITH HEADER CSV") 105 | 106 | # Question: How do you remove duplicate rows based on customer ID in DuckDB? 107 | con.execute("CREATE TABLE data_unique AS SELECT DISTINCT * FROM data") 108 | 109 | # Question: How do you handle missing values by replacing them with 0 in DuckDB? 110 | con.execute( 111 | "CREATE TABLE data_cleaned_missing AS SELECT \ 112 | Customer_ID, Customer_Name, \ 113 | COALESCE(Age, 0) AS Age, \ 114 | Gender, \ 115 | COALESCE(Purchase_Amount, 0.0) AS Purchase_Amount, \ 116 | Purchase_Date \ 117 | FROM data_unique" 118 | ) 119 | 120 | # Question: How do you remove outliers (e.g., age > 100 or purchase amount > 1000) in DuckDB? 121 | con.execute( 122 | "CREATE TABLE data_cleaned_outliers AS SELECT * FROM data_cleaned_missing \ 123 | WHERE Age <= 100 AND Purchase_Amount <= 1000" 124 | ) 125 | 126 | # Question: How do you convert the Gender column to a binary format (0 for Female, 1 for Male) in DuckDB? 127 | con.execute( 128 | "CREATE TABLE data_cleaned_gender AS SELECT *, \ 129 | CASE WHEN Gender = 'Female' THEN 0 ELSE 1 END AS Gender_Binary \ 130 | FROM data_cleaned_outliers" 131 | ) 132 | 133 | # Question: How do you split the Customer_Name column into separate First_Name and Last_Name columns in DuckDB? 134 | con.execute( 135 | "CREATE TABLE data_cleaned AS SELECT \ 136 | Customer_ID, \ 137 | SPLIT_PART(Customer_Name, ' ', 1) AS First_Name, \ 138 | SPLIT_PART(Customer_Name, ' ', 2) AS Last_Name, \ 139 | Age, Gender_Binary, Purchase_Amount, Purchase_Date \ 140 | FROM data_cleaned_gender" 141 | ) 142 | 143 | # Question: How do you calculate the total purchase amount by Gender in DuckDB? 144 | total_purchase_by_gender = con.execute( 145 | "SELECT Gender_Binary, SUM(Purchase_Amount) AS Total_Purchase_Amount \ 146 | FROM data_cleaned_gender \ 147 | GROUP BY Gender_Binary" 148 | ).fetchall() 149 | 150 | # Question: How do you calculate the average purchase amount by Age group in DuckDB? 151 | average_purchase_by_age_group = con.execute( 152 | "SELECT CASE \ 153 | WHEN Age BETWEEN 18 AND 30 THEN '18-30' \ 154 | WHEN Age BETWEEN 31 AND 40 THEN '31-40' \ 155 | WHEN Age BETWEEN 41 AND 50 THEN '41-50' \ 156 | WHEN Age BETWEEN 51 AND 60 THEN '51-60' \ 157 | ELSE '61-70' END AS Age_Group, \ 158 | AVG(Purchase_Amount) AS Average_Purchase_Amount \ 159 | FROM data_cleaned \ 160 | GROUP BY Age_Group" 161 | ).fetchall() 162 | 163 | # Question: How do you print the results for total purchase amount by Gender and average purchase amount by Age group in DuckDB? 164 | print("====================== Results ======================") 165 | print("Total purchase amount by Gender:") 166 | print(total_purchase_by_gender) 167 | print("Average purchase amount by Age group:") 168 | print(average_purchase_by_age_group) 169 | --------------------------------------------------------------------------------