├── __init__.py
├── .tool-versions
├── .gitignore
├── assets
    ├── cs.png
    ├── cs2.png
    └── repl.png
├── cleaning_functions.py
├── 4-data-quality-questions.py
├── tests
    ├── test_cleaning_functions-questions.py
    └── test_cleaning_functions_solutions.py
├── requirements.txt
├── 4-data-quality-solutions.py
├── README.md
├── 2-data-extract-load-questions.py
├── 3-data-transform-questions.py
├── 1-basics-questions.py
├── data
    ├── sample_data.csv
    └── customers.csv
├── setup_db.py
├── 2-data-extract-load-solutions.py
├── 1-basics-solutions.py
└── 3-data-transform-solutions.py


/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.tool-versions:
--------------------------------------------------------------------------------
1 | python 3.11.1
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | duckdb.db
2 | tpch.db
3 | myenv/
4 | 
5 | __pycache__/
6 | 


--------------------------------------------------------------------------------
/assets/cs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/python_essentials_for_data_engineers/HEAD/assets/cs.png


--------------------------------------------------------------------------------
/assets/cs2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/python_essentials_for_data_engineers/HEAD/assets/cs2.png


--------------------------------------------------------------------------------
/assets/repl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josephmachado/python_essentials_for_data_engineers/HEAD/assets/repl.png


--------------------------------------------------------------------------------
/cleaning_functions.py:
--------------------------------------------------------------------------------
 1 | # Simple function to remove duplicates
 2 | 
 3 | 
 4 | def remove_duplicates(data, unique_key):
 5 |     data_unique = []
 6 |     unique_key_set = set()
 7 | 
 8 |     for row in data:
 9 |         if row[unique_key] not in unique_key_set:
10 |             data_unique.append(row)
11 |             unique_key_set.add(row[unique_key])
12 |     else:
13 |         print(f"duplicate customer id")
14 | 
15 |     return data_unique
16 | 


--------------------------------------------------------------------------------
/4-data-quality-questions.py:
--------------------------------------------------------------------------------
 1 | import polars as pl
 2 | from cuallee import Check, CheckLevel
 3 | 
 4 | # Read CSV file into Polars DataFrame
 5 | df = pl.read_csv("./data/sample_data.csv")
 6 | 
 7 | # Question: Check for Nulls on column Id and that Customer_ID column is unique
 8 | # check docs at https://canimus.github.io/cuallee/polars/ on how to define a check and run it.
 9 | # you will end up with a dataframe of results, check that the `status` column does not have any "FAIL" in it
10 | 
11 | 


--------------------------------------------------------------------------------
/tests/test_cleaning_functions-questions.py:
--------------------------------------------------------------------------------
 1 | from cleaning_functions import remove_duplicates
 2 | 
 3 | 
 4 | def test_remove_duplicates():
 5 |     # Define sample data and unique_key
 6 |     data = []
 7 |     unique_key = ""
 8 | 
 9 |     # Call the function
10 |     unique_data = remove_duplicates(data, unique_key)
11 | 
12 |     # Assert that duplicates were removed
13 |     # assert len(unique_data) == some number based on your input
14 |     # Assert the actual values
15 |     expected_data = [
16 |     ]
17 |     assert unique_data == expected_data
18 | 
19 | 
20 | # Run this with the command python -m pytest ./tests
21 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | awswrangler==3.7.3
 2 | black==24.4.2
 3 | boto3==1.34.117
 4 | botocore==1.34.117
 5 | certifi==2024.6.2
 6 | charset-normalizer==3.3.2
 7 | click==8.1.7
 8 | cuallee==0.10.3
 9 | duckdb==1.0.0
10 | idna==3.7
11 | iniconfig==2.0.0
12 | isort==5.13.2
13 | jmespath==1.0.1
14 | mypy-extensions==1.0.0
15 | numpy==1.26.4
16 | packaging==24.0
17 | pandas==2.2.2
18 | pathspec==0.12.1
19 | platformdirs==4.2.2
20 | pluggy==1.5.0
21 | polars==0.20.31
22 | pyarrow==16.1.0
23 | pytest==8.2.1
24 | python-dateutil==2.9.0.post0
25 | pytz==2024.1
26 | requests==2.32.3
27 | s3transfer==0.10.1
28 | six==1.16.0
29 | toolz==0.12.1
30 | typing_extensions==4.12.1
31 | tzdata==2024.1
32 | urllib3==2.2.1
33 | 


--------------------------------------------------------------------------------
/4-data-quality-solutions.py:
--------------------------------------------------------------------------------
 1 | import polars as pl
 2 | from cuallee import Check, CheckLevel
 3 | 
 4 | # Read CSV file into Polars DataFrame
 5 | df = pl.read_csv("./data/sample_data.csv")
 6 | 
 7 | # Question: Check for Nulls on column Id and that Customer_ID column is unique
 8 | # check docs at https://canimus.github.io/cuallee/polars/
 9 | # you will end up with a dataframe of results, check that the `status` column does not have any "FAIL" in it
10 | 
11 | check = Check(CheckLevel.ERROR, "Completeness")
12 | validation_results_df = (
13 |     check.is_complete("Customer_ID").is_unique("Customer_ID").validate(df)
14 | )
15 | print(validation_results_df)
16 | 
17 | results = validation_results_df["status"].to_list()
18 | assert "FAIL" not in results == True
19 | 


--------------------------------------------------------------------------------
/tests/test_cleaning_functions_solutions.py:
--------------------------------------------------------------------------------
 1 | from cleaning_functions import remove_duplicates
 2 | 
 3 | 
 4 | def test_remove_duplicates():
 5 |     # Define sample data and unique key
 6 |     data = [
 7 |         {"Customer_ID": 1, "Name": "Alice"},
 8 |         {"Customer_ID": 2, "Name": "Bob"},
 9 |         {"Customer_ID": 1, "Name": "Alice"},  # Duplicate
10 |         {"Customer_ID": 3, "Name": "Charlie"},
11 |         {"Customer_ID": 2, "Name": "Bob"},  # Duplicate
12 |     ]
13 |     unique_key = "Customer_ID"
14 | 
15 |     # Call the function
16 |     unique_data = remove_duplicates(data, unique_key)
17 | 
18 |     # Assert that duplicates were removed
19 |     assert len(unique_data) == 3
20 |     # Assert the actual values
21 |     expected_data = [
22 |         {"Customer_ID": 1, "Name": "Alice"},
23 |         {"Customer_ID": 2, "Name": "Bob"},
24 |         {"Customer_ID": 3, "Name": "Charlie"},
25 |     ]
26 |     assert unique_data == expected_data
27 | 
28 | 
29 | # Run this with the command python -m pytest ./tests
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | * [Python Essentials for Data Engineers](#python-essentials-for-data-engineers)
 4 |     * [Run on Codespaces](#run-on-codespaces)
 5 |     * [Running on your laptop](#running-on-your-laptop)
 6 |     * [Using python REPL](#using-python-repl)
 7 | 
 8 | Code for Blog at: [Python Essentials for Data Engineers](https://www.startdataengineering.com/post/python-for-de/).
 9 | 
10 | # Python Essentials for Data Engineers 
11 | 
12 | ## Run on Codespaces
13 | 
14 | Open codespaces and wait for codespaces to setup. The process of opening codespaces and waiting for completion is shown below.
15 | 
16 | **NOTE**: Make sure to turn off codespaces, you only have limited free usage per month.
17 | 
18 | ![Open codespace](./assets/cs.png)
19 | ![Wait for codespace to setup](./assets/cs2.png)
20 | 
21 | ## Running on your laptop
22 | 
23 | Clone the repo, cd into it and setup the virtual environment as shown below.
24 | 
25 | ```bash
26 | git clone https://github.com/josephmachado/python_essentials_for_data_engineers.git
27 | cd python_essentials_for_data_engineers
28 | 
29 | python -m venv myenv
30 | source myenv/bin/activate
31 | pip install -r requirements.txt
32 | 
33 | # open python REPL with 
34 | python
35 | ```
36 | 
37 | ## Using python REPL
38 | 
39 | ![REPL](./assets/repl.png)
40 | 
41 | In the Python REPL you can try out the commands and do the exercises.
42 | 
43 | To run pytest (under ./tests folder) you will need to run the `python -m pytest ./tests` command.
44 | 
45 | The questions are files with the prefix `-questions.py`, use these as starting points to practice python for data engineering. While the workbooks have solutions, there are multiple ways to do the same thing, and as long as you get the correct answer, you should be good.
46 | 


--------------------------------------------------------------------------------
/2-data-extract-load-questions.py:
--------------------------------------------------------------------------------
 1 | # Extract: Process to pull data from Source system
 2 | # Load: Process to write data to a destination system
 3 | 
 4 | # Common upstream & downstream systems
 5 | # OLTP Databases: Postgres, MySQL, sqlite3, etc
 6 | # OLAP Databases: Snowflake, BigQuery, Clickhouse, DuckDB, etc
 7 | # Cloud data storage: AWS S3, GCP Cloud Store, Minio, etc
 8 | # Queue systems: Kafka, Redpanda, etc
 9 | # API
10 | # Local disk: csv, excel, json, xml files
11 | # SFTP\FTP server
12 | 
13 | # Databases: When reading or writing to a database we use a database driver. Database drivers are libraries that we can use to read or write to a database.
14 | # Question: How do you read data from a sqlite3 database and write to a DuckDB database?
15 | # Hint: Look at importing the database libraries for sqlite3 and duckdb and create connections to talk to the respective databases
16 | 
17 | # Fetch data from the SQLite Customer table
18 | 
19 | # Insert data into the DuckDB Customer table
20 | 
21 | # Hint: Look for Commit and close the connections
22 | # Commit tells the DB connection to send the data to the database and commit it, if you don't commit the data will not be inserted
23 | 
24 | # We should close the connection, as DB connections are expensive
25 | 
26 | # Cloud storage
27 | # Question: How do you read data from the S3 location given below and write the data to a DuckDB database?
28 | # Data source: https://docs.opendata.aws/noaa-ghcn-pds/readme.html station data at path "csv.gz/by_station/ASN00002022.csv.gz"
29 | # Hint: Use boto3 client with UNSIGNED config to access the S3 bucket
30 | # Hint: The data will be zipped you have to unzip it and decode it to utf-8
31 | 
32 | # AWS S3 bucket and file details
33 | bucket_name = "noaa-ghcn-pds"
34 | file_key = "csv.gz/by_station/ASN00002022.csv.gz"
35 | # Create a boto3 client with anonymous access
36 | 
37 | # Download the CSV file from S3
38 | # Decompress the gzip data
39 | # Read the CSV file using csv.reader
40 | # Connect to the DuckDB database (assume WeatherData table exists)
41 | 
42 | # Insert data into the DuckDB WeatherData table
43 | 
44 | # API
45 | # Question: How do you read data from the CoinCap API given below and write the data to a DuckDB database?
46 | # URL: "https://api.coincap.io/v2/exchanges"
47 | # Hint: use requests library
48 | 
49 | # Define the API endpoint
50 | url = "https://api.coincap.io/v2/exchanges"
51 | 
52 | # Fetch data from the CoinCap API
53 | # Connect to the DuckDB database
54 | 
55 | # Insert data into the DuckDB Exchanges table
56 | # Prepare data for insertion
57 | # Hint: Ensure that the data types of the data to be inserted is compatible with DuckDBs data column types in ./setup_db.py
58 | 
59 | 
60 | # Local disk
61 | # Question: How do you read a CSV file from local disk and write it to a database?
62 | # Look up open function with csvreader for python
63 | 
64 | # Web scraping
65 | # Questions: Use beatiful soup to scrape the below website and print all the links in that website
66 | # URL of the website to scrape
67 | url = 'https://example.com'
68 | 


--------------------------------------------------------------------------------
/3-data-transform-questions.py:
--------------------------------------------------------------------------------
 1 | print(
 2 |     "################################################################################"
 3 | )
 4 | print("Use standard python libraries to do the transformations")
 5 | print(
 6 |     "################################################################################"
 7 | )
 8 | 
 9 | # Question: How do you read data from a CSV file at ./data/sample_data.csv into a list of dictionaries?
10 | 
11 | # Question: How do you remove duplicate rows based on customer ID?
12 | 
13 | # Question: How do you handle missing values by replacing them with 0?
14 | 
15 | # Question: How do you remove outliers such as age > 100 or purchase amount > 1000?
16 | 
17 | # Question: How do you convert the Gender column to a binary format (0 for Female, 1 for Male)?
18 | 
19 | # Question: How do you split the Customer_Name column into separate First_Name and Last_Name columns?
20 | 
21 | # Question: How do you calculate the total purchase amount by Gender?
22 | 
23 | # Question: How do you calculate the average purchase amount by Age group?
24 | # assume age_groups is the grouping we want
25 | # hint: Why do we convert to float?
26 | age_groups = {"18-30": [], "31-40": [], "41-50": [], "51-60": [], "61-70": []}
27 | 
28 | # Question: How do you print the results for total purchase amount by Gender and average purchase amount by Age group?
29 | your_total_purchase_amount_by_gender = {} # your results should be assigned to this variable
30 | average_purchase_by_age_group = {} # your results should be assigned to this variable
31 | 
32 | print(f"Total purchase amount by Gender: {your_total_purchase_amount_by_gender}")
33 | print(f"Average purchase amount by Age group: {average_purchase_by_age_group}")
34 | 
35 | print(
36 |     "################################################################################"
37 | )
38 | print("Use DuckDB to do the transformations")
39 | print(
40 |     "################################################################################"
41 | )
42 | 
43 | # Question: How do you connect to DuckDB and load data from a CSV file into a DuckDB table?
44 | # Connect to DuckDB and load data
45 | 
46 | # Read data from CSV file into DuckDB table
47 | 
48 | # Question: How do you remove duplicate rows based on customer ID in DuckDB?
49 | 
50 | # Question: How do you handle missing values by replacing them with 0 in DuckDB?
51 | 
52 | # Question: How do you remove outliers (e.g., age > 100 or purchase amount > 1000) in DuckDB?
53 | 
54 | # Question: How do you convert the Gender column to a binary format (0 for Female, 1 for Male) in DuckDB?
55 | 
56 | # Question: How do you split the Customer_Name column into separate First_Name and Last_Name columns in DuckDB?
57 | 
58 | # Question: How do you calculate the total purchase amount by Gender in DuckDB?
59 | 
60 | # Question: How do you calculate the average purchase amount by Age group in DuckDB?
61 | 
62 | # Question: How do you print the results for total purchase amount by Gender and average purchase amount by Age group in DuckDB?
63 | print("====================== Results ======================")
64 | print("Total purchase amount by Gender:")
65 | print("Average purchase amount by Age group:")
66 | 


--------------------------------------------------------------------------------
/1-basics-questions.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Variable: A storage location identified by its name, containing some value.
  3 | # Question: Assign a value of 10 to variable a and 20 to variable b
  4 | # Question: Store the result of a + b in a variable c and print it. What is the result of a + b?
  5 | 
  6 | s = '  Some string '
  7 | # Question: How do you remove the empty spaces in front of and behind the string s?
  8 | print(s.strip())
  9 | 
 10 | # Data Structures are ways of representing data, each has its own pros and cons and places that they are the right fit.
 11 | ## List: A collection of elements that can be accessed by knowing the location (aka index) of the element
 12 | l = [1, 2, 3, 4]
 13 | 
 14 | # Question: How do you access the elements in index 0 and 3? Print the results.
 15 | ## NOTE: lists retain the order of elements in it but dictionary doesn't
 16 | 
 17 | ## Dictionary: A collection of key-value pairs, where each key is mapped to a value using a hash function. Provides fast data retrieval based on keys.
 18 | d = {'a': 1, 'b': 2}
 19 | 
 20 | # Question: How do you access the values associated with keys 'a' and 'b'?
 21 | ## NOTE: The dictionary cannot have duplicate keys
 22 | 
 23 | ## Set: A collection of unique elements that do not allow duplicates
 24 | my_set = set()
 25 | my_set.add(10)
 26 | my_set.add(10)
 27 | my_set.add(10)
 28 | 
 29 | # Question: What will be the output of my_set?
 30 | 
 31 | ## Tuple: A collection of immutable (non-changeable) elements, tuples retain their order once created.
 32 | my_tuple = (1, 'hello', 3.14)
 33 | 
 34 | # Question: What is the value of my_tuple?
 35 | 
 36 | # Accessing elements by index
 37 | 
 38 | # Question: How do you access the elements in index 0 and 1 of my_tuple?
 39 | 
 40 | # Counting occurrences of an element
 41 | count_tuple = (1, 2, 3, 1, 1, 2)
 42 | 
 43 | # Question: How many times does the number 1 appear in count_tuple?
 44 | 
 45 | # Finding the index of an element
 46 | # Question: What is the index of the first occurrence of the number 2 in count_tuple?
 47 | 
 48 | # Loop allows a specific chunk of code to be repeated a certain number of times
 49 | # Example: We can use a loop to print numbers 0 through 10
 50 | for i in range(11):
 51 |     print(i)
 52 | 
 53 | # We can loop through our data structures as shown below
 54 | # Question: How do you loop through a list and print its elements?
 55 | 
 56 | # Dictionary loop
 57 | # Question: How do you loop through a dictionary and print its keys and values?
 58 | 
 59 | # Comprehension is a shorthand way of writing a loop
 60 | # Question: Multiply every element in list l with 2 and print the result
 61 | 
 62 | # Functions: A block of code that can be re-used as needed. This allows for us to have logic defined in one place, making it easy to maintain and use.
 63 | ## For example, let's create a simple function that takes a list as an input and returns another list whose values are greater than 3
 64 | 
 65 | def gt_three(input_list):
 66 |     return [elt for elt in input_list if elt > 3]
 67 | ## NOTE: we use list comprehension with filtering in the above function
 68 | 
 69 | list_1 = [1, 2, 3, 4, 5, 6]
 70 | # Question: How do you use the gt_three function to filter elements greater than 3 from list_1?
 71 | 
 72 | list_2 = [1, 2, 3, 1, 1, 1]
 73 | # Question: What will be the output of gt_three(list_2)?
 74 | 
 75 | # Classes and Objects
 76 | # Think of a class as a blueprint and objects as things created based on that blueprint
 77 | # You can define classes in Python as shown below
 78 | class DataExtractor:
 79 | 
 80 |     def __init__(self, some_value):
 81 |         self.some_value = some_value
 82 | 
 83 |     def get_connection(self):
 84 |         # Some logic
 85 |         # some_value is accessible using self.some_value
 86 |         pass
 87 | 
 88 |     def close_connection(self):
 89 |         # Some logic
 90 |         # some_value is accessible using self.some_value
 91 |         pass
 92 | 
 93 | # Question: How do you create a DataExtractor object and print its some_value attribute?
 94 | 
 95 | # Libraries are code that can be reused.
 96 | 
 97 | # Python comes with some standard libraries to do common operations, 
 98 | # such as the datetime library to work with time (although there are better libraries)
 99 | from datetime import datetime  # You can import library or your code from another file with the import statement
100 | 
101 | # Question: How do you print the current date in the format 'YYYY MM DD'? Hint: Google strftime
102 | 
103 | # Exception handling: When an error occurs, we need our code to gracefully handle it without just stopping. 
104 | # Here is how we can handle errors when the program is running
105 | try:
106 |     # Code that might raise an exception
107 |     pass
108 | except Exception as e: 
109 |     # Code that runs if the exception occurs
110 |     pass
111 | else:
112 |     # Code that runs if no exception occurs
113 |     pass
114 | finally:
115 |     # Code that always runs, regardless of exceptions
116 |     pass
117 | 
118 | # For example, let's consider exception handling on accessing an element that is not present in a list l
119 | l = [1, 2, 3, 4, 5]
120 | 
121 | # Question: How do you handle an IndexError when accessing an invalid index in a list?
122 | # NOTE: in the except block its preferred to specify the exact erro/exception that you want to handle
123 | 


--------------------------------------------------------------------------------
/data/sample_data.csv:
--------------------------------------------------------------------------------
  1 | Customer_ID,Customer_Name,Age,Gender,Purchase_Amount,Purchase_Date
  2 | 1,Henry Jones,32,Male,1080000.66,2023-08-15
  3 | 2,Emma Rodriguez,24,Male,62.4,2024-04-16
  4 | 3,Frank Martinez,20,Female,443.47,2024-05-16
  5 | 4,Alice Rodriguez,62,Female,729.69,2024-01-05
  6 | 5,Frank Miller,33,Female,651.2,2024-05-23
  7 | 6,Emma Garcia,22,Female,477.56,2023-09-02
  8 | 7,Grace Jones,53,Female,29.72,2023-10-01
  9 | 8,Henry Smith,41,Male,291.28,2024-02-01
 10 | 9,Emma Jones,50,Female,575.69,2023-12-11
 11 | 10,Ivy Martinez,44,Female,30.96,2024-05-11
 12 | 11,Grace Miller,29,Male,321.42,2023-10-25
 13 | 12,Ivy Martinez,68,Male,470.45,2023-06-05
 14 | 13,Charlie Jones,24,Female,330.87,2023-11-11
 15 | 14,Frank Rodriguez,51,Female,483.61,2024-05-06
 16 | 15,Alice Jones,53,Male,647.13,2023-12-07
 17 | 16,Grace Miller,36,Female,691.85,2023-09-25
 18 | 17,Charlie Miller,27,Male,840.67,2023-07-09
 19 | 18,Jack Martinez,64,Female,831.8,2023-11-07
 20 | 19,Emma Martinez,64,Female,399.76,2024-02-04
 21 | 20,Grace Miller,56,Male,917.93,2023-12-24
 22 | 21,Grace Rodriguez,21,Female,389.69,2023-07-19
 23 | 22,Ivy Brown,34,Male,697.0,2023-07-15
 24 | 23,Jack Miller,55,Male,838.01,2023-10-22
 25 | 24,Ivy Martinez,42,Male,620.28,2024-01-19
 26 | 25,David Johnson,41,Female,964.04,2023-12-30
 27 | 26,David Smith,36,Female,231.88,2024-05-27
 28 | 27,Ivy Martinez,22,Female,115.68,2023-08-24
 29 | 28,Henry Martinez,62,Male,735.66,2023-12-07
 30 | 29,Charlie Garcia,42,Female,696.19,2023-07-26
 31 | 30,Grace Davis,30,Female,566.13,2023-07-18
 32 | 31,Grace Smith,19,Male,567.3,2024-04-13
 33 | 32,Emma Jones,58,Male,299.82,2024-03-21
 34 | 33,Grace Rodriguez,43,Male,366.42,2023-07-30
 35 | 34,Alice Garcia,54,Male,874.08,2024-04-17
 36 | 35,Grace Williams,58,Male,139.01,2024-04-23
 37 | 36,Frank Johnson,54,Female,637.05,2023-08-06
 38 | 37,Ivy Miller,37,Male,787.06,2024-05-04
 39 | 38,Charlie Rodriguez,52,Female,853.47,2023-11-26
 40 | 39,Grace Garcia,23,Male,901.82,2023-12-09
 41 | 40,Emma Martinez,61,Male,180.92,2023-10-02
 42 | 41,Alice Jones,23,Male,373.03,2024-03-16
 43 | 42,Charlie Smith,66,Male,392.27,2023-11-03
 44 | 43,Bob Williams,35,Female,319.58,2024-03-29
 45 | 44,Frank Garcia,24,Male,917.63,2023-07-26
 46 | 45,Frank Martinez,38,Male,313.45,2024-01-14
 47 | 46,Ivy Brown,47,Female,349.88,2023-07-05
 48 | 47,Grace Rodriguez,23,Female,933.63,2024-03-18
 49 | 48,Alice Smith,23,Female,762.86,2024-03-04
 50 | 49,Alice Brown,29,Female,920.49,2024-04-30
 51 | 50,Bob Smith,38,Female,104.47,2023-12-31
 52 | 51,Frank Brown,46,Male,621.47,2023-12-30
 53 | 52,Frank Johnson,54,Female,281.35,2023-07-02
 54 | 53,Alice Miller,31,Female,396.19,2023-09-17
 55 | 54,Emma Martinez,39,Female,778.26,2024-05-08
 56 | 55,Grace Brown,50,Male,388.92,2023-12-20
 57 | 56,Henry Miller,56,Male,85.85,2024-02-26
 58 | 57,Charlie Martinez,40,Female,268.41,2023-08-29
 59 | 58,Alice Jones,50,Male,397.89,2023-10-04
 60 | 59,Grace Martinez,37,Male,588.02,2024-03-13
 61 | 60,David Davis,42,Male,698.82,2023-11-21
 62 | 61,Bob Garcia,18,Male,194.19,2023-12-23
 63 | 62,Alice Davis,50,Male,376.8,2024-03-20
 64 | 63,David Rodriguez,48,Male,447.09,2023-09-05
 65 | 64,Henry Brown,35,Male,823.0,2023-10-08
 66 | 65,Bob Garcia,32,Female,726.87,2024-01-06
 67 | 66,Alice Davis,68,Male,377.11,2024-02-22
 68 | 67,Frank Johnson,60,Female,335.99,2023-10-03
 69 | 68,Frank Miller,34,Female,873.36,2024-01-27
 70 | 69,Bob Martinez,50,Female,574.54,2024-05-09
 71 | 70,Bob Williams,48,Female,711.75,2024-04-11
 72 | 71,Henry Williams,41,Female,323.66,2023-08-20
 73 | 72,Frank Miller,66,Female,932.36,2024-02-19
 74 | 73,Jack Rodriguez,70,Male,114.88,2023-06-21
 75 | 74,Emma Johnson,21,Female,529.71,2023-08-11
 76 | 75,Ivy Williams,24,Male,779.41,2023-12-04
 77 | 76,David Rodriguez,23,Male,212.29,2024-03-16
 78 | 77,Frank Rodriguez,69,Female,586.67,2024-04-25
 79 | 78,Charlie Miller,24,Male,510.7,2024-02-04
 80 | 79,Bob Brown,27,Female,761.49,2024-05-20
 81 | 80,Charlie Jones,38,Female,282.35,2024-04-14
 82 | 81,Bob Rodriguez,54,Female,439.0,2024-03-09
 83 | 82,Jack Miller,64,Female,293.95,2023-10-25
 84 | 83,Alice Martinez,41,Female,185.83,2024-02-20
 85 | 84,David Jones,25,Male,578.27,2023-07-14
 86 | 85,Emma Jones,69,Female,480.21,2024-05-17
 87 | 86,Bob Garcia,34,Male,906.17,2024-01-27
 88 | 87,Henry Johnson,27,Female,586.82,2023-08-02
 89 | 88,Jack Rodriguez,30,Male,840.43,2023-07-27
 90 | 89,Alice Smith,19,Female,602.63,2024-04-01
 91 | 90,Ivy Davis,55,Female,405.7,2024-04-10
 92 | 91,Alice Jones,64,Female,960.03,2023-12-26
 93 | 92,Ivy Brown,19,Female,676.08,2023-12-18
 94 | 93,Bob Rodriguez,26,Male,891.46,2024-01-22
 95 | 94,Ivy Brown,56,Male,489.22,2024-05-03
 96 | 95,Jack Brown,51,Female,649.88,2024-05-13
 97 | 96,Grace Miller,42,Male,461.16,2023-12-03
 98 | 97,Ivy Brown,21,Male,465.45,2023-09-06
 99 | 98,Grace Davis,42,Male,796.25,2023-07-07
100 | 84,David Jones,25,Male,578.27,2023-07-14
101 | 85,Emma Jones,69,Female,480.21,2024-05-17
102 | 86,Bob Garcia,34,Male,906.17,2024-01-27
103 | 87,Henry Johnson,27,Female,586.82,2023-08-02
104 | 88,Jack Rodriguez,30,Male,840.43,2023-07-27
105 | 89,Alice Smith,19,Female,602.63,2024-04-01
106 | 90,Ivy Davis,55,Female,405.7,2024-04-10
107 | 91,Alice Jones,64,Female,960.03,2023-12-26
108 | 99,Alice Johnson,,Female,781.83,2023-06-04
109 | 100,Jack Garcia,,Female,269.64,2024-03-08
110 | 


--------------------------------------------------------------------------------
/setup_db.py:
--------------------------------------------------------------------------------
  1 | import csv
  2 | import os
  3 | import sqlite3
  4 | 
  5 | 
  6 | def del_existing_db(db_path):
  7 |     # Delete file if it exists
  8 |     if os.path.exists(db_path):
  9 |         os.remove(db_path)
 10 |         print(f"{db_path} has been deleted.")
 11 |     else:
 12 |         print(f"{db_path} does not exist.")
 13 | 
 14 | 
 15 | del_existing_db("tpch.db")
 16 | del_existing_db("duckdb.db")
 17 | 
 18 | # Connect to SQLite database (or create it if it doesn't exist)
 19 | conn = sqlite3.connect("tpch.db")
 20 | cursor = conn.cursor()
 21 | 
 22 | # Create Customer table
 23 | cursor.execute("DROP TABLE IF EXISTS Customer")
 24 | cursor.execute(
 25 |     """
 26 | CREATE TABLE IF NOT EXISTS Customer (
 27 |     customer_id INTEGER PRIMARY KEY,
 28 |     zipcode TEXT,
 29 |     city TEXT,
 30 |     state_code TEXT,
 31 |     datetime_created TEXT,
 32 |     datetime_updated TEXT
 33 | )
 34 | """
 35 | )
 36 | 
 37 | 
 38 | # Function to read CSV and insert data into the table
 39 | def insert_data_from_csv(csv_file):
 40 |     with open(csv_file, "r") as file:
 41 |         reader = csv.DictReader(file)
 42 |         for row in reader:
 43 |             cursor.execute(
 44 |                 """
 45 |                 INSERT INTO Customer (customer_id, zipcode, city, state_code, datetime_created, datetime_updated)
 46 |                 VALUES (?, ?, ?, ?, ?, ?)
 47 |             """,
 48 |                 (
 49 |                     row["customer_id"],
 50 |                     row["zipcode"],
 51 |                     row["city"],
 52 |                     row["state_code"],
 53 |                     row["datetime_created"],
 54 |                     row["datetime_updated"],
 55 |                 ),
 56 |             )
 57 |     conn.commit()
 58 | 
 59 | 
 60 | # Insert data from CSV file
 61 | insert_data_from_csv("./data/customers.csv")
 62 | 
 63 | # Close the database connection
 64 | conn.close()
 65 | 
 66 | print("Data inserted successfully!")
 67 | 
 68 | 
 69 | import duckdb
 70 | 
 71 | # Connect to the DuckDB database (or create it if it doesn't exist)
 72 | duckdb_conn = duckdb.connect("duckdb.db")
 73 | 
 74 | # Create the Customer table in DuckDB
 75 | duckdb_conn.execute("DROP TABLE IF EXISTS Customer")
 76 | duckdb_conn.execute(
 77 |     """
 78 | CREATE TABLE IF NOT EXISTS Customer (
 79 |     customer_id INTEGER,
 80 |     zipcode TEXT,
 81 |     city TEXT,
 82 |     state_code TEXT,
 83 |     datetime_created TIMESTAMP,
 84 |     datetime_updated TIMESTAMP
 85 | )
 86 | """
 87 | )
 88 | 
 89 | duckdb_conn.execute("DROP TABLE IF EXISTS WeatherData")
 90 | duckdb_conn.execute(
 91 |     """
 92 | CREATE TABLE IF NOT EXISTS WeatherData (
 93 |     id TEXT,
 94 |     date TEXT,
 95 |     element TEXT,
 96 |     value INTEGER,
 97 |     m_flag TEXT,
 98 |     q_flag TEXT,
 99 |     s_flag TEXT,
100 |     obs_time TEXT
101 | )
102 | """
103 | )
104 | 
105 | duckdb_conn.execute("DROP TABLE IF EXISTS Exchanges")
106 | duckdb_conn.execute(
107 |     """
108 | CREATE TABLE IF NOT EXISTS Exchanges (
109 |     id TEXT,
110 |     name TEXT,
111 |     rank INTEGER,
112 |     percentTotalVolume FLOAT,
113 |     volumeUsd FLOAT,
114 |     tradingPairs TEXT,
115 |     socket BOOLEAN,
116 |     exchangeUrl TEXT,
117 |     updated BIGINT 
118 | )
119 | """
120 | )
121 | 
122 | # Commit and close the connection
123 | duckdb_conn.commit()
124 | duckdb_conn.close()
125 | 
126 | print("Customer table created successfully!")
127 | 
128 | import csv
129 | import datetime
130 | import random
131 | 
132 | 
133 | # Generate random customer names
134 | def generate_name():
135 |     first_names = [
136 |         "Alice",
137 |         "Bob",
138 |         "Charlie",
139 |         "David",
140 |         "Emma",
141 |         "Frank",
142 |         "Grace",
143 |         "Henry",
144 |         "Ivy",
145 |         "Jack",
146 |     ]
147 |     last_names = [
148 |         "Smith",
149 |         "Johnson",
150 |         "Williams",
151 |         "Brown",
152 |         "Jones",
153 |         "Garcia",
154 |         "Miller",
155 |         "Davis",
156 |         "Rodriguez",
157 |         "Martinez",
158 |     ]
159 |     return random.choice(first_names) + " " + random.choice(last_names)
160 | 
161 | 
162 | # Generate random age between 18 and 70
163 | def generate_age():
164 |     return random.randint(18, 70)
165 | 
166 | 
167 | # Generate random gender
168 | def generate_gender():
169 |     return random.choice(["Male", "Female"])
170 | 
171 | 
172 | # Generate random purchase amount between 10 and 1000
173 | def generate_purchase_amount():
174 |     return round(random.uniform(10, 1000), 2)
175 | 
176 | 
177 | # Generate random purchase date within the last year
178 | def generate_purchase_date():
179 |     end_date = datetime.date.today()
180 |     start_date = end_date - datetime.timedelta(days=365)
181 |     random_date = start_date + datetime.timedelta(days=random.randint(0, 365))
182 |     return random_date.strftime("%Y-%m-%d")
183 | 
184 | 
185 | # Generate CSV data
186 | with open("./data/sample_data.csv", "w", newline="") as csvfile:
187 |     fieldnames = [
188 |         "Customer_ID",
189 |         "Customer_Name",
190 |         "Age",
191 |         "Gender",
192 |         "Purchase_Amount",
193 |         "Purchase_Date",
194 |     ]
195 |     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
196 | 
197 |     writer.writeheader()
198 |     for i in range(100):
199 |         writer.writerow(
200 |             {
201 |                 "Customer_ID": i + 1,
202 |                 "Customer_Name": generate_name(),
203 |                 "Age": generate_age(),
204 |                 "Gender": generate_gender(),
205 |                 "Purchase_Amount": generate_purchase_amount(),
206 |                 "Purchase_Date": generate_purchase_date(),
207 |             }
208 |         )
209 | 
210 | print("CSV file generated successfully!")
211 | 


--------------------------------------------------------------------------------
/2-data-extract-load-solutions.py:
--------------------------------------------------------------------------------
  1 | # Extract: Process to pull data from Source system
  2 | # Load: Process to write data to a destination system
  3 | 
  4 | # Common upstream & downstream systems
  5 | # OLTP Databases: Postgres, MySQL, sqlite3, etc
  6 | # OLAP Databases: Snowflake, BigQuery, Clickhouse, DuckDB, etc
  7 | # Cloud data storage: AWS S3, GCP Cloud Store, Minio, etc
  8 | # Queue systems: Kafka, Redpanda, etc
  9 | # API
 10 | # Local disk: csv, excel, json, xml files
 11 | # SFTP\FTP server
 12 | 
 13 | # Databases: When reading or writing to a database we use a database driver. Database drivers are libraries that we can use to read or write to a database.
 14 | # Question: How do you read data from a sqlite3 database and write to a DuckDB database?
 15 | import sqlite3  # we import the sqlite3 database driver
 16 | 
 17 | # Connect to the SQLite database
 18 | sqlite_conn = sqlite3.connect(
 19 |     "tpch.db"
 20 | )  # Typically this will involve a connection string, sqlite3 db is stored as a file
 21 | 
 22 | # Fetch data from the SQLite Customer table using conn.execute
 23 | customers = sqlite_conn.execute(
 24 |     "SELECT * FROM Customer"
 25 | ).fetchall()  # Fetch data from the SQLite Customer table
 26 | 
 27 | import duckdb  # duckdb database driver
 28 | 
 29 | duckdb_conn = duckdb.connect("duckdb.db")  # Duckdb connection string
 30 | # Insert data into the DuckDB Customer table
 31 | insert_query = f"""
 32 | INSERT INTO Customer (customer_id, zipcode, city, state_code, datetime_created, datetime_updated)
 33 | VALUES (?, ?, ?, ?, ?, ?)
 34 | """  # Insert into query
 35 | 
 36 | duckdb_conn.executemany(insert_query, customers)
 37 | 
 38 | # Commit and close the connections
 39 | # Commit tells the DB connection to send the data to the database and commit it, if you don't commit the data will not be inserted
 40 | duckdb_conn.commit()
 41 | 
 42 | # We should close the connection, as DB connections are expensive
 43 | sqlite_conn.close()
 44 | duckdb_conn.close()
 45 | 
 46 | # Cloud storage
 47 | # Question: How do you read data from the S3 location given below and write the data to a DuckDB database?
 48 | # Data source: https://docs.opendata.aws/noaa-ghcn-pds/readme.html station data at path "csv.gz/by_station/ASN00002022.csv.gz"
 49 | # Hint: Use boto3 client with UNSIGNED config to access the S3 bucket
 50 | # Hint: The data will be zipped you have to unzip it
 51 | 
 52 | import csv
 53 | import gzip
 54 | from io import StringIO
 55 | 
 56 | import boto3
 57 | import duckdb
 58 | from botocore import UNSIGNED
 59 | from botocore.client import Config
 60 | 
 61 | # AWS S3 bucket and file details
 62 | bucket_name = "noaa-ghcn-pds"
 63 | file_key = "csv.gz/by_station/ASN00002022.csv.gz"
 64 | # Create a boto3 client with anonymous access
 65 | s3_client = boto3.client("s3", config=Config(signature_version=UNSIGNED))
 66 | 
 67 | # Download the CSV file from S3
 68 | response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
 69 | compressed_data = response["Body"].read()
 70 | 
 71 | # Decompress the gzip data
 72 | csv_data = gzip.decompress(compressed_data).decode("utf-8")
 73 | 
 74 | # Read the CSV file using csv.reader
 75 | csv_reader = csv.reader(StringIO(csv_data))
 76 | data = list(csv_reader)
 77 | # Connect to the DuckDB database (assume WeatherData table exists)
 78 | duckdb_conn = duckdb.connect("duckdb.db")
 79 | 
 80 | # Insert data into the DuckDB WeatherData table
 81 | insert_query = """
 82 | INSERT INTO WeatherData (id, date, element, value, m_flag, q_flag, s_flag, obs_time)
 83 | VALUES (?, ?, ?, ?, ?, ?, ?, ?)
 84 | """
 85 | 
 86 | duckdb_conn.executemany(insert_query, data[:100000])
 87 | 
 88 | # Commit and close the connection
 89 | duckdb_conn.commit()
 90 | duckdb_conn.close()
 91 | 
 92 | # API
 93 | # Question: How do you read data from the CoinCap API given below and write the data to a DuckDB database?
 94 | # URL: "https://api.coincap.io/v2/exchanges"
 95 | # Hint: use requests library
 96 | 
 97 | import duckdb
 98 | import requests
 99 | 
100 | # Define the API endpoint
101 | url = "https://api.coincap.io/v2/exchanges"
102 | 
103 | # Fetch data from the CoinCap API
104 | response = requests.get(url)
105 | data = response.json()["data"]
106 | 
107 | # Connect to the DuckDB database
108 | duckdb_conn = duckdb.connect("duckdb.db")
109 | 
110 | # Insert data into the DuckDB Exchanges table
111 | insert_query = """
112 | INSERT INTO Exchanges (id, name, rank, percentTotalVolume, volumeUsd, tradingPairs, socket, exchangeUrl, updated)
113 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
114 | """
115 | # Prepare data for insertion
116 | # Hint: Why are we changing the data type?
117 | insert_data = [
118 |     (
119 |         exchange["exchangeId"],
120 |         exchange["name"],
121 |         int(exchange["rank"]),
122 |         (
123 |             float(exchange["percentTotalVolume"])
124 |             if exchange["percentTotalVolume"]
125 |             else None
126 |         ),
127 |         float(exchange["volumeUsd"]) if exchange["volumeUsd"] else None,
128 |         exchange["tradingPairs"],
129 |         exchange["socket"],
130 |         exchange["exchangeUrl"],
131 |         int(exchange["updated"]),
132 |     )
133 |     for exchange in data
134 | ]
135 | 
136 | duckdb_conn.executemany(insert_query, insert_data)
137 | 
138 | # Commit and close the connection
139 | duckdb_conn.commit()
140 | duckdb_conn.close()
141 | 
142 | # Local disk
143 | # Question: How do you read a CSV file from local disk and write it to a database?
144 | # Look up open function with csvreader for python
145 | 
146 | import csv
147 | 
148 | data_location = "./data/customers.csv"
149 | with open(data_location, "r", newline="") as csvfile:
150 |     csvreader = csv.reader(csvfile)
151 |     next(csvreader)  # Skip header row
152 |     for row in csvreader:
153 |         print(row)
154 | 
155 | # Web scraping
156 | # Questions: Use beatiful soup to scrape the below website and print all the links in that website
157 | # URL of the website to scrape
158 | 
159 | import requests
160 | from bs4 import BeautifulSoup
161 | 
162 | # URL of the website to scrape
163 | url = 'https://example.com'
164 | 
165 | # Send a GET request to the website
166 | response = requests.get(url)
167 | 
168 | # Parse the HTML content of the webpage
169 | soup = BeautifulSoup(response.text, 'html.parser')
170 | 
171 | # Example: Find and print all the links on the webpage
172 | for link in soup.find_all('a'):
173 |     print(link.get('href'))
174 | 
175 | 


--------------------------------------------------------------------------------
/data/customers.csv:
--------------------------------------------------------------------------------
  1 | customer_id,zipcode,city,state_code,datetime_created,datetime_updated
  2 | 1,14409,franca,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
  3 | 2,09790,sao bernardo do campo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
  4 | 3,01151,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
  5 | 4,08775,mogi das cruzes,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
  6 | 5,13056,campinas,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
  7 | 6,89254,jaragua do sul,SC,2017-10-18 00:00:00,2017-10-18 00:00:00
  8 | 7,04534,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
  9 | 8,35182,timoteo,MG,2017-10-18 00:00:00,2017-10-18 00:00:00
 10 | 9,81560,curitiba,PR,2017-10-18 00:00:00,2017-10-18 00:00:00
 11 | 10,30575,belo horizonte,MG,2017-10-18 00:00:00,2017-10-18 00:00:00
 12 | 11,39400,montes claros,MG,2017-10-18 00:00:00,2017-10-18 00:00:00
 13 | 12,20231,rio de janeiro,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00
 14 | 13,18682,lencois paulista,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 15 | 14,05704,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 16 | 15,95110,caxias do sul,RS,2017-10-18 00:00:00,2017-10-18 00:00:00
 17 | 16,13412,piracicaba,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 18 | 17,22750,rio de janeiro,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00
 19 | 18,07124,guarulhos,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 20 | 19,05416,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 21 | 20,68485,pacaja,PA,2017-10-18 00:00:00,2017-10-18 00:00:00
 22 | 21,88034,florianopolis,SC,2017-10-18 00:00:00,2017-10-18 00:00:00
 23 | 22,74914,aparecida de goiania,GO,2017-10-18 00:00:00,2017-10-18 00:00:00
 24 | 23,05713,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 25 | 24,82820,curitiba,PR,2017-10-18 00:00:00,2017-10-18 00:00:00
 26 | 25,08225,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 27 | 26,09121,santo andre,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 28 | 27,74310,goiania,GO,2017-10-18 00:00:00,2017-10-18 00:00:00
 29 | 28,04571,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 30 | 29,29311,cachoeiro de itapemirim,ES,2017-10-18 00:00:00,2017-10-18 00:00:00
 31 | 30,05528,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 32 | 31,12235,sao jose dos campos,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 33 | 32,18130,sao roque,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 34 | 33,42800,camacari,BA,2017-10-18 00:00:00,2017-10-18 00:00:00
 35 | 34,27525,resende,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00
 36 | 35,81750,curitiba,PR,2017-10-18 00:00:00,2017-10-18 00:00:00
 37 | 36,13175,sumare,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 38 | 37,07170,guarulhos,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 39 | 38,93415,novo hamburgo,RS,2017-10-18 00:00:00,2017-10-18 00:00:00
 40 | 39,65075,sao luis,MA,2017-10-18 00:00:00,2017-10-18 00:00:00
 41 | 40,88104,sao jose,SC,2017-10-18 00:00:00,2017-10-18 00:00:00
 42 | 41,07176,guarulhos,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 43 | 42,35960,santa barbara,MG,2017-10-18 00:00:00,2017-10-18 00:00:00
 44 | 43,05727,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 45 | 44,07053,guarulhos,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 46 | 45,14026,ribeirao preto,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 47 | 46,30320,belo horizonte,MG,2017-10-18 00:00:00,2017-10-18 00:00:00
 48 | 47,38300,ituiutaba,MG,2017-10-18 00:00:00,2017-10-18 00:00:00
 49 | 48,18740,taquarituba,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 50 | 49,83085,sao jose dos pinhais,PR,2017-10-18 00:00:00,2017-10-18 00:00:00
 51 | 50,89254,jaragua do sul,SC,2017-10-18 00:00:00,2017-10-18 00:00:00
 52 | 51,05351,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 53 | 52,39406,montes claros,MG,2017-10-18 00:00:00,2017-10-18 00:00:00
 54 | 53,14860,barrinha,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 55 | 54,21310,rio de janeiro,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00
 56 | 55,23970,parati,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00
 57 | 56,79804,dourados,MS,2017-10-18 00:00:00,2017-10-18 00:00:00
 58 | 57,05017,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 59 | 58,75388,trindade,GO,2017-10-18 00:00:00,2017-10-18 00:00:00
 60 | 59,85808,cascavel,PR,2017-10-18 00:00:00,2017-10-18 00:00:00
 61 | 60,60140,fortaleza,CE,2017-10-18 00:00:00,2017-10-18 00:00:00
 62 | 61,72270,brasilia,DF,2017-10-18 00:00:00,2017-10-18 00:00:00
 63 | 62,02075,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 64 | 63,96015,pelotas,RS,2017-10-18 00:00:00,2017-10-18 00:00:00
 65 | 64,90010,porto alegre,RS,2017-10-18 00:00:00,2017-10-18 00:00:00
 66 | 65,22440,rio de janeiro,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00
 67 | 66,13323,salto,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 68 | 67,30190,belo horizonte,MG,2017-10-18 00:00:00,2017-10-18 00:00:00
 69 | 68,13212,jundiai,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 70 | 69,29307,cachoeiro de itapemirim,ES,2017-10-18 00:00:00,2017-10-18 00:00:00
 71 | 70,12280,cacapava,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 72 | 71,60336,fortaleza,CE,2017-10-18 00:00:00,2017-10-18 00:00:00
 73 | 72,11310,sao vicente,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 74 | 73,38408,uberlandia,MG,2017-10-18 00:00:00,2017-10-18 00:00:00
 75 | 74,37720,botelhos,MG,2017-10-18 00:00:00,2017-10-18 00:00:00
 76 | 75,24431,sao goncalo,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00
 77 | 76,05890,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 78 | 77,03733,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 79 | 78,83709,araucaria,PR,2017-10-18 00:00:00,2017-10-18 00:00:00
 80 | 79,11347,sao vicente,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 81 | 80,26272,nova iguacu,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00
 82 | 81,05415,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 83 | 82,59655,areia branca,RN,2017-10-18 00:00:00,2017-10-18 00:00:00
 84 | 83,04548,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 85 | 84,28010,campos dos goytacazes,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00
 86 | 85,13573,sao carlos,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 87 | 86,02175,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 88 | 87,37500,itajuba,MG,2017-10-18 00:00:00,2017-10-18 00:00:00
 89 | 88,90670,porto alegre,RS,2017-10-18 00:00:00,2017-10-18 00:00:00
 90 | 89,09890,sao bernardo do campo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 91 | 90,13321,salto,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 92 | 91,44380,cruz das almas,BA,2017-10-18 00:00:00,2017-10-18 00:00:00
 93 | 92,27700,vassouras,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00
 94 | 93,44033,feira de santana,BA,2017-10-18 00:00:00,2017-10-18 00:00:00
 95 | 94,04537,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 96 | 95,71540,brasilia,DF,2017-10-18 00:00:00,2017-10-18 00:00:00
 97 | 96,13569,sao carlos,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 98 | 97,05565,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
 99 | 98,03636,sao paulo,SP,2017-10-18 00:00:00,2017-10-18 00:00:00
100 | 99,24120,niteroi,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00
101 | 100,24120,niteroi,RJ,2017-10-18 00:00:00,2017-10-18 00:00:00
102 | 


--------------------------------------------------------------------------------
/1-basics-solutions.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # Variable: A storage location identified by its name, containing some value.
  3 | 
  4 | # Value of 10 is assigned to variable a and 20 to variable b
  5 | a = 10
  6 | b = 20
  7 | 
  8 | # We can do any operation (arithmetic for numbers, string transformation for text) on variables
  9 | 
 10 | # Question: What is the result of a + b?
 11 | c = a + b
 12 | print(c)  # Will print 30
 13 | 
 14 | s = '  Some string '
 15 | # We can perform an operation on this string, for example, let's remove the empty spaces in front of and behind the string
 16 | 
 17 | # Question: How do you remove the empty spaces in front of and behind the string s?
 18 | print(s.strip())
 19 | 
 20 | # Data Structures are ways of representing data, each has its own pros and cons and places that they are the right fit.
 21 | 
 22 | ## List: A collection of elements that can be accessed by knowing the location (aka index) of the element
 23 | l = [1, 2, 3, 4]
 24 | 
 25 | # Question: How do you access the elements in index 0 and 3?
 26 | print(l[0])  # Will print 1
 27 | print(l[3])  # Will print 4
 28 | ## NOTE: lists retain the order of elements in it but dictionary doesn't
 29 | 
 30 | ## Dictionary: A collection of key-value pairs, where each key is mapped to a value using a hash function. Provides fast data retrieval based on keys.
 31 | d = {'a': 1, 'b': 2}
 32 | 
 33 | # Question: How do you access the values associated with keys 'a' and 'b'?
 34 | print(d.get('a'))  # Will print 1
 35 | print(d.get('b'))  # Will print 2
 36 | ## NOTE: The dictionary cannot have duplicate keys
 37 | 
 38 | ## Set: A collection of unique elements that do not allow duplicates
 39 | my_set = set()
 40 | my_set.add(10)
 41 | my_set.add(10)
 42 | my_set.add(10)
 43 | 
 44 | # Question: What will be the output of my_set?
 45 | print(my_set)  # This will only show 10, since the set only keeps unique values
 46 | 
 47 | ## Tuple: A collection of immutable (non-changeable) elements, tuples retain their order once created.
 48 | my_tuple = (1, 'hello', 3.14)
 49 | 
 50 | # Question: What is the value of my_tuple?
 51 | print(my_tuple)  # Output: (1, 'hello', 3.14)
 52 | 
 53 | ## Accessing elements by index
 54 | 
 55 | # Question: How do you access the elements in index 0 and 1 of my_tuple?
 56 | print(my_tuple[0])  # Output: 1
 57 | print(my_tuple[1])  # Output: 'hello'
 58 | 
 59 | ## Counting occurrences of an element
 60 | count_tuple = (1, 2, 3, 1, 1, 2)
 61 | 
 62 | # Question: How many times does the number 1 appear in count_tuple?
 63 | print(count_tuple.count(1))  # Output: 3
 64 | 
 65 | ## Finding the index of an element
 66 | 
 67 | # Question: What is the index of the first occurrence of the number 2 in count_tuple?
 68 | print(count_tuple.index(2))  # Output: 1
 69 | 
 70 | # Loop allows a specific chunk of code to be repeated a certain number of times
 71 | # Example: We can use a loop to print numbers 0 through 10
 72 | for i in range(11):
 73 |     print(i)
 74 | 
 75 | ## We can loop through our data structures as shown below
 76 | # Question: How do you loop through a list and print its elements?
 77 | for elt in l:
 78 |     print(elt)  # Or any operation you may want to do
 79 | ## We can do a similar loop for tuples and sets
 80 | 
 81 | ## Dictionary loop
 82 | # Question: How do you loop through a dictionary and print its keys and values?
 83 | for k, v in d.items():
 84 |     print(f'Key: {k}, Value: {v}')  # Print key and values in dictionary
 85 | 
 86 | ## Comprehension is a shorthand way of writing a loop
 87 | ## For example, we can use the below to multiply every element in list l with 2
 88 | print([elt*2 for elt in l])
 89 | 
 90 | # Functions: A block of code that can be re-used as needed. This allows for us to have logic defined in one place, making it easy to maintain and use.
 91 | ## For example, let's create a simple function that takes a list as an input and returns another list whose values are greater than 3
 92 | 
 93 | def gt_three(input_list):
 94 |     return [elt for elt in input_list if elt > 3]
 95 | ## NOTE: we use list comprehension with filtering in the above function
 96 | 
 97 | list_1 = [1, 2, 3, 4, 5, 6]
 98 | # Question: How do you use the gt_three function to filter elements greater than 3 from list_1?
 99 | print(gt_three(list_1))  # Will print [4, 5, 6]
100 | 
101 | list_2 = [1, 2, 3, 1, 1, 1]
102 | # Question: What will be the output of gt_three(list_2)?
103 | print(gt_three(list_2))  # Will print []
104 | 
105 | # Classes and Objects
106 | # Think of a class as a blueprint and objects as things created based on that blueprint
107 | # You can define classes in Python as shown below
108 | 
109 | class DataExtractor:
110 | 
111 |     def __init__(self, some_value):
112 |         self.some_value = some_value
113 | 
114 |     def get_connection(self):
115 |         # Some logic
116 |         # some_value is accessible using self.some_value
117 |         pass
118 | 
119 |     def close_connection(self):
120 |         # Some logic
121 |         # some_value is accessible using self.some_value
122 |         pass
123 | 
124 | # Question: How do you create a DataExtractor object and print its some_value attribute?
125 | de_object = DataExtractor(10)
126 | print(de_object.some_value)  # Will print 10
127 | 
128 | # Libraries are code that can be reused.
129 | 
130 | # Python comes with some standard libraries to do common operations, 
131 | # such as the datetime library to work with time (although there are better libraries)
132 | from datetime import datetime  # You can import library or your code from another file with the import statement
133 | 
134 | # Question: How do you print the current date in the format 'YYYY MM DD'?
135 | print(datetime.now().strftime('%Y %m %d'))  # We can use multiple such methods 
136 | 
137 | # Exception handling: When an error occurs, we need our code to gracefully handle it without just stopping. 
138 | ## Here is how we can handle errors when the program is running
139 | try:
140 |     # Code that might raise an exception
141 |     pass
142 | except Exception as e: 
143 |     # Code that runs if the exception occurs
144 |     pass
145 | else:
146 |     # Code that runs if no exception occurs
147 |     pass
148 | finally:
149 |     # Code that always runs, regardless of exceptions
150 |     pass
151 | 
152 | ## For example, let's consider exception handling on accessing an element that is not present in a list l
153 | l = [1, 2, 3, 4, 5]
154 | 
155 | # Question: How do you handle an IndexError when accessing an invalid index in a list?
156 | index = 10
157 | try:
158 |     # Attempt to access an element at an invalid index
159 |     element = l[index]
160 |     print(f"Element at index {index} is {element}")
161 | except IndexError:
162 |     print(f"Error: Index {index} is out of range for the list.")
163 | finally:
164 |     print("Execution completed.")
165 | # NOTE: in the except block its preferred to specify the exact erro/exception that you want to handle
166 | 


--------------------------------------------------------------------------------
/3-data-transform-solutions.py:
--------------------------------------------------------------------------------
  1 | print(
  2 |     "################################################################################"
  3 | )
  4 | print("Use standard python libraries to do the transformations")
  5 | print(
  6 |     "################################################################################"
  7 | )
  8 | import csv
  9 | 
 10 | # Question: How do you read data from a CSV file into a list of dictionaries?
 11 | data = []
 12 | with open("./data/sample_data.csv", "r", newline="") as csvfile:
 13 |     reader = csv.DictReader(csvfile)
 14 |     for row in reader:
 15 |         data.append(row)
 16 | 
 17 | # Question: How do you remove duplicate rows based on customer ID?
 18 | data_unique = []
 19 | customer_ids_seen = set()
 20 | for row in data:
 21 |     if row["Customer_ID"] not in customer_ids_seen:
 22 |         data_unique.append(row)
 23 |         customer_ids_seen.add(row["Customer_ID"])
 24 |     else:
 25 |         print(f'duplicate customer id {row["Customer_ID"]}')
 26 | 
 27 | # Question: How do you handle missing values by replacing them with 0?
 28 | for row in data_unique:
 29 |     if not row["Age"]:
 30 |         print(f'Customer {row["Customer_Name"]} does not have Age value')
 31 |         row["Age"] = 0
 32 |     if not row["Purchase_Amount"]:
 33 |         row["Purchase_Amount"] = 0.0
 34 | 
 35 | # Question: How do you remove outliers such as age > 100 or purchase amount > 1000?
 36 | data_cleaned = [
 37 |     row
 38 |     for row in data_unique
 39 |     if int(row["Age"]) <= 100 and float(row["Purchase_Amount"]) <= 1000
 40 | ]
 41 | 
 42 | # Question: How do you convert the Gender column to a binary format (0 for Female, 1 for Male)?
 43 | for row in data_cleaned:
 44 |     if row["Gender"] == "Female":
 45 |         row["Gender"] = 0
 46 |     elif row["Gender"] == "Male":
 47 |         row["Gender"] = 1
 48 | 
 49 | # Question: How do you split the Customer_Name column into separate First_Name and Last_Name columns?
 50 | for row in data_cleaned:
 51 |     first_name, last_name = row["Customer_Name"].split(" ", 1)
 52 |     row["First_Name"] = first_name
 53 |     row["Last_Name"] = last_name
 54 |     del row["Customer_Name"]
 55 | 
 56 | # Question: How do you calculate the total purchase amount by Gender?
 57 | total_purchase_by_gender = {}
 58 | for row in data_cleaned:
 59 |     total_purchase_by_gender[row["Gender"]] += float(row["Purchase_Amount"])
 60 | 
 61 | # Question: How do you calculate the average purchase amount by Age group?
 62 | # assume age_groups is the grouping we want
 63 | # hint: Why do we convert to float?
 64 | age_groups = {"18-30": [], "31-40": [], "41-50": [], "51-60": [], "61-70": []}
 65 | for row in data_cleaned:
 66 |     age = int(row["Age"])
 67 |     if age <= 30:
 68 |         age_groups["18-30"].append(float(row["Purchase_Amount"]))
 69 |     elif age <= 40:
 70 |         age_groups["31-40"].append(float(row["Purchase_Amount"]))
 71 |     elif age <= 50:
 72 |         age_groups["41-50"].append(float(row["Purchase_Amount"]))
 73 |     elif age <= 60:
 74 |         age_groups["51-60"].append(float(row["Purchase_Amount"]))
 75 |     else:
 76 |         age_groups["61-70"].append(float(row["Purchase_Amount"]))
 77 | 
 78 | average_purchase_by_age_group = {
 79 |     group: sum(amounts) / len(amounts) for group, amounts in age_groups.items()
 80 | }
 81 | 
 82 | # Question: How do you print the results for total purchase amount by Gender and average purchase amount by Age group?
 83 | print("Total purchase amount by Gender:", total_purchase_by_gender)
 84 | print("Average purchase amount by Age group:", average_purchase_by_age_group)
 85 | 
 86 | print(
 87 |     "################################################################################"
 88 | )
 89 | print("Use DuckDB to do the transformations")
 90 | print(
 91 |     "################################################################################"
 92 | )
 93 | 
 94 | import duckdb
 95 | 
 96 | # Question: How do you connect to DuckDB and load data from a CSV file into a DuckDB table?
 97 | # Connect to DuckDB and load data
 98 | con = duckdb.connect(database=":memory:", read_only=False)
 99 | con.execute(
100 |     "CREATE TABLE data (Customer_ID INTEGER, Customer_Name VARCHAR, Age INTEGER, Gender VARCHAR, Purchase_Amount FLOAT, Purchase_Date DATE)"
101 | )
102 | 
103 | # Read data from CSV file into DuckDB table
104 | con.execute("COPY data FROM './data/sample_data.csv' WITH HEADER CSV")
105 | 
106 | # Question: How do you remove duplicate rows based on customer ID in DuckDB?
107 | con.execute("CREATE TABLE data_unique AS SELECT DISTINCT * FROM data")
108 | 
109 | # Question: How do you handle missing values by replacing them with 0 in DuckDB?
110 | con.execute(
111 |     "CREATE TABLE data_cleaned_missing AS SELECT \
112 |              Customer_ID, Customer_Name, \
113 |              COALESCE(Age, 0) AS Age, \
114 |              Gender, \
115 |              COALESCE(Purchase_Amount, 0.0) AS Purchase_Amount, \
116 |              Purchase_Date \
117 |              FROM data_unique"
118 | )
119 | 
120 | # Question: How do you remove outliers (e.g., age > 100 or purchase amount > 1000) in DuckDB?
121 | con.execute(
122 |     "CREATE TABLE data_cleaned_outliers AS SELECT * FROM data_cleaned_missing \
123 |              WHERE Age <= 100 AND Purchase_Amount <= 1000"
124 | )
125 | 
126 | # Question: How do you convert the Gender column to a binary format (0 for Female, 1 for Male) in DuckDB?
127 | con.execute(
128 |     "CREATE TABLE data_cleaned_gender AS SELECT *, \
129 |              CASE WHEN Gender = 'Female' THEN 0 ELSE 1 END AS Gender_Binary \
130 |              FROM data_cleaned_outliers"
131 | )
132 | 
133 | # Question: How do you split the Customer_Name column into separate First_Name and Last_Name columns in DuckDB?
134 | con.execute(
135 |     "CREATE TABLE data_cleaned AS SELECT \
136 |              Customer_ID, \
137 |              SPLIT_PART(Customer_Name, ' ', 1) AS First_Name, \
138 |              SPLIT_PART(Customer_Name, ' ', 2) AS Last_Name, \
139 |              Age, Gender_Binary, Purchase_Amount, Purchase_Date \
140 |              FROM data_cleaned_gender"
141 | )
142 | 
143 | # Question: How do you calculate the total purchase amount by Gender in DuckDB?
144 | total_purchase_by_gender = con.execute(
145 |     "SELECT Gender_Binary, SUM(Purchase_Amount) AS Total_Purchase_Amount \
146 |                                         FROM data_cleaned_gender \
147 |                                         GROUP BY Gender_Binary"
148 | ).fetchall()
149 | 
150 | # Question: How do you calculate the average purchase amount by Age group in DuckDB?
151 | average_purchase_by_age_group = con.execute(
152 |     "SELECT CASE \
153 |                                              WHEN Age BETWEEN 18 AND 30 THEN '18-30' \
154 |                                              WHEN Age BETWEEN 31 AND 40 THEN '31-40' \
155 |                                              WHEN Age BETWEEN 41 AND 50 THEN '41-50' \
156 |                                              WHEN Age BETWEEN 51 AND 60 THEN '51-60' \
157 |                                              ELSE '61-70' END AS Age_Group, \
158 |                                              AVG(Purchase_Amount) AS Average_Purchase_Amount \
159 |                                              FROM data_cleaned \
160 |                                              GROUP BY Age_Group"
161 | ).fetchall()
162 | 
163 | # Question: How do you print the results for total purchase amount by Gender and average purchase amount by Age group in DuckDB?
164 | print("====================== Results ======================")
165 | print("Total purchase amount by Gender:")
166 | print(total_purchase_by_gender)
167 | print("Average purchase amount by Age group:")
168 | print(average_purchase_by_age_group)
169 | 


--------------------------------------------------------------------------------