├── dataset_link.txt ├── Questions.txt └── csv_to_sql.py /dataset_link.txt: -------------------------------------------------------------------------------- 1 | https://drive.google.com/drive/folders/12dvYxsUZ7XUwlrXn6NTz6wwt7-0mdHsV?usp=sharing 2 | -------------------------------------------------------------------------------- /Questions.txt: -------------------------------------------------------------------------------- 1 | Basic Queries 2 | 1. List all unique cities where customers are located. 3 | 2. Count the number of orders placed in 2017. 4 | 3. Find the total sales per category. 5 | 4. Calculate the percentage of orders that were paid in installments. 6 | 5. Count the number of customers from each state. 7 | 8 | Intermediate Queries 9 | 1. Calculate the number of orders per month in 2018. 10 | 2. Find the average number of products per order, grouped by customer city. 11 | 3. Calculate the percentage of total revenue contributed by each product category. 12 | 4. Identify the correlation between product price and the number of times a product has been purchased. 13 | 5. Calculate the total revenue generated by each seller, and rank them by revenue. 14 | 15 | Advanced Queries 16 | 1. Calculate the moving average of order values for each customer over their order history. 17 | 2. Calculate the cumulative sales per month for each year. 18 | 3. Calculate the year-over-year growth rate of total sales. 19 | 4. Calculate the retention rate of customers, defined as the percentage of customers who make another purchase within 6 months of their first purchase. 20 | 5. Identify the top 3 customers who spent the most money in each year. 
import pandas as pd
import mysql.connector
import os

# Each CSV file in `folder_path` and the MySQL table it should be loaded into.
csv_files = [
    ('customers.csv', 'customers'),
    ('orders.csv', 'orders'),
    ('sales.csv', 'sales'),
    ('products.csv', 'products'),
    ('delivery.csv', 'delivery'),
    ('payments.csv', 'payments')  # Added payments.csv for specific handling
]

# Connect to the MySQL database (replace the placeholders with real credentials).
conn = mysql.connector.connect(
    host='your_host',
    user='your_username',
    password='your_password',
    database='your_database'
)
cursor = conn.cursor()

# Folder containing the CSV files
folder_path = 'path_to_your_folder'


def get_sql_type(dtype):
    """Map a pandas dtype to the closest MySQL column type (TEXT as fallback)."""
    if pd.api.types.is_integer_dtype(dtype):
        return 'INT'
    if pd.api.types.is_float_dtype(dtype):
        return 'FLOAT'
    if pd.api.types.is_bool_dtype(dtype):
        return 'BOOLEAN'
    if pd.api.types.is_datetime64_any_dtype(dtype):
        return 'DATETIME'
    return 'TEXT'


try:
    for csv_file, table_name in csv_files:
        file_path = os.path.join(folder_path, csv_file)

        # Read the CSV file into a pandas DataFrame
        df = pd.read_csv(file_path)

        # Debugging: report missing values while they are still NaN.
        # (Previously this ran AFTER the NaN replacement, so the message
        # "before replacement" was never true.)
        print(f"Processing {csv_file}")
        print(f"NaN values before replacement:\n{df.isnull().sum()}\n")

        # Sanitize column names so they are valid SQL identifiers.
        df.columns = [col.replace(' ', '_').replace('-', '_').replace('.', '_')
                      for col in df.columns]

        # Capture column types from the ORIGINAL dtypes, before the object
        # cast below — otherwise every column would look like TEXT.
        column_defs = ', '.join(
            f'`{col}` {get_sql_type(df[col].dtype)}' for col in df.columns
        )
        cursor.execute(
            f'CREATE TABLE IF NOT EXISTS `{table_name}` ({column_defs})'
        )

        # Replace NaN with None so the connector sends SQL NULL.  The
        # astype(object) cast is required: on numeric columns a plain
        # .where(..., None) silently keeps NaN instead of None.
        df = df.astype(object).where(pd.notnull(df), None)

        # Bulk-insert all rows in one executemany call instead of one
        # network round trip per row.
        col_list = ', '.join(f'`{col}`' for col in df.columns)
        placeholders = ', '.join(['%s'] * len(df.columns))
        insert_sql = (
            f"INSERT INTO `{table_name}` ({col_list}) VALUES ({placeholders})"
        )
        rows = [
            tuple(None if pd.isna(x) else x for x in row)
            for row in df.itertuples(index=False, name=None)
        ]
        if rows:  # executemany with an empty sequence is pointless
            cursor.executemany(insert_sql, rows)

        # Commit per file so a later failure doesn't roll back earlier loads.
        conn.commit()
finally:
    # Always release the cursor and connection, even if a file fails to load.
    cursor.close()
    conn.close()