#!/usr/bin/env python
# coding: utf-8
#
# Repository layout:
#   ├── orders data analysis.py
#   └── sql_code.sql
#
# /orders data analysis.py:
# --------------------------------------------------------------------------------
"""Fetch the Kaggle retail-orders data, clean it and connect to SQL Server.

Run as a script, this module downloads ``orders.csv.zip``, extracts
``orders.csv``, normalises the column names, derives the pricing columns and
opens a SQL Server connection.  ``df`` and ``conn`` are left as module-level
names so that the load step following this section can use them.
"""

import subprocess
import zipfile

import pandas as pd

KAGGLE_DATASET = "ankitbansal06/retail-orders"
CSV_NAME = "orders.csv"
ARCHIVE_NAME = "orders.csv.zip"
# Raw string: the backslash separates host and instance name and must not be
# parsed as the (invalid) escape sequence "\S".
SQL_SERVER_URL = (
    r"mssql://ANKIT\SQLEXPRESS/master?driver=ODBC+DRIVER+17+FOR+SQL+SERVER"
)


def download_orders_archive(dataset: str = KAGGLE_DATASET,
                            filename: str = CSV_NAME) -> None:
    """Download *filename* from the Kaggle *dataset* into the working directory.

    The notebook this script was exported from used the IPython shell escape
    ``!kaggle datasets download ...``, which is a syntax error in a plain
    ``.py`` file; the same command is run here through ``subprocess``.

    Raises:
        subprocess.CalledProcessError: the ``kaggle`` CLI exited non-zero.
        FileNotFoundError: the ``kaggle`` executable is not on ``PATH``.
    """
    # Kept from the original script (install with: pip install kaggle).
    # NOTE(review): presumably imported so that a missing package or missing
    # credentials fail before shelling out -- confirm whether it is needed.
    import kaggle  # noqa: F401

    subprocess.run(
        ["kaggle", "datasets", "download", dataset, "-f", filename],
        check=True,
    )


def extract_archive(archive_path: str = ARCHIVE_NAME, target_dir=None) -> None:
    """Extract every member of *archive_path* into *target_dir*.

    ``target_dir=None`` extracts into the current working directory.  The
    archive is opened in a ``with`` block so it is closed even if extraction
    fails.
    """
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(target_dir)


def read_orders(csv_path: str = CSV_NAME) -> pd.DataFrame:
    """Read the orders CSV, treating 'Not Available' and 'unknown' as nulls."""
    return pd.read_csv(csv_path, na_values=["Not Available", "unknown"])


def transform_orders(raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *raw* ready to be loaded into SQL Server.

    Steps, in order:
      1. lower-case the column names and replace spaces with underscores;
      2. derive ``discount``, ``sale_price`` and ``profit``;
      3. convert ``order_date`` from text (``YYYY-MM-DD``) to datetime;
      4. drop ``list_price``, ``cost_price`` and ``discount_percent``.

    Steps 1 and 2 were commented out in the original script although every
    later statement depends on the columns they produce, so a top-to-bottom
    run failed with ``KeyError``.

    The input frame is not modified.

    Raises:
        KeyError: a required column is missing after renaming.
    """
    orders = raw.copy()

    # e.g. 'Order Id' -> 'order_id', 'cost price' -> 'cost_price'
    orders.columns = orders.columns.str.lower().str.replace(
        " ", "_", regex=False
    )

    # discount_percent is a percentage, hence the 0.01 factor.
    orders["discount"] = (
        orders["list_price"] * orders["discount_percent"] * 0.01
    )
    orders["sale_price"] = orders["list_price"] - orders["discount"]
    orders["profit"] = orders["sale_price"] - orders["cost_price"]

    orders["order_date"] = pd.to_datetime(
        orders["order_date"], format="%Y-%m-%d"
    )

    return orders.drop(columns=["list_price", "cost_price", "discount_percent"])


if __name__ == "__main__":
    import sqlalchemy as sal

    download_orders_archive()
    extract_archive()
    df = transform_orders(read_orders())

    # connect to sql server; the load itself happens in the next step
    engine = sal.create_engine(SQL_SERVER_URL)
    conn = engine.connect()
#load the data into sql server using append option
df.to_sql('df_orders', con=conn, index=False, if_exists='append')


--------------------------------------------------------------------------------
/sql_code.sql:
--------------------------------------------------------------------------------
-- Analysis queries over df_orders (T-SQL).
-- Every statement ends with ';' so the file can be run as a single batch:
-- a statement that precedes a "with <cte> as (...)" must be terminated.

-- find top 10 highest revenue generating products
select top 10
       product_id,
       sum(sale_price) as sales
from df_orders
group by product_id
order by sales desc;


-- find top 5 highest selling products in each region
with region_product_sales as (
    select region,
           product_id,
           sum(sale_price) as sales
    from df_orders
    group by region, product_id
)
select *
from (
    select *,
           row_number() over (partition by region order by sales desc) as rn
    from region_product_sales
) ranked
where rn <= 5;


-- find month over month growth comparison for 2022 and 2023 sales
-- eg : jan 2022 vs jan 2023
with monthly_sales as (
    select year(order_date)  as order_year,
           month(order_date) as order_month,
           sum(sale_price)   as sales
    from df_orders
    group by year(order_date), month(order_date)
)
select order_month,
       sum(case when order_year = 2022 then sales else 0 end) as sales_2022,
       sum(case when order_year = 2023 then sales else 0 end) as sales_2023
from monthly_sales
group by order_month
order by order_month;


-- for each category which month had highest sales
with category_month_sales as (
    select category,
           format(order_date, 'yyyyMM') as order_year_month,
           sum(sale_price)              as sales
    from df_orders
    group by category, format(order_date, 'yyyyMM')
)
select *
from (
    select *,
           row_number() over (partition by category order by sales desc) as rn
    from category_month_sales
) ranked
where rn = 1;


-- which sub category had highest growth by profit in 2023 compared to 2022
-- Growth is the absolute difference (profit_2023 - profit_2022), not a
-- percentage. The aggregate is sum(profit) to match the question; the
-- earlier version summed sale_price.
with yearly_profit as (
    select sub_category,
           year(order_date) as order_year,
           sum(profit)      as profit
    from df_orders
    group by sub_category, year(order_date)
),
profit_by_year as (
    select sub_category,
           sum(case when order_year = 2022 then profit else 0 end) as profit_2022,
           sum(case when order_year = 2023 then profit else 0 end) as profit_2023
    from yearly_profit
    group by sub_category
)
select top 1
       *,
       (profit_2023 - profit_2022) as profit_growth
from profit_by_year
order by (profit_2023 - profit_2022) desc;
--------------------------------------------------------------------------------