├── 1.py └── code.ipynb /1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | # Load your dataset 7 | df = pd.read_csv("/Users/abhaykumargupta/Desktop/pythonCA2/Warehouse_and_Retail_Sales.csv") 8 | 9 | # Basic information 10 | print(" Dataset Overview") 11 | print(df.info()) 12 | print("/n First 5 Rows") 13 | print(df.head()) 14 | 15 | # Create Date column 16 | df['Date'] = pd.to_datetime(dict(year=df['YEAR'], month=df['MONTH'], day=1)) 17 | 18 | # Create Total Sales column 19 | df['Total Sales'] = df['RETAIL SALES'] + df['WAREHOUSE SALES'] 20 | 21 | # Check missing values 22 | print("/n Missing Values:") 23 | print(df.isnull().sum()) 24 | 25 | # Check for duplicates 26 | print("/n Duplicate Records:", df.duplicated().sum()) 27 | 28 | # Drop duplicates 29 | df.drop_duplicates(inplace=True) 30 | 31 | # Summary statistics 32 | print("/n Summary Statistics:") 33 | print(df.describe()) 34 | 35 | # Unique product and category info 36 | print("/n Unique Products:", df['ITEM DESCRIPTION'].nunique()) 37 | print(" Item Types:", df['ITEM TYPE'].unique()) 38 | 39 | # Visualization 1: Distribution of Total Sales 40 | plt.figure(figsize=(10, 5)) 41 | sns.histplot(df['Total Sales'], bins=50, kde=True, color='skyblue') 42 | plt.title("Distribution of Total Sales") 43 | plt.xlabel("Total Sales ($)") 44 | plt.ylabel("Frequency") 45 | plt.tight_layout() 46 | plt.show() 47 | 48 | # Visualization 2: Total Sales by Item Type 49 | plt.figure(figsize=(10, 5)) 50 | sns.barplot( 51 | data=df, 52 | x='ITEM TYPE', 53 | y='Total Sales', 54 | estimator=sum, 55 | hue='ITEM TYPE', 56 | palette='viridis', 57 | legend=False, 58 | errorbar=None 59 | ) 60 | plt.title("Total Sales by Item Type") 61 | plt.xlabel("Item Type") 62 | plt.ylabel("Total Sales ($)") 63 | plt.xticks(rotation=45) 64 | plt.tight_layout() 65 | plt.show() 66 | 67 | 68 | # Basic information 69 | print(" Dataset Overview") 70 | print(df.info()) 71 | print("\n First 5 Rows") 72 | print(df.head()) 73 | 74 | # Create Date column 75 | df['Date'] = pd.to_datetime(dict(year=df['YEAR'], month=df['MONTH'], day=1)) 76 | 77 | # Create Total Sales column 78 | df['Total Sales'] = df['RETAIL SALES'] + df['WAREHOUSE SALES'] 79 | 80 | # Check missing values 81 | print("\n Missing Values:") 82 | print(df.isnull().sum()) 83 | 84 | # Check for duplicates 85 | print("\n Duplicate Records:", df.duplicated().sum()) 86 | 87 | # Drop duplicates 88 | df.drop_duplicates(inplace=True) 89 | 90 | # Summary statistics 91 | print("\n Summary Statistics:") 92 | print(df.describe()) 93 | 94 | # Unique product and category info 95 | print("\n Unique Products:", df['ITEM DESCRIPTION'].nunique()) 96 | print(" Item Types:", df['ITEM TYPE'].unique()) 97 | 98 | # Visualization 1: Distribution of Total Sales 99 | plt.figure(figsize=(10, 5)) 100 | sns.histplot(df['Total Sales'], bins=50, kde=True, color='skyblue') 101 | plt.title("Distribution of Total Sales") 102 | plt.xlabel("Total Sales ($)") 103 | plt.ylabel("Frequency") 104 | plt.tight_layout() 105 | plt.show() 106 | # Visualization 2: Total Sales by Item Type 107 | plt.figure(figsize=(10, 5)) 108 | sns.barplot( 109 | data=df, 110 | x='ITEM TYPE', 111 | y='Total Sales', 112 | estimator=sum, 113 | hue='ITEM TYPE', 114 | palette='viridis', 115 | legend=False, 116 | errorbar=None 117 | ) 118 | plt.title("Total Sales by Item Type") 119 | plt.xlabel("Item Type") 120 | plt.ylabel("Total Sales ($)") 121 | plt.xticks(rotation=45) 122 | plt.tight_layout() 123 | plt.show() 124 | 125 | 126 | # Create Date column 127 | df['Date'] = pd.to_datetime(dict(year=df['YEAR'], month=df['MONTH'], day=1)) 128 | 129 | # Create Total Sales column 130 | df['Total Sales'] = df['RETAIL SALES'] + df['WAREHOUSE SALES'] 131 | 132 | # Check missing values 133 | print("\n Missing Values:") 134 | print(df.isnull().sum()) 135 | 136 | # Check for duplicates 137 | print("\n Duplicate Records:", df.duplicated().sum()) 138 | 139 | # Drop duplicates 140 | df.drop_duplicates(inplace=True) 141 | 142 | # Summary statistics 143 | print("\n Summary Statistics:") 144 | print(df.describe()) 145 | 146 | # Unique product and category info 147 | print("\n Unique Products:", df['ITEM DESCRIPTION'].nunique()) 148 | print(" Item Types:", df['ITEM TYPE'].unique()) 149 | 150 | # Visualization 1: Distribution of Total Sales 151 | plt.figure(figsize=(10, 5)) 152 | sns.histplot(df['Total Sales'], bins=50, kde=True, color='skyblue') 153 | plt.title("Distribution of Total Sales") 154 | plt.xlabel("Total Sales ($)") 155 | plt.ylabel("Frequency") 156 | plt.tight_layout() 157 | plt.show() 158 | 159 | # Visualization 2: Total Sales by Item Type 160 | plt.figure(figsize=(10, 5)) 161 | sns.barplot( 162 | data=df, 163 | x='ITEM TYPE', 164 | y='Total Sales', 165 | estimator=sum, 166 | hue='ITEM TYPE', 167 | palette='viridis', 168 | legend=False, 169 | errorbar=None 170 | ) 171 | plt.title("Total Sales by Item Type") 172 | plt.xlabel("Item Type") 173 | plt.ylabel("Total Sales ($)") 174 | plt.xticks(rotation=45) 175 | plt.tight_layout() 176 | plt.show() 177 | 178 | # Objective 1: Sales Trend Analysis Over Time 179 | # Create a proper datetime column from YEAR and MONTH 180 | df['Date'] = pd.to_datetime(dict(year=df['YEAR'], month=df['MONTH'], day=1)) 181 | 182 | # Calculate Total Sales 183 | df['Total Sales'] = df['RETAIL SALES'] + df['WAREHOUSE SALES'] 184 | 185 | # Group by Date (monthly) 186 | monthly_sales = df.groupby('Date')['Total Sales'].sum().reset_index() 187 | 188 | # Group by Year 189 | yearly_sales = df.groupby('YEAR')['Total Sales'].sum().reset_index() 190 | 191 | # Group by Quarter 192 | df['Quarter'] = df['Date'].dt.to_period('Q') 193 | quarterly_sales = df.groupby('Quarter')['Total Sales'].sum().reset_index() 194 | quarterly_sales['Quarter'] = quarterly_sales['Quarter'].astype(str) # Convert Period to string 195 | 196 | # Plot Monthly Sales Trend 197 | plt.figure(figsize=(12, 6)) 198 | sns.lineplot(data=monthly_sales, x='Date', y='Total Sales', marker='o', color='steelblue') 199 | plt.title("Monthly Sales Trend") 200 | plt.xlabel("Date") 201 | plt.ylabel("Total Sales ($)") 202 | plt.grid(True) 203 | plt.tight_layout() 204 | plt.show() 205 | 206 | # Plot Yearly Sales Summary 207 | plt.figure(figsize=(8, 5)) 208 | sns.barplot( 209 | data=yearly_sales, 210 | x='YEAR', 211 | y='Total Sales', 212 | hue='YEAR', 213 | palette='crest', 214 | legend=False, 215 | errorbar=None 216 | ) 217 | plt.title("Yearly Total Sales") 218 | 219 | plt.ylabel("Total Sales ($)") 220 | plt.tight_layout() 221 | plt.show() 222 | 223 | # Plot Quarterly Sales Trend 224 | plt.figure(figsize=(10, 5)) 225 | sns.lineplot(data=quarterly_sales, x='Quarter', y='Total Sales', marker='o', color='green') 226 | plt.title("Quarterly Sales Trend") 227 | plt.xlabel("Quarter") 228 | plt.ylabel("Total Sales ($)") 229 | plt.xticks(rotation=45) 230 | plt.tight_layout() 231 | plt.show() 232 | 233 | # Objective 2: Comparison Between Warehouse and Retail Sales 234 | # Create Total Sales column if not already present 235 | df['Total Sales'] = df['RETAIL SALES'] + df['WAREHOUSE SALES'] 236 | 237 | # Create a long-form DataFrame for Retail vs Warehouse 238 | sales_by_type = pd.melt( 239 | df, 240 | id_vars=['Date'] if 'Date' in df.columns else None, 241 | value_vars=['RETAIL SALES', 'WAREHOUSE SALES'], 242 | var_name='Sales Channel', 243 | value_name='Sales' 244 | ) 245 | 246 | # Clean channel names 247 | sales_by_type['Sales Channel'] = sales_by_type['Sales Channel'].str.replace(' SALES', '') 248 | 249 | # Total and average sales by channel 250 | channel_summary = sales_by_type.groupby('Sales Channel')['Sales'].agg(['sum', 'mean']).reset_index() 251 | print("Sales Summary (Total & Average):") 252 | print(channel_summary) 253 | 254 | # Bar Plot: Total Sales by Channel 255 | plt.figure(figsize=(6, 5)) 256 | sns.barplot( 257 | data=channel_summary, 258 | x='Sales Channel', 259 | y='sum', 260 | hue='Sales Channel', 261 | palette='Set2', 262 | legend=False, 263 | errorbar=None 264 | ) 265 | plt.title("Total Sales by Channel") 266 | plt.ylabel("Total Sales ($)") 267 | plt.tight_layout() 268 | plt.show() 269 | 270 | 271 | # Pie Chart: Contribution of Each Channel 272 | plt.figure(figsize=(6, 6)) 273 | plt.pie( 274 | channel_summary['sum'], 275 | labels=channel_summary['Sales Channel'], 276 | autopct='%1.1f%%', 277 | colors=sns.color_palette('Set2') 278 | ) 279 | plt.title("Sales Contribution: Retail vs Warehouse") 280 | plt.tight_layout() 281 | plt.show() 282 | 283 | # Objective 3: Top Performing Products 284 | 285 | # Create Total Sales column (if not already added) 286 | df['Total Sales'] = df['RETAIL SALES'] + df['WAREHOUSE SALES'] 287 | 288 | # 1️⃣ Top Products by Total Sales 289 | top_products_total = df.groupby('ITEM TYPE')['Total Sales'].sum().sort_values(ascending=False).head(10).reset_index() 290 | 291 | # 2️⃣ Top Products by Retail Sales 292 | top_products_retail = df.groupby('ITEM TYPE')['RETAIL SALES'].sum().sort_values(ascending=False).head(10).reset_index() 293 | 294 | # 3️⃣ Top Products by Warehouse Sales 295 | top_products_warehouse = df.groupby('ITEM TYPE')['WAREHOUSE SALES'].sum().sort_values(ascending=False).head(10).reset_index() 296 | 297 | # 🔹 Horizontal Bar Plot for Top Products by Total Sales 298 | plt.figure(figsize=(10, 6)) 299 | sns.barplot( 300 | data=top_products_total, 301 | y='ITEM TYPE', 302 | x='Total Sales', 303 | hue='ITEM TYPE', 304 | palette='Blues_d', 305 | legend=False, 306 | errorbar=None # remove confidence intervals 307 | ) 308 | plt.title("Top 10 Products by Total Sales") 309 | plt.xlabel("Total Sales ($)") 310 | plt.ylabel("Product") 311 | plt.tight_layout() 312 | plt.show() 313 | 314 | # 🔹 Horizontal Bar Plot for Top Products by Retail Sales 315 | plt.figure(figsize=(10, 6)) 316 | sns.barplot( 317 | data=top_products_retail, 318 | y='ITEM TYPE', 319 | x='RETAIL SALES', 320 | hue='ITEM TYPE', 321 | palette='Greens_d', 322 | legend=False, 323 | errorbar=None 324 | ) 325 | plt.title("Top 10 Products by Retail Sales") 326 | plt.xlabel("Retail Sales ($)") 327 | plt.ylabel("Product") 328 | plt.tight_layout() 329 | plt.show() 330 | 331 | # 🔹 Horizontal Bar Plot for Top Products by Warehouse Sales 332 | plt.figure(figsize=(10, 6)) 333 | sns.barplot( 334 | data=top_products_warehouse, 335 | y='ITEM TYPE', 336 | x='WAREHOUSE SALES', 337 | hue='ITEM TYPE', 338 | palette='Oranges_d', 339 | legend=False, 340 | errorbar=None 341 | ) 342 | plt.title("Top 10 Products by Warehouse Sales") 343 | plt.xlabel("Warehouse Sales ($)") 344 | plt.ylabel("Product") 345 | plt.tight_layout() 346 | plt.show() 347 | 348 | # Objective 4: Correlation Between Product Price and Sales Volume 349 | 350 | # Filter out rows with 0 transfers to avoid divide-by-zero 351 | df_valid = df[df['RETAIL TRANSFERS'] > 0].copy() 352 | 353 | # Calculate approximate price per unit 354 | df_valid['PRICE PER UNIT'] = df_valid['RETAIL SALES'] / df_valid['RETAIL TRANSFERS'] 355 | 356 | # Display correlation 357 | correlation = df_valid[['PRICE PER UNIT', 'RETAIL TRANSFERS']].corr() 358 | print("🔗 Correlation Matrix:") 359 | print(correlation) 360 | 361 | # Scatter plot with regression line 362 | plt.figure(figsize=(8, 6)) 363 | sns.regplot(data=df_valid, x='PRICE PER UNIT', y='RETAIL TRANSFERS', scatter_kws={'alpha':0.5}) 364 | plt.title("Correlation Between Product Price and Quantity Sold") 365 | plt.xlabel("Price Per Unit ($)") 366 | plt.ylabel("Quantity Sold (Retail Transfers)") 367 | plt.tight_layout() 368 | plt.show() 369 | 370 | # Objective 5: Peak Sales Months 371 | 372 | # Create datetime column 373 | df['DATE'] = pd.to_datetime(df['YEAR'].astype(str) + '-' + df['MONTH'].astype(str) + '-01') 374 | 375 | # Extract month name for visualization 376 | df['MONTH_NAME'] = df['DATE'].dt.strftime('%B') 377 | 378 | # Calculate total sales 379 | df['TOTAL SALES'] = df['RETAIL SALES'] + df['WAREHOUSE SALES'] 380 | 381 | # Order the months 382 | month_order = ['January', 'February', 'March', 'April', 'May', 'June', 383 | 'July', 'August', 'September', 'October', 'November', 'December'] 384 | 385 | # Group and sort data 386 | monthly_sales = df.groupby('MONTH_NAME')['TOTAL SALES'].sum().reindex(month_order) 387 | 388 | # Plot the result without emoji and FutureWarning 389 | plt.figure(figsize=(10, 6)) 390 | sns.barplot(x=monthly_sales.index, y=monthly_sales.values) 391 | plt.xticks(rotation=45) 392 | plt.title("Total Sales by Month") 393 | plt.xlabel("Month") 394 | plt.ylabel("Total Sales") 395 | plt.tight_layout() 396 | plt.show() 397 | -------------------------------------------------------------------------------- /code.ipynb: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | # Load your dataset 7 | df = pd.read_csv("/Users/abhaykumargupta/Desktop/pythonCA2/Warehouse_and_Retail_Sales.csv") 8 | 9 | # Basic information 10 | print(" Dataset Overview") 11 | print(df.info()) 12 | print("/n First 5 Rows") 13 | print(df.head()) 14 | 15 | # Create Date column 16 | df['Date'] = pd.to_datetime(dict(year=df['YEAR'], month=df['MONTH'], day=1)) 17 | 18 | # Create Total Sales column 19 | df['Total Sales'] = df['RETAIL SALES'] + df['WAREHOUSE SALES'] 20 | 21 | # Check missing values 22 | print("/n Missing Values:") 23 | print(df.isnull().sum()) 24 | 25 | # Check for duplicates 26 | print("/n Duplicate Records:", df.duplicated().sum()) 27 | 28 | # Drop duplicates 29 | df.drop_duplicates(inplace=True) 30 | 31 | # Summary statistics 32 | print("/n Summary Statistics:") 33 | print(df.describe()) 34 | 35 | # Unique product and category info 36 | print("/n Unique Products:", df['ITEM DESCRIPTION'].nunique()) 37 | print(" Item Types:", df['ITEM TYPE'].unique()) 38 | 39 | # Visualization 1: Distribution of Total Sales 40 | plt.figure(figsize=(10, 5)) 41 | sns.histplot(df['Total Sales'], bins=50, kde=True, color='skyblue') 42 | plt.title("Distribution of Total Sales") 43 | plt.xlabel("Total Sales ($)") 44 | plt.ylabel("Frequency") 45 | plt.tight_layout() 46 | plt.show() 47 | 48 | # Visualization 2: Total Sales by Item Type 49 | plt.figure(figsize=(10, 5)) 50 | sns.barplot( 51 | data=df, 52 | x='ITEM TYPE', 53 | y='Total Sales', 54 | estimator=sum, 55 | hue='ITEM TYPE', 56 | palette='viridis', 57 | legend=False, 58 | errorbar=None 59 | ) 60 | plt.title("Total Sales by Item Type") 61 | plt.xlabel("Item Type") 62 | plt.ylabel("Total Sales ($)") 63 | plt.xticks(rotation=45) 64 | plt.tight_layout() 65 | plt.show() 66 | 67 | 68 | # Basic information 69 | print(" Dataset Overview") 70 | print(df.info()) 71 | print("\n First 5 Rows") 72 | print(df.head()) 73 | 74 | # Create Date column 75 | df['Date'] = pd.to_datetime(dict(year=df['YEAR'], month=df['MONTH'], day=1)) 76 | 77 | # Create Total Sales column 78 | df['Total Sales'] = df['RETAIL SALES'] + df['WAREHOUSE SALES'] 79 | 80 | # Check missing values 81 | print("\n Missing Values:") 82 | print(df.isnull().sum()) 83 | 84 | # Check for duplicates 85 | print("\n Duplicate Records:", df.duplicated().sum()) 86 | 87 | # Drop duplicates 88 | df.drop_duplicates(inplace=True) 89 | 90 | # Summary statistics 91 | print("\n Summary Statistics:") 92 | print(df.describe()) 93 | 94 | # Unique product and category info 95 | print("\n Unique Products:", df['ITEM DESCRIPTION'].nunique()) 96 | print(" Item Types:", df['ITEM TYPE'].unique()) 97 | 98 | # Visualization 1: Distribution of Total Sales 99 | plt.figure(figsize=(10, 5)) 100 | sns.histplot(df['Total Sales'], bins=50, kde=True, color='skyblue') 101 | plt.title("Distribution of Total Sales") 102 | plt.xlabel("Total Sales ($)") 103 | plt.ylabel("Frequency") 104 | plt.tight_layout() 105 | plt.show() 106 | # Visualization 2: Total Sales by Item Type 107 | plt.figure(figsize=(10, 5)) 108 | sns.barplot( 109 | data=df, 110 | x='ITEM TYPE', 111 | y='Total Sales', 112 | estimator=sum, 113 | hue='ITEM TYPE', 114 | palette='viridis', 115 | legend=False, 116 | errorbar=None 117 | ) 118 | plt.title("Total Sales by Item Type") 119 | plt.xlabel("Item Type") 120 | plt.ylabel("Total Sales ($)") 121 | plt.xticks(rotation=45) 122 | plt.tight_layout() 123 | plt.show() 124 | 125 | 126 | # Create Date column 127 | df['Date'] = pd.to_datetime(dict(year=df['YEAR'], month=df['MONTH'], day=1)) 128 | 129 | # Create Total Sales column 130 | df['Total Sales'] = df['RETAIL SALES'] + df['WAREHOUSE SALES'] 131 | 132 | # Check missing values 133 | print("\n Missing Values:") 134 | print(df.isnull().sum()) 135 | 136 | # Check for duplicates 137 | print("\n Duplicate Records:", df.duplicated().sum()) 138 | 139 | # Drop duplicates 140 | df.drop_duplicates(inplace=True) 141 | 142 | # Summary statistics 143 | print("\n Summary Statistics:") 144 | print(df.describe()) 145 | 146 | # Unique product and category info 147 | print("\n Unique Products:", df['ITEM DESCRIPTION'].nunique()) 148 | print(" Item Types:", df['ITEM TYPE'].unique()) 149 | 150 | # Visualization 1: Distribution of Total Sales 151 | plt.figure(figsize=(10, 5)) 152 | sns.histplot(df['Total Sales'], bins=50, kde=True, color='skyblue') 153 | plt.title("Distribution of Total Sales") 154 | plt.xlabel("Total Sales ($)") 155 | plt.ylabel("Frequency") 156 | plt.tight_layout() 157 | plt.show() 158 | 159 | # Visualization 2: Total Sales by Item Type 160 | plt.figure(figsize=(10, 5)) 161 | sns.barplot( 162 | data=df, 163 | x='ITEM TYPE', 164 | y='Total Sales', 165 | estimator=sum, 166 | hue='ITEM TYPE', 167 | palette='viridis', 168 | legend=False, 169 | errorbar=None 170 | ) 171 | plt.title("Total Sales by Item Type") 172 | plt.xlabel("Item Type") 173 | plt.ylabel("Total Sales ($)") 174 | plt.xticks(rotation=45) 175 | plt.tight_layout() 176 | plt.show() 177 | 178 | # Objective 1: Sales Trend Analysis Over Time 179 | # Create a proper datetime column from YEAR and MONTH 180 | df['Date'] = pd.to_datetime(dict(year=df['YEAR'], month=df['MONTH'], day=1)) 181 | 182 | # Calculate Total Sales 183 | df['Total Sales'] = df['RETAIL SALES'] + df['WAREHOUSE SALES'] 184 | 185 | # Group by Date (monthly) 186 | monthly_sales = df.groupby('Date')['Total Sales'].sum().reset_index() 187 | 188 | # Group by Year 189 | yearly_sales = df.groupby('YEAR')['Total Sales'].sum().reset_index() 190 | 191 | # Group by Quarter 192 | df['Quarter'] = df['Date'].dt.to_period('Q') 193 | quarterly_sales = df.groupby('Quarter')['Total Sales'].sum().reset_index() 194 | quarterly_sales['Quarter'] = quarterly_sales['Quarter'].astype(str) # Convert Period to string 195 | 196 | # Plot Monthly Sales Trend 197 | plt.figure(figsize=(12, 6)) 198 | sns.lineplot(data=monthly_sales, x='Date', y='Total Sales', marker='o', color='steelblue') 199 | plt.title("Monthly Sales Trend") 200 | plt.xlabel("Date") 201 | plt.ylabel("Total Sales ($)") 202 | plt.grid(True) 203 | plt.tight_layout() 204 | plt.show() 205 | 206 | # Plot Yearly Sales Summary 207 | plt.figure(figsize=(8, 5)) 208 | sns.barplot( 209 | data=yearly_sales, 210 | x='YEAR', 211 | y='Total Sales', 212 | hue='YEAR', 213 | palette='crest', 214 | legend=False, 215 | errorbar=None 216 | ) 217 | plt.title("Yearly Total Sales") 218 | 219 | plt.ylabel("Total Sales ($)") 220 | plt.tight_layout() 221 | plt.show() 222 | 223 | # Plot Quarterly Sales Trend 224 | plt.figure(figsize=(10, 5)) 225 | sns.lineplot(data=quarterly_sales, x='Quarter', y='Total Sales', marker='o', color='green') 226 | plt.title("Quarterly Sales Trend") 227 | plt.xlabel("Quarter") 228 | plt.ylabel("Total Sales ($)") 229 | plt.xticks(rotation=45) 230 | plt.tight_layout() 231 | plt.show() 232 | 233 | # Objective 2: Comparison Between Warehouse and Retail Sales 234 | # Create Total Sales column if not already present 235 | df['Total Sales'] = df['RETAIL SALES'] + df['WAREHOUSE SALES'] 236 | 237 | # Create a long-form DataFrame for Retail vs Warehouse 238 | sales_by_type = pd.melt( 239 | df, 240 | id_vars=['Date'] if 'Date' in df.columns else None, 241 | value_vars=['RETAIL SALES', 'WAREHOUSE SALES'], 242 | var_name='Sales Channel', 243 | value_name='Sales' 244 | ) 245 | 246 | # Clean channel names 247 | sales_by_type['Sales Channel'] = sales_by_type['Sales Channel'].str.replace(' SALES', '') 248 | 249 | # Total and average sales by channel 250 | channel_summary = sales_by_type.groupby('Sales Channel')['Sales'].agg(['sum', 'mean']).reset_index() 251 | print("Sales Summary (Total & Average):") 252 | print(channel_summary) 253 | 254 | # Bar Plot: Total Sales by Channel 255 | plt.figure(figsize=(6, 5)) 256 | sns.barplot( 257 | data=channel_summary, 258 | x='Sales Channel', 259 | y='sum', 260 | hue='Sales Channel', 261 | palette='Set2', 262 | legend=False, 263 | errorbar=None 264 | ) 265 | plt.title("Total Sales by Channel") 266 | plt.ylabel("Total Sales ($)") 267 | plt.tight_layout() 268 | plt.show() 269 | 270 | 271 | # Pie Chart: Contribution of Each Channel 272 | plt.figure(figsize=(6, 6)) 273 | plt.pie( 274 | channel_summary['sum'], 275 | labels=channel_summary['Sales Channel'], 276 | autopct='%1.1f%%', 277 | colors=sns.color_palette('Set2') 278 | ) 279 | plt.title("Sales Contribution: Retail vs Warehouse") 280 | plt.tight_layout() 281 | plt.show() 282 | 283 | # Objective 3: Top Performing Products 284 | 285 | # Create Total Sales column (if not already added) 286 | df['Total Sales'] = df['RETAIL SALES'] + df['WAREHOUSE SALES'] 287 | 288 | # 1️⃣ Top Products by Total Sales 289 | top_products_total = df.groupby('ITEM TYPE')['Total Sales'].sum().sort_values(ascending=False).head(10).reset_index() 290 | 291 | # 2️⃣ Top Products by Retail Sales 292 | top_products_retail = df.groupby('ITEM TYPE')['RETAIL SALES'].sum().sort_values(ascending=False).head(10).reset_index() 293 | 294 | # 3️⃣ Top Products by Warehouse Sales 295 | top_products_warehouse = df.groupby('ITEM TYPE')['WAREHOUSE SALES'].sum().sort_values(ascending=False).head(10).reset_index() 296 | 297 | # 🔹 Horizontal Bar Plot for Top Products by Total Sales 298 | plt.figure(figsize=(10, 6)) 299 | sns.barplot( 300 | data=top_products_total, 301 | y='ITEM TYPE', 302 | x='Total Sales', 303 | hue='ITEM TYPE', 304 | palette='Blues_d', 305 | legend=False, 306 | errorbar=None # remove confidence intervals 307 | ) 308 | plt.title("Top 10 Products by Total Sales") 309 | plt.xlabel("Total Sales ($)") 310 | plt.ylabel("Product") 311 | plt.tight_layout() 312 | plt.show() 313 | 314 | # 🔹 Horizontal Bar Plot for Top Products by Retail Sales 315 | plt.figure(figsize=(10, 6)) 316 | sns.barplot( 317 | data=top_products_retail, 318 | y='ITEM TYPE', 319 | x='RETAIL SALES', 320 | hue='ITEM TYPE', 321 | palette='Greens_d', 322 | legend=False, 323 | errorbar=None 324 | ) 325 | plt.title("Top 10 Products by Retail Sales") 326 | plt.xlabel("Retail Sales ($)") 327 | plt.ylabel("Product") 328 | plt.tight_layout() 329 | plt.show() 330 | 331 | # 🔹 Horizontal Bar Plot for Top Products by Warehouse Sales 332 | plt.figure(figsize=(10, 6)) 333 | sns.barplot( 334 | data=top_products_warehouse, 335 | y='ITEM TYPE', 336 | x='WAREHOUSE SALES', 337 | hue='ITEM TYPE', 338 | palette='Oranges_d', 339 | legend=False, 340 | errorbar=None 341 | ) 342 | plt.title("Top 10 Products by Warehouse Sales") 343 | plt.xlabel("Warehouse Sales ($)") 344 | plt.ylabel("Product") 345 | plt.tight_layout() 346 | plt.show() 347 | 348 | # Objective 4: Correlation Between Product Price and Sales Volume 349 | 350 | # Filter out rows with 0 transfers to avoid divide-by-zero 351 | df_valid = df[df['RETAIL TRANSFERS'] > 0].copy() 352 | 353 | # Calculate approximate price per unit 354 | df_valid['PRICE PER UNIT'] = df_valid['RETAIL SALES'] / df_valid['RETAIL TRANSFERS'] 355 | 356 | # Display correlation 357 | correlation = df_valid[['PRICE PER UNIT', 'RETAIL TRANSFERS']].corr() 358 | print("🔗 Correlation Matrix:") 359 | print(correlation) 360 | 361 | # Scatter plot with regression line 362 | plt.figure(figsize=(8, 6)) 363 | sns.regplot(data=df_valid, x='PRICE PER UNIT', y='RETAIL TRANSFERS', scatter_kws={'alpha':0.5}) 364 | plt.title("Correlation Between Product Price and Quantity Sold") 365 | plt.xlabel("Price Per Unit ($)") 366 | plt.ylabel("Quantity Sold (Retail Transfers)") 367 | plt.tight_layout() 368 | plt.show() 369 | 370 | # Objective 5: Peak Sales Months 371 | 372 | # Create datetime column 373 | df['DATE'] = pd.to_datetime(df['YEAR'].astype(str) + '-' + df['MONTH'].astype(str) + '-01') 374 | 375 | # Extract month name for visualization 376 | df['MONTH_NAME'] = df['DATE'].dt.strftime('%B') 377 | 378 | # Calculate total sales 379 | df['TOTAL SALES'] = df['RETAIL SALES'] + df['WAREHOUSE SALES'] 380 | 381 | # Order the months 382 | month_order = ['January', 'February', 'March', 'April', 'May', 'June', 383 | 'July', 'August', 'September', 'October', 'November', 'December'] 384 | 385 | # Group and sort data 386 | monthly_sales = df.groupby('MONTH_NAME')['TOTAL SALES'].sum().reindex(month_order) 387 | 388 | # Plot the result without emoji and FutureWarning 389 | plt.figure(figsize=(10, 6)) 390 | sns.barplot(x=monthly_sales.index, y=monthly_sales.values) 391 | plt.xticks(rotation=45) 392 | plt.title("Total Sales by Month") 393 | plt.xlabel("Month") 394 | plt.ylabel("Total Sales") 395 | plt.tight_layout() 396 | plt.show() 397 | --------------------------------------------------------------------------------