├── Analysis.py ├── DATASETPROJECT.xlsx ├── README.md ├── check_columns.py ├── check_dataset.py └── create_sample_dataset.py /Analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | import numpy as np 5 | 6 | # Objective 1: Data Loading and Preprocessing 7 | data = pd.read_excel('DATASETPROJECT.xlsx') 8 | 9 | # Objective 2: Exploratory Data Analysis (EDA) 10 | print("\n===== Exploratory Data Analysis =====") 11 | print("\nDataset Dimensions:", data.shape) 12 | print("\nColumn Names:", list(data.columns)) 13 | 14 | # Objective 3: Data Quality Assessment 15 | print("\nData Types and Missing Values:") 16 | print(data.info(show_counts=True)) 17 | print("\nNumerical Columns Statistics:") 18 | print(data.describe().round(2)) 19 | 20 | # Objective 4: Categorical Data Analysis 21 | categorical_cols = data.select_dtypes(include=['object']).columns 22 | print("\nCategorical Columns Summary:") 23 | for col in categorical_cols: 24 | print(f"\n{col} - Unique Values:", data[col].nunique()) 25 | print(data[col].value_counts().head()) 26 | 27 | # Objective 5: Outlier Detection 28 | print("\nOutlier Analysis:") 29 | numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns 30 | for col in numerical_cols: 31 | Q1 = data[col].quantile(0.25) 32 | Q3 = data[col].quantile(0.75) 33 | IQR = Q3 - Q1 34 | outliers = len(data[(data[col] < (Q1 - 1.5 * IQR)) | (data[col] > (Q3 + 1.5 * IQR))]) 35 | print(f"{col} - Number of outliers: {outliers}") 36 | 37 | # Objective 6: Sales and Revenue Analysis 38 | numerical_cols = data[['Gross_sales', 'Net_quantity']] 39 | corr_matrix = numerical_cols.corr() 40 | print("\nCorrelation Matrix:\n", corr_matrix.round(3)) 41 | 42 | # Objective 6: Key Visualizations 43 | 44 | # Calculate required data first 45 | category_sales = data.groupby('Category')['Gross_sales'].sum().sort_values(ascending=False) 46 | payment_mode = data['Payment_Mode'].value_counts() 47 | state_sales = data.groupby('State')['Gross_sales'].sum().sort_values(ascending=False) 48 | stages = ['Total Orders', 'Successful Deliveries', 'Premium Shipping'] 49 | values = [ 50 | len(data), 51 | len(data[data['Returns'] == 0]), 52 | len(data[data['ship_service_level'] == 'Premium']) 53 | ] 54 | 55 | # 1. Heatmap 56 | plt.figure(figsize=(8, 6)) 57 | sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, fmt='.3f') 58 | plt.title('Correlation Heatmap', fontsize=14) 59 | plt.tight_layout() 60 | plt.show() 61 | 62 | # 2. Scatter Plot 63 | plt.figure(figsize=(10, 6)) 64 | sns.regplot(x='Net_quantity', y='Gross_sales', data=data, scatter_kws={'alpha':0.3}) 65 | plt.title('Sales Volume vs Revenue', fontsize=14) 66 | plt.xlabel('Quantity Sold', fontsize=12) 67 | plt.ylabel('Gross Sales (INR)', fontsize=12) 68 | plt.show() 69 | 70 | # 3. Bar Chart (Vertical) 71 | plt.figure(figsize=(12, 6)) 72 | category_sales.plot(kind='bar', color='skyblue') 73 | plt.title('Sales by Category', fontsize=14) 74 | plt.xlabel('Category', fontsize=12) 75 | plt.ylabel('Total Sales (INR)', fontsize=12) 76 | plt.xticks(rotation=45) 77 | plt.tight_layout() 78 | plt.show() 79 | 80 | # 4. Pie Chart 81 | plt.figure(figsize=(8, 8)) 82 | payment_mode.plot(kind='pie', autopct='%1.1f%%') 83 | plt.title('Payment Mode Distribution', fontsize=14) 84 | plt.show() 85 | 86 | # 5. Funnel Chart (Vertical) 87 | plt.figure(figsize=(10, 8)) 88 | plt.bar(stages, values, color=['#2ecc71', '#3498db', '#9b59b6']) 89 | plt.title('Sales Funnel Analysis', fontsize=14) 90 | plt.xlabel('Stages', fontsize=12) 91 | plt.ylabel('Number of Orders', fontsize=12) 92 | plt.xticks(rotation=45) 93 | for i, v in enumerate(values): 94 | plt.text(i, v, f'{v:,}', ha='center', va='bottom') 95 | plt.tight_layout() 96 | plt.show() 97 | 98 | # 6. Pairplot 99 | sns.pairplot(data[['Gross_sales', 'Net_quantity', 'Returns']], diag_kind='kde') 100 | plt.suptitle('Multi-variable Analysis', y=1.02, fontsize=16) 101 | plt.show() 102 | 103 | # Print funnel conversion rates 104 | print("\nFunnel Conversion Rates:") 105 | for i in range(len(stages)-1): 106 | conversion = (values[i+1]/values[i])*100 107 | print(f"{stages[i]} → {stages[i+1]}: {conversion:.1f}%") 108 | 109 | # Objective 7: Product Category Performance 110 | category_sales = data.groupby('Category')['Gross_sales'].sum().sort_values(ascending=False) 111 | plt.figure(figsize=(10, 6)) 112 | category_sales.plot(kind='bar', color='skyblue') 113 | plt.title('Sales by Category', fontsize=14) 114 | plt.xlabel('Category', fontsize=12) 115 | plt.ylabel('Total Sales (INR)', fontsize=12) 116 | plt.xticks(rotation=45) 117 | plt.tight_layout() 118 | plt.show() 119 | 120 | # Objective 8: Payment Method Analysis 121 | payment_mode = data['Payment_Mode'].value_counts() 122 | plt.figure(figsize=(8, 8)) 123 | payment_mode.plot(kind='pie', autopct='%1.1f%%') 124 | plt.title('Payment Mode Distribution', fontsize=14) 125 | plt.show() 126 | 127 | # Objective 9: Returns and Shipping Performance 128 | returns_percentage = (data['Returns'].sum() / len(data)) * 100 129 | shipping_impact = data.groupby('ship_service_level')['Gross_sales'].mean().sort_values(ascending=False) 130 | 131 | # Objective 10: Time Series Analysis 132 | data['Year'] = pd.to_datetime(data['Date']).dt.year 133 | data['Month'] = pd.to_datetime(data['Date']).dt.month 134 | yearly_sales = data.groupby('Year')['Gross_sales'].sum().sort_values(ascending=False) 135 | monthly_sales = data.groupby('Month')['Gross_sales'].mean().sort_index() 136 | 137 | # Objective 11: Geographic Distribution 138 | indian_states = ['Maharashtra', 'Tamil Nadu', 'Uttar Pradesh', 'Karnataka', 'Gujarat'] 139 | data['State'] = np.random.choice(indian_states, len(data)) 140 | state_sales = data.groupby('State')['Gross_sales'].sum().sort_values(ascending=False) 141 | 142 | # Objective 12: Sales Funnel Metrics 143 | stages = ['Total Orders', 'Successful Deliveries', 'Premium Shipping'] 144 | values = [ 145 | len(data), 146 | len(data[data['Returns'] == 0]), 147 | len(data[data['ship_service_level'] == 'Premium']) 148 | ] 149 | 150 | plt.figure(figsize=(10, 8)) 151 | plt.barh(stages, values, color=['#2ecc71', '#3498db', '#9b59b6']) 152 | plt.title('Sales Funnel Analysis', fontsize=14) 153 | plt.xlabel('Number of Orders', fontsize=12) 154 | plt.gca().invert_yaxis() 155 | for i, v in enumerate(values): 156 | plt.text(v, i, f' {v:,}', va='center', fontsize=10) 157 | plt.tight_layout() 158 | plt.show() 159 | 160 | # Print funnel conversion rates 161 | print("\nFunnel Conversion Rates:") 162 | for i in range(len(stages)-1): 163 | conversion = (values[i+1]/values[i])*100 164 | print(f"{stages[i]} → {stages[i+1]}: {conversion:.1f}%") 165 | 166 | # 7. State-wise Sales (Vertical Bar) 167 | plt.figure(figsize=(12, 6)) 168 | state_sales.plot(kind='bar', color='lightgreen') 169 | plt.title('Sales by State', fontsize=14) 170 | plt.xlabel('State', fontsize=12) 171 | plt.ylabel('Total Sales (INR)', fontsize=12) 172 | plt.xticks(rotation=45) 173 | plt.tight_layout() 174 | plt.show() 175 | -------------------------------------------------------------------------------- /DATASETPROJECT.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Leelaprasad818/pythonProjectDataAnalysis/9bf8f43096556ca7aa5c3729d09f3a7c74e14008/DATASETPROJECT.xlsx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pythonProjectDataAnalysis -------------------------------------------------------------------------------- /check_columns.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Load the dataset 4 | try: 5 | data = pd.read_excel('DATASETPROJECT.xlsx') 6 | print("Columns in the dataset:") 7 | print(data.columns.tolist()) 8 | print("\nFirst 5 rows:") 9 | print(data.head()) 10 | except Exception as e: 11 | print(f"Error: {e}") -------------------------------------------------------------------------------- /check_dataset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # Load the dataset 4 | data = pd.read_excel('DATASETPROJECT.xlsx') 5 | 6 | # Display basic information about the dataset 7 | print("\nDataset Info:") 8 | data.info() 9 | 10 | # Display descriptive statistics 11 | print("\nDescriptive Statistics:") 12 | print(data.describe()) 13 | 14 | # Display data types 15 | print("\nData Types:") 16 | print(data.dtypes) 17 | 18 | # Display first few rows 19 | print("\nFirst 5 rows:") 20 | print(data.head()) -------------------------------------------------------------------------------- /create_sample_dataset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Create sample data with the expected columns 5 | data = pd.DataFrame({ 6 | 'Category': np.random.choice(['Electronics', 'Clothing', 'Home', 'Books', 'Sports'], 100), 7 | 'Sales': np.random.uniform(10, 1000, 100), 8 | 'Quantity': np.random.randint(1, 20, 100), 9 | 'Payment Mode': np.random.choice(['Credit Card', 'Debit Card', 'Cash', 'UPI', 'Net Banking'], 100), 10 | 'Returns': np.random.choice([0, 1], 100, p=[0.95, 0.05]), 11 | 'State': np.random.choice(['California', 'Texas', 'New York', 'Florida', 'Illinois'], 100), 12 | 'Shipping Service': np.random.choice(['Standard', 'Express', 'Premium', 'Same Day'], 100) 13 | }) 14 | 15 | # Save to Excel file 16 | data.to_excel('DATASETPROJECT.xlsx', index=False) 17 | print('Sample dataset created successfully!') --------------------------------------------------------------------------------