├── Analysis.py
├── DATASETPROJECT.xlsx
├── README.md
├── check_columns.py
├── check_dataset.py
└── create_sample_dataset.py


/Analysis.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import matplotlib.pyplot as plt
  3 | import seaborn as sns
  4 | import numpy as np
  5 | 
  6 | # Objective 1: Data Loading and Preprocessing
  7 | data = pd.read_excel('DATASETPROJECT.xlsx')
  8 | 
  9 | # Objective 2: Exploratory Data Analysis (EDA)
 10 | print("\n===== Exploratory Data Analysis =====")
 11 | print("\nDataset Dimensions:", data.shape)
 12 | print("\nColumn Names:", list(data.columns))
 13 | 
 14 | # Objective 3: Data Quality Assessment
 15 | print("\nData Types and Missing Values:")
 16 | print(data.info(show_counts=True))
 17 | print("\nNumerical Columns Statistics:")
 18 | print(data.describe().round(2))
 19 | 
 20 | # Objective 4: Categorical Data Analysis
 21 | categorical_cols = data.select_dtypes(include=['object']).columns
 22 | print("\nCategorical Columns Summary:")
 23 | for col in categorical_cols:
 24 |     print(f"\n{col} - Unique Values:", data[col].nunique())
 25 |     print(data[col].value_counts().head())
 26 | 
 27 | # Objective 5: Outlier Detection
 28 | print("\nOutlier Analysis:")
 29 | numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
 30 | for col in numerical_cols:
 31 |     Q1 = data[col].quantile(0.25)
 32 |     Q3 = data[col].quantile(0.75)
 33 |     IQR = Q3 - Q1
 34 |     outliers = len(data[(data[col] < (Q1 - 1.5 * IQR)) | (data[col] > (Q3 + 1.5 * IQR))])
 35 |     print(f"{col} - Number of outliers: {outliers}")
 36 | 
 37 | # Objective 6: Sales and Revenue Analysis
 38 | numerical_cols = data[['Gross_sales', 'Net_quantity']]
 39 | corr_matrix = numerical_cols.corr()
 40 | print("\nCorrelation Matrix:\n", corr_matrix.round(3))
 41 | 
 42 | # Objective 6: Key Visualizations
 43 | 
 44 | # Calculate required data first
 45 | category_sales = data.groupby('Category')['Gross_sales'].sum().sort_values(ascending=False)
 46 | payment_mode = data['Payment_Mode'].value_counts()
 47 | state_sales = data.groupby('State')['Gross_sales'].sum().sort_values(ascending=False)
 48 | stages = ['Total Orders', 'Successful Deliveries', 'Premium Shipping']
 49 | values = [
 50 |     len(data),
 51 |     len(data[data['Returns'] == 0]),
 52 |     len(data[data['ship_service_level'] == 'Premium'])
 53 | ]
 54 | 
 55 | # 1. Heatmap
 56 | plt.figure(figsize=(8, 6))
 57 | sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, fmt='.3f')
 58 | plt.title('Correlation Heatmap', fontsize=14)
 59 | plt.tight_layout()
 60 | plt.show()
 61 | 
 62 | # 2. Scatter Plot
 63 | plt.figure(figsize=(10, 6))
 64 | sns.regplot(x='Net_quantity', y='Gross_sales', data=data, scatter_kws={'alpha':0.3})
 65 | plt.title('Sales Volume vs Revenue', fontsize=14)
 66 | plt.xlabel('Quantity Sold', fontsize=12)
 67 | plt.ylabel('Gross Sales (INR)', fontsize=12)
 68 | plt.show()
 69 | 
 70 | # 3. Bar Chart (Vertical)
 71 | plt.figure(figsize=(12, 6))
 72 | category_sales.plot(kind='bar', color='skyblue')
 73 | plt.title('Sales by Category', fontsize=14)
 74 | plt.xlabel('Category', fontsize=12)
 75 | plt.ylabel('Total Sales (INR)', fontsize=12)
 76 | plt.xticks(rotation=45)
 77 | plt.tight_layout()
 78 | plt.show()
 79 | 
 80 | # 4. Pie Chart
 81 | plt.figure(figsize=(8, 8))
 82 | payment_mode.plot(kind='pie', autopct='%1.1f%%')
 83 | plt.title('Payment Mode Distribution', fontsize=14)
 84 | plt.show()
 85 | 
 86 | # 5. Funnel Chart (Vertical)
 87 | plt.figure(figsize=(10, 8))
 88 | plt.bar(stages, values, color=['#2ecc71', '#3498db', '#9b59b6'])
 89 | plt.title('Sales Funnel Analysis', fontsize=14)
 90 | plt.xlabel('Stages', fontsize=12)
 91 | plt.ylabel('Number of Orders', fontsize=12)
 92 | plt.xticks(rotation=45)
 93 | for i, v in enumerate(values):
 94 |     plt.text(i, v, f'{v:,}', ha='center', va='bottom')
 95 | plt.tight_layout()
 96 | plt.show()
 97 | 
 98 | # 6. Pairplot
 99 | sns.pairplot(data[['Gross_sales', 'Net_quantity', 'Returns']], diag_kind='kde')
100 | plt.suptitle('Multi-variable Analysis', y=1.02, fontsize=16)
101 | plt.show()
102 | 
103 | # Print funnel conversion rates
104 | print("\nFunnel Conversion Rates:")
105 | for i in range(len(stages)-1):
106 |     conversion = (values[i+1]/values[i])*100
107 |     print(f"{stages[i]} → {stages[i+1]}: {conversion:.1f}%")
108 | 
109 | # Objective 7: Product Category Performance
110 | category_sales = data.groupby('Category')['Gross_sales'].sum().sort_values(ascending=False)
111 | plt.figure(figsize=(10, 6))
112 | category_sales.plot(kind='bar', color='skyblue')
113 | plt.title('Sales by Category', fontsize=14)
114 | plt.xlabel('Category', fontsize=12)
115 | plt.ylabel('Total Sales (INR)', fontsize=12)
116 | plt.xticks(rotation=45)
117 | plt.tight_layout()
118 | plt.show()
119 | 
120 | # Objective 8: Payment Method Analysis
121 | payment_mode = data['Payment_Mode'].value_counts()
122 | plt.figure(figsize=(8, 8))
123 | payment_mode.plot(kind='pie', autopct='%1.1f%%')
124 | plt.title('Payment Mode Distribution', fontsize=14)
125 | plt.show()
126 | 
127 | # Objective 9: Returns and Shipping Performance
128 | returns_percentage = (data['Returns'].sum() / len(data)) * 100
129 | shipping_impact = data.groupby('ship_service_level')['Gross_sales'].mean().sort_values(ascending=False)
130 | 
131 | # Objective 10: Time Series Analysis
132 | data['Year'] = pd.to_datetime(data['Date']).dt.year
133 | data['Month'] = pd.to_datetime(data['Date']).dt.month
134 | yearly_sales = data.groupby('Year')['Gross_sales'].sum().sort_values(ascending=False)
135 | monthly_sales = data.groupby('Month')['Gross_sales'].mean().sort_index()
136 | 
137 | # Objective 11: Geographic Distribution
138 | indian_states = ['Maharashtra', 'Tamil Nadu', 'Uttar Pradesh', 'Karnataka', 'Gujarat']
139 | data['State'] = np.random.choice(indian_states, len(data))
140 | state_sales = data.groupby('State')['Gross_sales'].sum().sort_values(ascending=False)
141 | 
142 | # Objective 12: Sales Funnel Metrics
143 | stages = ['Total Orders', 'Successful Deliveries', 'Premium Shipping']
144 | values = [
145 |     len(data),
146 |     len(data[data['Returns'] == 0]),
147 |     len(data[data['ship_service_level'] == 'Premium'])
148 | ]
149 | 
150 | plt.figure(figsize=(10, 8))
151 | plt.barh(stages, values, color=['#2ecc71', '#3498db', '#9b59b6'])
152 | plt.title('Sales Funnel Analysis', fontsize=14)
153 | plt.xlabel('Number of Orders', fontsize=12)
154 | plt.gca().invert_yaxis()
155 | for i, v in enumerate(values):
156 |     plt.text(v, i, f' {v:,}', va='center', fontsize=10)
157 | plt.tight_layout()
158 | plt.show()
159 | 
160 | # Print funnel conversion rates
161 | print("\nFunnel Conversion Rates:")
162 | for i in range(len(stages)-1):
163 |     conversion = (values[i+1]/values[i])*100
164 |     print(f"{stages[i]} → {stages[i+1]}: {conversion:.1f}%")
165 | 
166 | # 7. State-wise Sales (Vertical Bar)
167 | plt.figure(figsize=(12, 6))
168 | state_sales.plot(kind='bar', color='lightgreen')
169 | plt.title('Sales by State', fontsize=14)
170 | plt.xlabel('State', fontsize=12)
171 | plt.ylabel('Total Sales (INR)', fontsize=12)
172 | plt.xticks(rotation=45)
173 | plt.tight_layout()
174 | plt.show()
175 | 


--------------------------------------------------------------------------------
/DATASETPROJECT.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Leelaprasad818/pythonProjectDataAnalysis/9bf8f43096556ca7aa5c3729d09f3a7c74e14008/DATASETPROJECT.xlsx


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pythonProjectDataAnalysis


--------------------------------------------------------------------------------
/check_columns.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | # Load the dataset
 4 | try:
 5 |     data = pd.read_excel('DATASETPROJECT.xlsx')
 6 |     print("Columns in the dataset:")
 7 |     print(data.columns.tolist())
 8 |     print("\nFirst 5 rows:")
 9 |     print(data.head())
10 | except Exception as e:
11 |     print(f"Error: {e}")


--------------------------------------------------------------------------------
/check_dataset.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | # Load the dataset
 4 | data = pd.read_excel('DATASETPROJECT.xlsx')
 5 | 
 6 | # Display basic information about the dataset
 7 | print("\nDataset Info:")
 8 | data.info()
 9 | 
10 | # Display descriptive statistics
11 | print("\nDescriptive Statistics:")
12 | print(data.describe())
13 | 
14 | # Display data types
15 | print("\nData Types:")
16 | print(data.dtypes)
17 | 
18 | # Display first few rows
19 | print("\nFirst 5 rows:")
20 | print(data.head())


--------------------------------------------------------------------------------
/create_sample_dataset.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | 
 4 | # Create sample data with the expected columns
 5 | data = pd.DataFrame({
 6 |     'Category': np.random.choice(['Electronics', 'Clothing', 'Home', 'Books', 'Sports'], 100),
 7 |     'Sales': np.random.uniform(10, 1000, 100),
 8 |     'Quantity': np.random.randint(1, 20, 100),
 9 |     'Payment Mode': np.random.choice(['Credit Card', 'Debit Card', 'Cash', 'UPI', 'Net Banking'], 100),
10 |     'Returns': np.random.choice([0, 1], 100, p=[0.95, 0.05]),
11 |     'State': np.random.choice(['California', 'Texas', 'New York', 'Florida', 'Illinois'], 100),
12 |     'Shipping Service': np.random.choice(['Standard', 'Express', 'Premium', 'Same Day'], 100)
13 | })
14 | 
15 | # Save to Excel file
16 | data.to_excel('DATASETPROJECT.xlsx', index=False)
17 | print('Sample dataset created successfully!')


--------------------------------------------------------------------------------