# Repository layout (preserved from the original dump header):
#   ├── CA2 python.docx
#   ├── README.md
#   ├── ca2.py
#   └── online_shoppers_intention.csv
#
# /CA2 python.docx:
#   https://raw.githubusercontent.com/Sanchichauhan/PYTHON-PROJECT-/37b8fd838d9b5520fd3e75ec0fc666514058d49f/CA2 python.docx
#
# /README.md: (empty — 11 blank lines)
#
# /ca2.py:
"""Exploratory analysis of the Online Shoppers Purchasing Intention dataset.

Running this file as a script:

1. loads ``online_shoppers_intention.csv`` (by default from the directory that
   contains this script; pass another path as the first command-line argument),
2. prints a structural overview of the data,
3. shows a series of seaborn/matplotlib figures, and
4. compares ``BounceRates`` between sessions with and without revenue using a
   two-sample z-test and Welch's t-test.
"""

import math
import sys
from pathlib import Path
from typing import Iterable, Optional, Sequence, Tuple

import pandas as pd
from scipy import stats

# The CSV ships next to this script in the repository, so resolve it relative
# to the script instead of relying on one user's absolute Downloads path.
DEFAULT_CSV_PATH = Path(__file__).resolve().parent / "online_shoppers_intention.csv"

MONTH_ORDER = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'June',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]


def load_dataset(csv_path=DEFAULT_CSV_PATH) -> pd.DataFrame:
    """Read the shoppers dataset from *csv_path* and return it as a DataFrame.

    Raises:
        FileNotFoundError: propagated from ``pandas.read_csv`` when the file
            does not exist.
    """
    return pd.read_csv(csv_path)


def describe_dataset(df: pd.DataFrame) -> None:
    """Print shape, columns, dtypes, missing values, a sample and summaries."""
    # 1. Shape of the dataset
    print(" Shape of the dataset:", df.shape)

    # 2. Column names
    print("\n Column names:")
    print(df.columns.tolist())

    # 3. Data types of each column
    print("\n Data types:")
    print(df.dtypes)

    # 4. Check for missing values
    print("\n Missing values per column:")
    print(df.isnull().sum())

    # 5. Show first 5 rows as sample
    print("\n First 5 rows of the dataset:")
    print(df.head())

    # 6. Summary statistics for numeric columns
    print("\n Summary statistics:")
    print(df.describe())

    # 7. Count of unique values per column
    print("\n Unique value counts per column:")
    print(df.nunique())

    # 8. Count of each class in target variable (Revenue)
    print("\n Target variable class distribution:")
    print(df['Revenue'].value_counts())


def plot_figures(df: pd.DataFrame) -> None:
    """Show every exploratory figure, one window at a time."""
    # Imported here rather than at module level so the statistical helpers
    # stay importable (and testable) on machines without a plotting stack.
    import matplotlib.pyplot as plt
    import seaborn as sns

    # One call is enough; the theme is global state.
    sns.set(style="whitegrid")

    # Fig 0: Correlation heatmap for numeric features
    plt.figure(figsize=(12, 10))
    numeric_df = df.select_dtypes(include='number')  # numeric columns only
    corr_matrix = numeric_df.corr()
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm",
                square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
    plt.title('Fig 0: Heatmap of Feature Correlations')
    plt.tight_layout()
    plt.show()

    # Fig 1: Countplot - number of sessions per month by Revenue
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x='Month', hue='Revenue', palette='Set2',
                  order=MONTH_ORDER)
    plt.title('Fig 1: Sessions per Month by Revenue')
    plt.ylabel('Number of Sessions')
    plt.xlabel('Month')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Fig 2: Box plot of PageValues by Revenue
    plt.figure(figsize=(8, 6))
    # showmeans=True draws the mean markers that the title promises.
    sns.boxplot(data=df, x='Revenue', y='PageValues', width=0.5, fliersize=3,
                showmeans=True)
    plt.title('Fig 2: PageValues by Revenue (with Means)')
    plt.ylabel('Page Values')
    plt.xlabel('Revenue')
    plt.tight_layout()
    plt.show()

    # Fig 3: Histogram of Administrative Duration
    plt.figure(figsize=(10, 6))
    sns.histplot(df['Administrative_Duration'], bins=30, kde=False,
                 color='orange')
    plt.title('Fig 3: Distribution of Administrative Duration')
    plt.xlabel('Administrative Duration')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

    # Fig 4: Scatter plot of PageValues vs ExitRates
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x='PageValues', y='ExitRates', hue='Revenue',
                    palette='coolwarm', alpha=0.6)
    plt.title('Fig 4: PageValues vs ExitRates')
    plt.xlabel('Page Values')
    plt.ylabel('Exit Rates')
    plt.tight_layout()
    plt.show()

    # Pie chart: Revenue distribution
    revenue_counts = df['Revenue'].value_counts()
    plt.figure(figsize=(6, 6))
    plt.pie(revenue_counts, labels=revenue_counts.index, autopct='%1.1f%%',
            startangle=140, colors=['#66b3ff', '#ff9999'])
    plt.title('Pie Chart: Revenue Distribution')
    plt.axis('equal')
    plt.tight_layout()
    plt.show()

    # Box plot: ProductRelated_Duration vs Revenue
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=df, x='Revenue', y='ProductRelated_Duration',
                hue='Revenue', palette='Set2', width=0.5, fliersize=3,
                dodge=False)
    plt.title('Box Plot: ProductRelated_Duration vs Revenue')
    plt.xlabel('Revenue')
    plt.ylabel('Product Related Duration')
    plt.tight_layout()
    plt.show()


def two_sample_z_test(sample_a: Iterable[float],
                      sample_b: Iterable[float]) -> Tuple[float, float]:
    """Return ``(z, p)`` for a two-sided, two-sample z-test of equal means.

    The population variances are estimated by the sample variances
    (``ddof=1``), which is the usual large-sample approximation. The p-value
    comes from the standard normal distribution, not from a t distribution.
    Missing values are dropped before computing anything.

    Returns ``(nan, nan)`` when the statistic is undefined: fewer than two
    observations in either sample, or zero variance in both samples.
    """
    a = pd.Series(sample_a, dtype="float64").dropna()
    b = pd.Series(sample_b, dtype="float64").dropna()
    if len(a) < 2 or len(b) < 2:
        return float("nan"), float("nan")

    std_err = math.sqrt(a.var(ddof=1) / len(a) + b.var(ddof=1) / len(b))
    if std_err == 0:
        return float("nan"), float("nan")

    z_stat = (a.mean() - b.mean()) / std_err
    p_value = 2.0 * stats.norm.sf(abs(z_stat))  # two-sided tail probability
    return float(z_stat), float(p_value)


def format_p_value(p_value: float) -> str:
    """Format *p_value* with four significant digits.

    Fixed two-decimal formatting renders every p-value below 0.005 as
    ``0.00``; general format switches to scientific notation instead.
    """
    return f"{p_value:.4g}"


def format_test_report(title: str, stat_label: str, statistic: float,
                       p_value: float, alpha: float = 0.05) -> str:
    """Build the multi-line console report for one hypothesis test.

    The result line reports a significant difference only when
    ``p_value < alpha``; a NaN p-value therefore reads as not significant.
    """
    if p_value < alpha:
        verdict = "Significant difference ✅"
    else:
        verdict = "No significant difference ❌"
    return "\n".join([
        f"\n📊 {title}",
        f"{stat_label}: {statistic:.2f}",
        f"P-Value: {format_p_value(p_value)}",
        f"Result: {verdict}",
    ])


def run_hypothesis_tests(df: pd.DataFrame, alpha: float = 0.05) -> None:
    """Compare BounceRates between Revenue=True and Revenue=False sessions."""
    print("\n-------------------")
    print("🔎 Hypothesis Testing")
    print("-------------------")

    # Separate BounceRates for Revenue True and False. Dropping NaNs once
    # here guarantees both tests see exactly the same observations.
    bounce_rates = df['BounceRates']
    revenue_true = bounce_rates[df['Revenue'].eq(True)].dropna()
    revenue_false = bounce_rates[df['Revenue'].eq(False)].dropna()

    # ------------------- Z-Test -------------------
    # Large-sample test: sample variances stand in for the population
    # variances and the p-value is taken from the normal distribution.
    z_stat, z_pval = two_sample_z_test(revenue_true, revenue_false)
    print(format_test_report(
        "Z-Test (BounceRates for Revenue vs No Revenue)",
        "Z-Statistic", z_stat, z_pval, alpha))

    # ------------------- T-Test -------------------
    # Welch's t-test (equal_var=False): independent samples, variances not
    # assumed equal.
    t_stat, t_pval = stats.ttest_ind(revenue_true, revenue_false,
                                     equal_var=False)
    print(format_test_report(
        "T-Test (BounceRates for Revenue vs No Revenue)",
        "T-Statistic", t_stat, t_pval, alpha))


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Run the full analysis; ``argv[0]``, if given, overrides the CSV path."""
    args = sys.argv[1:] if argv is None else list(argv)
    csv_path = Path(args[0]) if args else DEFAULT_CSV_PATH

    df = load_dataset(csv_path)
    describe_dataset(df)
    plot_figures(df)
    run_hypothesis_tests(df)
    return 0


if __name__ == "__main__":
    sys.exit(main())