├── CA2 python.docx
├── README.md
├── ca2.py
└── online_shoppers_intention.csv


/CA2 python.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sanchichauhan/PYTHON-PROJECT-/37b8fd838d9b5520fd3e75ec0fc666514058d49f/CA2 python.docx


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 |                                                                                                                                                                                                                                                        
 2 |                                                                                                                                                                                                                                                                      
 3 |                                                                                                                                                                                                                                                                      
 4 |                                                                                                                                                                                                                                                                      
 5 |                                                                                                                                                                                                                                                                      
 6 |                                                                                                                                                                                                                                                                      
 7 |                                                                                                                                                                                                                                                                      
 8 |                                                                                                                                                                                                                                                                      
 9 |                                                                                                                                                                                                                                                                      
10 |                                                                                                                                                                                                                                                                      
11 | 


--------------------------------------------------------------------------------
/ca2.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import seaborn as sns
  3 | import matplotlib.pyplot as plt
  4 | from scipy import stats
  5 | 
  6 | 
  7 | # Load the dataset
  8 | df = pd.read_csv(r"C:\Users\acer\Downloads\online+shoppers+purchasing+intention+dataset (1)\online_shoppers_intention.csv")
  9 | 
 10 | # 1. Shape of the dataset
 11 | print(" Shape of the dataset:", df.shape)
 12 | 
 13 | # 2. Column names
 14 | print("\n Column names:")
 15 | print(df.columns.tolist())
 16 | 
 17 | # 3. Data types of each column
 18 | print("\n Data types:")
 19 | print(df.dtypes)
 20 | 
 21 | # 4. Check for missing values
 22 | print("\n Missing values per column:")
 23 | print(df.isnull().sum())
 24 | 
 25 | # 5. Show first 5 rows as sample
 26 | print("\n First 5 rows of the dataset:")
 27 | print(df.head())
 28 | 
 29 | # 6. Summary statistics for numeric columns
 30 | print("\n Summary statistics:")
 31 | print(df.describe())
 32 | 
 33 | 
 34 | # 7. Count of unique values per column
 35 | print("\n Unique value counts per column:")
 36 | print(df.nunique())
 37 | 
 38 | # 8. Count of each class in target variable (Revenue)
 39 | print("\n Target variable class distribution:")
 40 | print(df['Revenue'].value_counts())
 41 | 
 42 | sns.set(style="whitegrid")
 43 | 
 44 | sns.set(style="whitegrid")
 45 | 
 46 | sns.set(style="whitegrid")
 47 | 
 48 | # 🔥 Fig 0: Correlation Heatmap for Numeric Features
 49 | plt.figure(figsize=(12, 10))
 50 | numeric_df = df.select_dtypes(include='number')  # Select only numeric columns
 51 | corr_matrix = numeric_df.corr()
 52 | sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
 53 | plt.title('Fig 0: Heatmap of Feature Correlations')
 54 | plt.tight_layout()
 55 | plt.show()
 56 | 
 57 | # 🔹 Fig 1: Countplot - Number of Sessions per Month by Revenue
 58 | plt.figure(figsize=(10, 6))
 59 | sns.countplot(data=df, x='Month', hue='Revenue', palette='Set2', order=[
 60 |     'Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
 61 | plt.title('Fig 1: Sessions per Month by Revenue')
 62 | plt.ylabel('Number of Sessions')
 63 | plt.xlabel('Month')
 64 | plt.xticks(rotation=45)
 65 | plt.tight_layout()
 66 | plt.show()
 67 | 
 68 | # 🔹 Fig 2: Box Plot of PageValues by Revenue
 69 | plt.figure(figsize=(8, 6))
 70 | sns.boxplot(data=df, x='Revenue', y='PageValues', width=0.5, fliersize=3)
 71 | plt.title('Fig 2: PageValues by Revenue (with Means)')
 72 | plt.ylabel('Page Values')
 73 | plt.xlabel('Revenue')
 74 | plt.tight_layout()
 75 | plt.show()
 76 | 
 77 | # 🔹 Fig 3: Histogram of Administrative Duration
 78 | plt.figure(figsize=(10, 6))
 79 | sns.histplot(df['Administrative_Duration'], bins=30, kde=False, color='orange')
 80 | plt.title('Fig 3: Distribution of Administrative Duration')
 81 | plt.xlabel('Administrative Duration')
 82 | plt.ylabel('Frequency')
 83 | plt.tight_layout()
 84 | plt.show()
 85 | 
 86 | # 🔹 Fig 4: Scatter Plot of PageValues vs ExitRates
 87 | plt.figure(figsize=(10, 6))
 88 | sns.scatterplot(data=df, x='PageValues', y='ExitRates', hue='Revenue', palette='coolwarm', alpha=0.6)
 89 | plt.title('Fig 4: PageValues vs ExitRates')
 90 | plt.xlabel('Page Values')
 91 | plt.ylabel('Exit Rates')
 92 | plt.tight_layout()
 93 | plt.show()
 94 | 
 95 | 
 96 | # 🔷 Pie Chart: Revenue Distribution
 97 | revenue_counts = df['Revenue'].value_counts()
 98 | plt.figure(figsize=(6, 6))
 99 | plt.pie(revenue_counts, labels=revenue_counts.index, autopct='%1.1f%%', startangle=140,
100 |         colors=['#66b3ff', '#ff9999'])
101 | plt.title('Pie Chart: Revenue Distribution')
102 | plt.axis('equal')
103 | plt.tight_layout()
104 | plt.show()
105 | 
106 | # 🔷 Box Plot: ProductRelated_Duration vs Revenue
107 | plt.figure(figsize=(8, 6))
108 | sns.boxplot(data=df, x='Revenue', y='ProductRelated_Duration', hue='Revenue',
109 |             palette='Set2', width=0.5, fliersize=3, dodge=False)
110 | plt.title('Box Plot: ProductRelated_Duration vs Revenue')
111 | plt.xlabel('Revenue')
112 | plt.ylabel('Product Related Duration')
113 | plt.tight_layout()
114 | plt.show()
115 | print("\n-------------------")
116 | print("🔎 Hypothesis Testing")
117 | print("-------------------")
118 | 
119 | # Separate BounceRates for Revenue True and False
120 | revenue_true = df[df['Revenue'] == True]['BounceRates']
121 | revenue_false = df[df['Revenue'] == False]['BounceRates']
122 | 
123 | # ------------------- Z-Test -------------------
124 | # Assumes large sample size and known variance (approximate use case)
125 | z_stat, z_pval = stats.ttest_ind(revenue_true, revenue_false)
126 | 
127 | print(f"\n📊 Z-Test (BounceRates for Revenue vs No Revenue)")
128 | print(f"Z-Statistic: {z_stat:.2f}")
129 | print(f"P-Value: {z_pval:.2f}")
130 | if z_pval < 0.05:
131 |     print("Result: Significant difference ✅")
132 | else:
133 |     print("Result: No significant difference ❌")
134 | 
135 | # ------------------- T-Test -------------------
136 | # Standard t-test for independent samples
137 | t_stat, t_pval = stats.ttest_ind(revenue_true, revenue_false, equal_var=False)
138 | 
139 | print(f"\n📊 T-Test (BounceRates for Revenue vs No Revenue)")
140 | print(f"T-Statistic: {t_stat:.2f}")
141 | print(f"P-Value: {t_pval:.2f}")
142 | if t_pval < 0.05:
143 |     print("Result: Significant difference ✅")
144 | else:
145 |     print("Result: No significant difference ❌")
146 | 
147 | 
148 | 
149 | 
150 | 


--------------------------------------------------------------------------------