├── README.md └── PANDASPROJECT.py /README.md: -------------------------------------------------------------------------------- 1 | 📊 Analyzing Academic Trends Among VIT Vellore Students Using Python 2 | This project explores academic performance trends among students at VIT Vellore using Python-based data analysis techniques. The aim is to gain insights into CGPA distributions across different departments and academic years, identify top-performing branches, visualize overall student performance, and detect anomalies in the data. 3 | 4 | 🔍 Key Features: 5 | Data Cleaning & Preprocessing using Pandas and NumPy 6 | 7 | Exploratory Data Analysis (EDA) to uncover patterns in student CGPAs 8 | 9 | Visualizations with Matplotlib and Seaborn to represent trends and outliers 10 | 11 | Branch-wise and Year-wise Analysis to compare performance across departments 12 | 13 | Insightful Findings that could support academic planning and strategy 14 | 15 | 📁 Technologies Used: 16 | Python 17 | 18 | Pandas 19 | 20 | NumPy 21 | 22 | Matplotlib 23 | 24 | Seaborn 25 | 26 | 📌 Project Goals: 27 | Understand CGPA distribution patterns 28 | 29 | Identify high- and low-performing departments 30 | 31 | Visualize student performance over time 32 | 33 | Detect outliers and anomalies in the academic data 34 | 35 | -------------------------------------------------------------------------------- /PANDASPROJECT.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | # ---------------------------------------- 7 | # 1. 
# Load and Clean the Dataset
# ----------------------------------------
# Location of the input CSV. Kept in one named constant so the script can be
# pointed at another copy of the dataset without hunting through the code.
DATASET_PATH = "C:/Users/rajde/Desktop/dataset.csv"

# Columns (after the name normalisation below) that this script reads.
REQUIRED_COLUMNS = ('name', 'branch', 'year', 'cgpa')

df = pd.read_csv(DATASET_PATH)

# Clean column names (lowercase, strip spaces)
df.columns = df.columns.str.strip().str.lower()

# Display basic information
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()
print("\nFirst 5 rows:")
print(df.head())

# Fail early, with a readable message, if an expected column is absent rather
# than raising a bare KeyError part-way through the plots.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Dataset is missing required column(s): {missing_columns}; "
        f"found columns: {list(df.columns)}"
    )

# ----------------------------------------
# 1A. Exploratory Data Analysis (EDA)
# ----------------------------------------

# Shape of the dataset
print(f"\nDataset contains {df.shape[0]} rows and {df.shape[1]} columns.\n")

# Summary statistics
print("Summary statistics:")
print(df.describe(include='all'))

# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())

# Check unique values in categorical columns
print("\nUnique values in 'branch' and 'year':")
print("Branches:", df['branch'].unique())
print("Years:", df['year'].unique())

# Countplot of students per year
plt.figure(figsize=(8, 4))
sns.countplot(x='year', data=df, palette='Set2')
plt.title("Number of Students per Year")
plt.xlabel("Academic Year")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

# Countplot of students per branch (most populous branch first)
plt.figure(figsize=(10, 4))
sns.countplot(y='branch', data=df, palette='Set3', order=df['branch'].value_counts().index)
plt.title("Number of Students per Branch")
plt.xlabel("Count")
plt.ylabel("Branch")
plt.tight_layout()
plt.show()

# ----------------------------------------
# 2.
# Data Cleansing
# ----------------------------------------
# Coerce CGPA to numbers first: any non-numeric cell becomes NaN and is then
# dropped below. Without this, a single stray string makes the column object
# dtype and the range check raises TypeError instead of filtering the row.
df['cgpa'] = pd.to_numeric(df['cgpa'], errors='coerce')

# Remove rows with missing CGPA or invalid entries
df = df.dropna(subset=['cgpa', 'branch', 'year'])
df = df[df['cgpa'].between(0, 10)]  # Valid CGPA range (bounds inclusive)

# ----------------------------------------
# 3. Heatmap: Average CGPA by Year and Branch
# ----------------------------------------
# Rows are years, columns are branches; year/branch pairs with no students
# come out as NaN after unstack().
grouped = df.groupby(['year', 'branch'])['cgpa'].mean().unstack()

plt.figure(figsize=(12, 6))
sns.heatmap(grouped, annot=True, fmt=".2f", cmap='YlGnBu', linewidths=0.5, linecolor='gray')
plt.title("Heatmap: Average CGPA Across Years and Branches")
plt.ylabel("Academic Year")
plt.xlabel("Branch")
plt.tight_layout()
plt.show()

# ----------------------------------------
# 4. Top Performing Branches
# ----------------------------------------
top_branches = df.groupby('branch')['cgpa'].mean().sort_values(ascending=False)
print("\nTop Performing Branches by Average CGPA:")
print(top_branches)

plt.figure(figsize=(10, 5))
sns.barplot(x=top_branches.index, y=top_branches.values, palette='viridis')
plt.xticks(rotation=45)
plt.ylabel("Average CGPA")
plt.title("Branch-wise Average CGPA")
plt.tight_layout()
plt.show()

# ----------------------------------------
# 5.
# Visualize CGPA Trends
# ----------------------------------------

# Histogram of CGPA
plt.figure(figsize=(10, 5))
sns.histplot(df['cgpa'], bins=20, kde=True, color='skyblue')
plt.title("Distribution of CGPA")
plt.xlabel("CGPA")
plt.ylabel("Number of Students")
plt.tight_layout()
plt.show()

# Boxplot by Branch
plt.figure(figsize=(12, 6))
sns.boxplot(x='branch', y='cgpa', data=df)
plt.xticks(rotation=45)
plt.title("CGPA Distribution by Branch")
plt.tight_layout()
plt.show()

# Boxplot by Year
plt.figure(figsize=(10, 6))
sns.boxplot(x='year', y='cgpa', data=df)
plt.title("CGPA Distribution by Year")
plt.tight_layout()
plt.show()

# ----------------------------------------
# 6. Detect Outliers
# ----------------------------------------
# Tukey fences: anything more than 1.5 * IQR outside the quartiles is flagged.
Q1 = df['cgpa'].quantile(0.25)
Q3 = df['cgpa'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df[(df['cgpa'] < lower_bound) | (df['cgpa'] > upper_bound)]
print(f"\nOutliers Detected (CGPA < {lower_bound:.2f} or > {upper_bound:.2f}):")
print(outliers[['name', 'branch', 'year', 'cgpa']])

# ----------------------------------------
# 7. Summary Report
# ----------------------------------------

# Top 10 Overall Performers
top10_overall = df.sort_values(by='cgpa', ascending=False).head(10)
print("\nTop 10 Performers Overall:")
print(top10_overall[['name', 'branch', 'year', 'cgpa']])

# Top 10 per Branch
print("\nTop 10 Performers per Branch:")
# Sort once (branch ascending, CGPA descending) and keep the first 10 rows of
# each branch. This avoids groupby().apply(), whose handling of the grouping
# column is deprecated in recent pandas; once the grouping column is excluded
# from apply(), the 'branch' column selected below would be missing.
top_per_branch = (
    df.sort_values(by=['branch', 'cgpa'], ascending=[True, False])
    .groupby('branch')
    .head(10)
    .reset_index(drop=True)
)
print(top_per_branch[['name', 'branch', 'year', 'cgpa']])

# ----------------------------------------
# 8.
# Donut Chart – Student Distribution by Branch
# ----------------------------------------
branch_counts = df['branch'].value_counts()
n_branches = len(branch_counts)
# One pastel colour per branch (the default palette is simply truncated).
colors = sns.color_palette('pastel')[:n_branches]

donut_fig, donut_ax = plt.subplots(figsize=(8, 8))
donut_ax.pie(
    branch_counts,
    labels=branch_counts.index,
    colors=colors,
    startangle=90,
    wedgeprops=dict(width=0.4),  # wedge width < 1 leaves the hole of the donut
    autopct='%1.1f%%',
)
donut_ax.set_title("Distribution of Students by Branch")
donut_ax.set_aspect('equal')
donut_fig.tight_layout()
plt.show()

# ----------------------------------------
# 9. Correlation Heatmap (Numeric Features)
# ----------------------------------------
# Only numeric columns take part in the correlation matrix.
numeric_corr = df.select_dtypes(include=np.number).corr()

corr_fig, corr_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax)
corr_ax.set_title("Heatmap: Correlation Between Numeric Features")
corr_fig.tight_layout()
plt.show()