├── README.md
└── PANDASPROJECT.py


/README.md:
--------------------------------------------------------------------------------
 1 | 📊 Analyzing Academic Trends Among VIT Vellore Students Using Python
 2 | This project explores academic performance trends among students at VIT Vellore using Python-based data analysis techniques. The aim is to gain insights into CGPA distributions across different departments and academic years, identify top-performing branches, visualize overall student performance, and detect anomalies in the data.
 3 | 
 4 | 🔍 Key Features:
 5 | Data Cleaning & Preprocessing using Pandas and NumPy
 6 | 
 7 | Exploratory Data Analysis (EDA) to uncover patterns in student CGPAs
 8 | 
 9 | Visualizations with Matplotlib and Seaborn to represent trends and outliers
10 | 
11 | Branch-wise and Year-wise Analysis to compare performance across departments
12 | 
13 | Insightful Findings that could support academic planning and strategy
14 | 
15 | 📁 Technologies Used:
16 | Python
17 | 
18 | Pandas
19 | 
20 | NumPy
21 | 
22 | Matplotlib
23 | 
24 | Seaborn
25 | 
26 | 📌 Project Goals:
27 | Understand CGPA distribution patterns
28 | 
29 | Identify high- and low-performing departments
30 | 
31 | Visualize student performance over time
32 | 
33 | Detect outliers and anomalies in the academic data
34 | 
35 | 


--------------------------------------------------------------------------------
/PANDASPROJECT.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | import seaborn as sns
  5 | 
  6 | # ----------------------------------------
  7 | # 1. Load and Clean the Dataset
  8 | # ----------------------------------------
  9 | df = pd.read_csv("C:/Users/rajde/Desktop/dataset.csv")
 10 | 
 11 | # Clean column names (lowercase, strip spaces)
 12 | df.columns = df.columns.str.strip().str.lower()
 13 | 
 14 | # Display basic information
 15 | print("Dataset Info:")
 16 | print(df.info())
 17 | print("\nFirst 5 rows:")
 18 | print(df.head())
 19 | 
 20 | # ----------------------------------------
 21 | # 1A. Exploratory Data Analysis (EDA)
 22 | # ----------------------------------------
 23 | 
 24 | # Shape of the dataset
 25 | print(f"\nDataset contains {df.shape[0]} rows and {df.shape[1]} columns.\n")
 26 | 
 27 | # Summary statistics
 28 | print("Summary statistics:")
 29 | print(df.describe(include='all'))
 30 | 
 31 | # Check for missing values
 32 | print("\nMissing values in each column:")
 33 | print(df.isnull().sum())
 34 | 
 35 | # Check unique values in categorical columns
 36 | print("\nUnique values in 'branch' and 'year':")
 37 | print("Branches:", df['branch'].unique())
 38 | print("Years:", df['year'].unique())
 39 | 
 40 | # Countplot of students per year
 41 | plt.figure(figsize=(8, 4))
 42 | sns.countplot(x='year', data=df, palette='Set2')
 43 | plt.title("Number of Students per Year")
 44 | plt.xlabel("Academic Year")
 45 | plt.ylabel("Count")
 46 | plt.tight_layout()
 47 | plt.show()
 48 | 
 49 | # Countplot of students per branch
 50 | plt.figure(figsize=(10, 4))
 51 | sns.countplot(y='branch', data=df, palette='Set3', order=df['branch'].value_counts().index)
 52 | plt.title("Number of Students per Branch")
 53 | plt.xlabel("Count")
 54 | plt.ylabel("Branch")
 55 | plt.tight_layout()
 56 | plt.show()
 57 | 
 58 | # ----------------------------------------
 59 | # 2. Data Cleansing
 60 | # ----------------------------------------
 61 | # Remove rows with missing CGPA or invalid entries
 62 | df = df.dropna(subset=['cgpa', 'branch', 'year'])
 63 | df = df[df['cgpa'].between(0, 10)]  # Valid CGPA range
 64 | 
 65 | # ----------------------------------------
 66 | # 3. Heatmap: Average CGPA by Year and Branch
 67 | # ----------------------------------------
 68 | grouped = df.groupby(['year', 'branch'])['cgpa'].mean().unstack()
 69 | 
 70 | plt.figure(figsize=(12, 6))
 71 | sns.heatmap(grouped, annot=True, fmt=".2f", cmap='YlGnBu', linewidths=0.5, linecolor='gray')
 72 | plt.title("Heatmap: Average CGPA Across Years and Branches")
 73 | plt.ylabel("Academic Year")
 74 | plt.xlabel("Branch")
 75 | plt.tight_layout()
 76 | plt.show()
 77 | 
 78 | # ----------------------------------------
 79 | # 4. Top Performing Branches
 80 | # ----------------------------------------
 81 | top_branches = df.groupby('branch')['cgpa'].mean().sort_values(ascending=False)
 82 | print("\nTop Performing Branches by Average CGPA:")
 83 | print(top_branches)
 84 | 
 85 | plt.figure(figsize=(10, 5))
 86 | sns.barplot(x=top_branches.index, y=top_branches.values, palette='viridis')
 87 | plt.xticks(rotation=45)
 88 | plt.ylabel("Average CGPA")
 89 | plt.title("Branch-wise Average CGPA")
 90 | plt.tight_layout()
 91 | plt.show()
 92 | 
 93 | # ----------------------------------------
 94 | # 5. Visualize CGPA Trends
 95 | # ----------------------------------------
 96 | 
 97 | # Histogram of CGPA
 98 | plt.figure(figsize=(10, 5))
 99 | sns.histplot(df['cgpa'], bins=20, kde=True, color='skyblue')
100 | plt.title("Distribution of CGPA")
101 | plt.xlabel("CGPA")
102 | plt.ylabel("Number of Students")
103 | plt.tight_layout()
104 | plt.show()
105 | 
106 | # Boxplot by Branch
107 | plt.figure(figsize=(12, 6))
108 | sns.boxplot(x='branch', y='cgpa', data=df)
109 | plt.xticks(rotation=45)
110 | plt.title("CGPA Distribution by Branch")
111 | plt.tight_layout()
112 | plt.show()
113 | 
114 | # Boxplot by Year
115 | plt.figure(figsize=(10, 6))
116 | sns.boxplot(x='year', y='cgpa', data=df)
117 | plt.title("CGPA Distribution by Year")
118 | plt.tight_layout()
119 | plt.show()
120 | 
# ----------------------------------------
# 6. Detect Outliers
# ----------------------------------------
# IQR fences: a CGPA counts as an outlier when it lies more than 1.5 * IQR
# below the first quartile or above the third quartile.
cgpa_values = df['cgpa']
Q1, Q3 = cgpa_values.quantile(0.25), cgpa_values.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

below_fence = cgpa_values < lower_bound
above_fence = cgpa_values > upper_bound
outliers = df[below_fence | above_fence]

print(f"\nOutliers Detected (CGPA < {lower_bound:.2f} or > {upper_bound:.2f}):")
outlier_columns = ['name', 'branch', 'year', 'cgpa']
print(outliers[outlier_columns])
133 | 
# ----------------------------------------
# 7. Summary Report
# ----------------------------------------
# Columns shown in both rankings below
report_columns = ['name', 'branch', 'year', 'cgpa']

# Top 10 Overall Performers: highest CGPA first
top10_overall = df.sort_values(by='cgpa', ascending=False).head(10)
print("\nTop 10 Performers Overall:")
print(top10_overall[report_columns])

# Top 10 per Branch
# Sort once (branch ascending, CGPA descending) and keep the first 10 rows of
# each branch. This replaces groupby(...).apply(...): recent pandas releases
# deprecate handing the grouping column to the applied function, and without
# it 'branch' would be missing from the result and the column selection below
# would fail. groupby().head() always keeps every original column.
print("\nTop 10 Performers per Branch:")
top_per_branch = (
    df.sort_values(by=['branch', 'cgpa'], ascending=[True, False])
    .groupby('branch')
    .head(10)
    .reset_index(drop=True)
)
print(top_per_branch[report_columns])
147 | 
# ----------------------------------------
# 8. Donut Chart – Student Distribution by Branch
# ----------------------------------------
# Number of students in each branch, largest first
branch_counts = df['branch'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
# One pastel colour per branch (as many as the palette provides)
colors = sns.color_palette('pastel')[:len(branch_counts)]
# A wedge width below 1 leaves the centre empty, giving the donut shape
ax.pie(
    branch_counts,
    labels=branch_counts.index,
    colors=colors,
    startangle=90,
    wedgeprops={'width': 0.4},
    autopct='%1.1f%%',
)
ax.set_title("Distribution of Students by Branch")
ax.set_aspect('equal')
fig.tight_layout()
plt.show()
161 | 
# ----------------------------------------
# 9. Correlation Heatmap (Numeric Features)
# ----------------------------------------
# Pairwise correlation matrix restricted to the numeric columns
numeric_columns = df.select_dtypes(include=np.number)
numeric_corr = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Heatmap: Correlation Between Numeric Features")
fig.tight_layout()
plt.show()
171 | 


--------------------------------------------------------------------------------