"""Exploratory data analysis of an electric-vehicle population CSV.

Repository layout (from the original dump):
    1.py       -- this script
    README.md  -- "Python-project-": Data-Visualization

The script runs top to bottom as a flat sequence of steps:

1. load the CSV and print basic information about it;
2. drop rows with missing values in the required columns, fill the
   remaining gaps, remove duplicate rows and write ``cleaned_dataset.csv``;
3. drop two columns and shorten two column names (``EVT``, ``LD``);
4. print IQR-based outliers of ``LD`` and show a series of plots.
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): absolute, machine-specific input path -- adjust before running
# on another machine.
DATA_PATH = "C:\\Users\\HP\\OneDrive\\Desktop\\toolbox\\Electric_Vehicle_Population_Data1.csv"
CLEANED_PATH = "cleaned_dataset.csv"

# Rows missing a value in any of these columns are discarded.
REQUIRED_COLUMNS = [
    'Model', 'County', 'City', 'State', 'Postal Code',
    'Electric Vehicle Type', 'Base MSRP', 'Legislative District',
    'DOL Vehicle ID', 'Vehicle Location', 'Electric Utility',
    '2020 Census Tract',
]
CAFV_COLUMN = 'Clean Alternative Fuel Vehicle (CAFV) Eligibility'


def _show_ranked_barplot(ranked, *, title, xlabel, ylabel, palette, figsize):
    """Show a horizontal bar chart of *ranked* (index on y, values on x).

    ``hue`` is set to the same labels as ``y`` with the legend disabled, which
    is the form seaborn >= 0.13 expects when a ``palette`` is given; passing
    ``palette`` without ``hue`` is deprecated there.
    """
    plt.figure(figsize=figsize)
    sns.barplot(x=ranked.values, y=ranked.index, hue=ranked.index,
                palette=palette, legend=False)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.show()


# ---------------------------------------------------------------------------
# Load and explore
# ---------------------------------------------------------------------------
dt = pd.read_csv(DATA_PATH, encoding="unicode_escape")
print(dt)

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print().
print("Information:")
dt.info()
print("Description: \n", dt.describe())

# ---------------------------------------------------------------------------
# Handle missing values and duplicates
# ---------------------------------------------------------------------------
print("Missing values before handling:\n", dt.isnull().sum())
dt = dt.dropna(subset=REQUIRED_COLUMNS)
dt['Electric Range'] = dt['Electric Range'].fillna(0)
# NOTE(review): 'Base MSRP' is in REQUIRED_COLUMNS, so no missing values remain
# to fill here; the original's fillna(0) on that column was a no-op and has
# been removed. Confirm whether dropping (current behavior) or filling with 0
# was intended.
dt[CAFV_COLUMN] = dt[CAFV_COLUMN].fillna("Unknown")
print("missing values ", dt.isnull().sum())

dt = dt.drop_duplicates()
print(dt)

# ---------------------------------------------------------------------------
# Basic inspection and export of the cleaned data
# ---------------------------------------------------------------------------
print("1st 12 rows of Dataset: \n", dt.head(12))
print("Last 12 rows of Dataset: \n", dt.tail(12))
print("Shape of Dataset: \n", dt.shape)
print("Column of Dataset: \n", dt.columns)
print("Datatype of Dataset: \n", dt.dtypes)
dt.to_csv(CLEANED_PATH, index=False)
print("New Dataset Succesfully")

# ---------------------------------------------------------------------------
# Drop unused columns and shorten column names
# ---------------------------------------------------------------------------
# drop() without inplace returns the new frame; the in-place form returns
# None, which the original then printed.
dt = dt.drop(columns=['Electric Utility', '2020 Census Tract'])
print("Information:")
dt.info()

print(dt.columns)
# Strip stray whitespace first so the rename keys below match exactly.
dt.columns = dt.columns.str.strip()
dt = dt.rename(columns={'Electric Vehicle Type': 'EVT',
                        'Legislative District': 'LD'})
print(dt.columns)

# Dropping columns can make previously distinct rows identical.
dt = dt.drop_duplicates()

# ---------------------------------------------------------------------------
# Top EV locations
# ---------------------------------------------------------------------------
# 'City', 'County' and 'Postal Code' were already cleaned of missing values by
# the REQUIRED_COLUMNS dropna above, so no further dropna is needed here.
top_cities = dt['City'].value_counts().head(10)
print("Top 10 Cities:\n", top_cities)

# Number of vehicles per model year
plt.figure(figsize=(10, 6))
sns.countplot(x='Model Year', hue='Model Year', data=dt,
              palette='coolwarm', legend=False)
plt.title("Count Plot")
plt.show()

# Electric range against legislative district
sns.scatterplot(x='Electric Range', y='LD', data=dt, hue='LD',
                palette='coolwarm')
plt.title("Scatter Plot")
plt.show()

# ---------------------------------------------------------------------------
# Outlier detection on a numerical column (1.5 * IQR rule)
# ---------------------------------------------------------------------------
column = 'LD'
plt.figure(figsize=(8, 5))
sns.boxplot(x=dt[column])
plt.title("Boxplot for outliers Detection")
# Show this figure on its own instead of leaving it open until the next show().
plt.show()

Q1 = dt[column].quantile(0.25)
print('Q1:', Q1)
Q3 = dt[column].quantile(0.75)
print('Q3:', Q3)
IQR = Q3 - Q1
print('IQR:', IQR)

lower_bound = Q1 - 1.5 * IQR
print('lower bound:', lower_bound)
upper_bound = Q3 + 1.5 * IQR
print('upper bound:', upper_bound)

# Rows whose value lies outside [lower_bound, upper_bound]
outliers = dt[(dt[column] < lower_bound) | (dt[column] > upper_bound)]
print("Outliers detected:\n", outliers)

# ---------------------------------------------------------------------------
# Univariate plots
# ---------------------------------------------------------------------------
# Boxplot of DOL Vehicle ID
plt.figure(figsize=(8, 5))
sns.boxplot(x=dt['DOL Vehicle ID'], color='teal')
plt.title("Boxplot of DOL Vehicle ID")
plt.xlabel("DOL Vehicle ID")
plt.grid(True)
plt.show()

# Distribution of Electric Range
plt.figure(figsize=(8, 5))
sns.histplot(dt['Electric Range'], bins=30, kde=True, color='purple')
plt.title("Distribution of Electric Range")
plt.xlabel("Electric Range")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

# City-wise EV count (top 10)
_show_ranked_barplot(
    dt['City'].value_counts().head(10),
    title="Top 10 Cities with Most EVs",
    xlabel="Number of EVs",
    ylabel="City",
    palette='magma',
    figsize=(10, 6),
)

# ---------------------------------------------------------------------------
# Relationships between numeric columns
# ---------------------------------------------------------------------------
# Correlation heatmap for numeric columns
plt.figure(figsize=(10, 6))
sns.heatmap(dt.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

# Pairplot for numerical features
num_cols = dt.select_dtypes(include=['int64', 'float64']).columns
sns.pairplot(dt[num_cols])
plt.suptitle("Pairplot of Numerical Features", y=1.02)
plt.show()

# ---------------------------------------------------------------------------
# Grouped / ranked plots
# ---------------------------------------------------------------------------
# Barplot for top 10 Postal Codes (vertical bars, rotated tick labels)
top_postal = dt['Postal Code'].value_counts().head(10)
plt.figure(figsize=(10, 5))
sns.barplot(x=top_postal.index, y=top_postal.values, hue=top_postal.index,
            palette='magma', legend=False)
plt.title("Top 10 Postal Codes")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Boxplot of Electric Range by City (top 10 cities)
top_city_names = dt['City'].value_counts().head(10).index
filtered_dt = dt[dt['City'].isin(top_city_names)]
plt.figure(figsize=(12, 6))
sns.boxplot(x='City', y='Electric Range', data=filtered_dt)
plt.title("Electric Range by City")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# EV Type Distribution in Top 5 Counties
top5_counties = dt['County'].value_counts().head(5).index
filtered = dt[dt['County'].isin(top5_counties)]
plt.figure(figsize=(13, 9))
sns.countplot(data=filtered, x='County', hue='EVT', palette='Set1')
plt.title("EV Type Distribution in Top 5 Counties")
plt.xlabel("County")
plt.ylabel("Count")
plt.legend(title='EV Type')
plt.show()

# Most Popular EV Models
_show_ranked_barplot(
    dt['Model'].value_counts().head(10),
    title="Top 10 Most Popular EV Models",
    xlabel="Count",
    ylabel="Model",
    palette='Accent',
    figsize=(15, 10),
)

# Average Electric Range by Model (Top 10)
avg_range_by_model = (
    dt.groupby('Model')['Electric Range']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)
_show_ranked_barplot(
    avg_range_by_model,
    title="Average Electric Range by Top EV Models",
    xlabel="Average Electric Range",
    ylabel="Model",
    palette='cool',
    figsize=(12, 6),
)