├── README.md └── python_project.py /README.md: -------------------------------------------------------------------------------- 1 | # Data-Analysis-Python 2 | I have used Python libraries in this project to clean a data set and then visualize and analyze it. 3 | -------------------------------------------------------------------------------- /python_project.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | Created on Fri Apr 11 10:16:11 2025 5 | @author: Aayush Garg 6 | """ 7 | import matplotlib.pyplot as plt 8 | import seaborn as sns 9 | from sklearn.linear_model import LinearRegression 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.metrics import mean_squared_error, r2_score 12 | import numpy as np 13 | import pandas as pd 14 | 15 | # Load the uploaded CSV file 16 | #file_path = "/mnt/data/4dbe5667-7b6b-41d7-82af-211562424d9a_a0d71eebebaa5ba00b5d1af1dd96a3dd.csv" 17 | 18 | df = pd.read_csv("C:/Users/shiti/Downloads/4dbe5667-7b6b-41d7-82af-211562424d9a_a0d71eebebaa5ba00b5d1af1dd96a3dd.csv") 19 | 20 | # Display basic information and first few rows of the dataset 21 | df.info(), df.head() 22 | 23 | # Step 1: Data Cleaning 24 | df_cleaned = df.copy() 25 | 26 | # Convert date column to datetime 27 | df_cleaned['CompanyRegistrationdate_date'] = pd.to_datetime(df_cleaned['CompanyRegistrationdate_date'], errors='coerce') 28 | 29 | # Drop rows with any null values for simplicity 30 | df_cleaned.dropna(inplace=True) 31 | 32 | # Strip whitespaces and standardize case for some categorical columns 33 | df_cleaned['CompanyStatus'] = df_cleaned['CompanyStatus'].str.strip().str.title() 34 | df_cleaned['CompanyClass'] = df_cleaned['CompanyClass'].str.strip().str.title() 35 | 36 | # Step 2: Data Visualization 37 | 38 | # 1. 
# Bar plot of top 10 most common Company States
# Each chart below is built on its own Figure/Axes pair and shown before
# the next one is created, so the windows appear one at a time.
top_states = df_cleaned['CompanyStateCode'].value_counts().head(10)
fig, ax = plt.subplots(figsize=(10, 5))
top_states.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Top 10 States with Most Companies')
ax.set_ylabel('Number of Companies')
ax.set_xlabel('State Code')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

# 2. Histogram of Authorized Capital (50 bins with a KDE overlay)
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    df_cleaned['AuthorizedCapital'],
    bins=50,
    kde=True,
    color='green',
    ax=ax,
)
ax.set_title('Distribution of Authorized Capital')
ax.set_xlabel('Authorized Capital')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

# 3. Pie chart of Company Class distribution
class_counts = df_cleaned['CompanyClass'].value_counts()
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    class_counts,
    labels=class_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=sns.color_palette('Set2'),
)
ax.set_title('Company Class Distribution')
ax.axis('equal')  # keep the pie circular rather than elliptical
plt.show()

# 4. Box plot of Paid-up Capital by Company Status
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='CompanyStatus', y='PaidupCapital', data=df_cleaned, ax=ax)
ax.set_title('Paid-up Capital by Company Status')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

# 5.
# Correlation heatmap for numeric features
plt.figure(figsize=(8, 6))
corr = df_cleaned[['AuthorizedCapital', 'PaidupCapital', 'nic_code']].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

# Step 3: Linear Regression to predict PaidupCapital using AuthorizedCapital
X = df_cleaned[['AuthorizedCapital']]
y = df_cleaned['PaidupCapital']

# Train-test split (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = model.predict(X_test)

# Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics explicitly: a bare `mse, r2` expression is only echoed
# in an interactive session and is silently discarded when run as a script.
# Printed before the plot so the numbers are visible even while a blocking
# plot window is still open.
print(f"Mean Squared Error: {mse:,.2f}")
print(f"R^2 Score: {r2:.4f}")

# Actual vs predicted scatter with a dashed y = x reference line
plt.figure(figsize=(8, 5))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.5)
plt.xlabel('Actual Paid-up Capital')
plt.ylabel('Predicted Paid-up Capital')
plt.title('Linear Regression: Actual vs Predicted')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
plt.tight_layout()
plt.show()