├── README.md
└── python_project.py


/README.md:
--------------------------------------------------------------------------------
1 | # Data-Analysis-Python
2 | This project uses Python libraries to clean a dataset and then visualize and analyze it.
3 | 


--------------------------------------------------------------------------------
/python_project.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | 
  4 | Created on Fri Apr 11 10:16:11 202
  5 | @author: Aayush Garg
  6 | """
  7 | import matplotlib.pyplot as plt
  8 | import seaborn as sns
  9 | from sklearn.linear_model import LinearRegression
 10 | from sklearn.model_selection import train_test_split
 11 | from sklearn.metrics import mean_squared_error, r2_score
 12 | import numpy as np
 13 | import pandas as pd
 14 | 
 15 | # Load the uploaded CSV file
 16 | #file_path = "/mnt/data/4dbe5667-7b6b-41d7-82af-211562424d9a_a0d71eebebaa5ba00b5d1af1dd96a3dd.csv"
 17 | 
 18 | df = pd.read_csv("C:/Users/shiti/Downloads/4dbe5667-7b6b-41d7-82af-211562424d9a_a0d71eebebaa5ba00b5d1af1dd96a3dd.csv")
 19 | 
 20 | # Display basic information and first few rows of the dataset
 21 | df.info(), df.head()
 22 | 
 23 | # Step 1: Data Cleaning
 24 | df_cleaned = df.copy()
 25 | 
 26 | # Convert date column to datetime
 27 | df_cleaned['CompanyRegistrationdate_date'] = pd.to_datetime(df_cleaned['CompanyRegistrationdate_date'], errors='coerce')
 28 | 
 29 | # Drop rows with any null values for simplicity
 30 | df_cleaned.dropna(inplace=True)
 31 | 
 32 | # Strip whitespaces and standardize case for some categorical columns
 33 | df_cleaned['CompanyStatus'] = df_cleaned['CompanyStatus'].str.strip().str.title()
 34 | df_cleaned['CompanyClass'] = df_cleaned['CompanyClass'].str.strip().str.title()
 35 | 
 36 | # Step 2: Data Visualization
 37 | 
 38 | # 1. Bar plot of top 10 most common Company States
 39 | plt.figure(figsize=(10, 5))
 40 | df_cleaned['CompanyStateCode'].value_counts().head(10).plot(kind='bar', color='skyblue')
 41 | plt.title('Top 10 States with Most Companies')
 42 | plt.ylabel('Number of Companies')
 43 | plt.xlabel('State Code')
 44 | plt.xticks(rotation=45)
 45 | plt.tight_layout()
 46 | plt.show()
 47 | 
 48 | # 2. Histogram of Authorized Capital
 49 | plt.figure(figsize=(10, 5))
 50 | sns.histplot(df_cleaned['AuthorizedCapital'], bins=50, kde=True, color='green')
 51 | plt.title('Distribution of Authorized Capital')
 52 | plt.xlabel('Authorized Capital')
 53 | plt.ylabel('Frequency')
 54 | plt.tight_layout()
 55 | plt.show()
 56 | 
 57 | # 3. Pie chart of Company Class distribution
 58 | class_counts = df_cleaned['CompanyClass'].value_counts()
 59 | plt.figure(figsize=(6, 6))
 60 | plt.pie(class_counts, labels=class_counts.index, autopct='%1.1f%%', startangle=140, colors=sns.color_palette('Set2'))
 61 | plt.title('Company Class Distribution')
 62 | plt.axis('equal')
 63 | plt.show()
 64 | 
 65 | # 4. Box plot of Paid-up Capital by Company Status
 66 | plt.figure(figsize=(12, 6))
 67 | sns.boxplot(x='CompanyStatus', y='PaidupCapital', data=df_cleaned)
 68 | plt.title('Paid-up Capital by Company Status')
 69 | plt.xticks(rotation=45)
 70 | plt.tight_layout()
 71 | plt.show()
 72 | 
 73 | # 5. Correlation heatmap for numeric features
 74 | plt.figure(figsize=(8, 6))
 75 | corr = df_cleaned[['AuthorizedCapital', 'PaidupCapital', 'nic_code']].corr()
 76 | sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
 77 | plt.title('Correlation Heatmap')
 78 | plt.tight_layout()
 79 | plt.show()
 80 | 
 81 | # Step 3: Linear Regression to predict PaidupCapital using AuthorizedCapital
 82 | X = df_cleaned[['AuthorizedCapital']]
 83 | y = df_cleaned['PaidupCapital']
 84 | 
 85 | # Train-test split
 86 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 87 | 
 88 | # Linear Regression Model
 89 | model = LinearRegression()
 90 | model.fit(X_train, y_train)
 91 | 
 92 | # Predictions
 93 | y_pred = model.predict(X_test)
 94 | 
 95 | # Evaluation
 96 | mse = mean_squared_error(y_test, y_pred)
 97 | r2 = r2_score(y_test, y_pred)
 98 | 
 99 | # Output evaluation results and a sample plot
100 | plt.figure(figsize=(8, 5))
101 | sns.scatterplot(x=y_test, y=y_pred, alpha=0.5)
102 | plt.xlabel('Actual Paid-up Capital')
103 | plt.ylabel('Predicted Paid-up Capital')
104 | plt.title('Linear Regression: Actual vs Predicted')
105 | plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
106 | plt.tight_layout()
107 | plt.show()
108 | 
109 | mse, r2
110 | 
111 | 


--------------------------------------------------------------------------------