├── python_project_eda.py
└── report_format.doc


/python_project_eda.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | import seaborn as sns
  5 | 
  6 | # Load the timesData.csv file
  7 | times_df = pd.read_csv("timesData.csv")
  8 | 
  9 | # Display the first few rows
 10 | print(times_df.head(30))
 11 | # Make a copy of the dataframe to clean
 12 | df_cleaned = times_df.copy()
 13 | 
 14 | # Replace commas and convert 'num_students' to numeric
 15 | df_cleaned['num_students'] = df_cleaned['num_students'].str.replace(',', '').astype(float)
 16 | 
 17 | # Convert 'international_students' percentage strings to floats
 18 | df_cleaned['international_students'] = df_cleaned['international_students'].str.replace('%', '')
 19 | df_cleaned['international_students'] = pd.to_numeric(df_cleaned['international_students'], errors='coerce')
 20 | 
 21 | # Convert 'female_male_ratio' to numerical female percentage
 22 | def extract_female_ratio(ratio_str):
 23 |     try:
 24 |         female, male = ratio_str.split(':')
 25 |         female = float(female.strip())
 26 |         male = float(male.strip())
 27 |         return 100 * female / (female + male)
 28 |     except:
 29 |         return None
 30 | 
 31 | df_cleaned['female_percentage'] = df_cleaned['female_male_ratio'].apply(extract_female_ratio)
 32 | 
 33 | # Convert relevant score columns to numeric (handle '-' as NaN)
 34 | score_columns = ['teaching', 'international', 'research', 'citations', 'income', 'total_score']
 35 | df_cleaned[score_columns] = df_cleaned[score_columns].replace('-', pd.NA).apply(pd.to_numeric)
 36 | 
 37 | # Summary statistics
 38 | summary = df_cleaned.describe(include='all')
 39 | 
 40 | # Plot: Top 10 countries by number of universities
 41 | plt.figure(figsize=(10, 6))
 42 | top_countries = df_cleaned['country'].value_counts().head(10)
 43 | sns.barplot(x=top_countries.values, y=top_countries.index,hue=top_countries.index, palette="cubehelix")
 44 | plt.title("Top 10 Countries by Number of Universities in the Dataset")
 45 | plt.xlabel("Number of Universities")
 46 | plt.ylabel("Country")
 47 | print(plt.show())
 48 | 
 49 | # Bar Plot: Score distributions over years
 50 | plt.figure(figsize=(10, 6))
 51 | sns.barplot(data=df_cleaned, x="year", y="total_score")
 52 | plt.title("Distribution of Total Scores Over the Years")
 53 | plt.ylabel("Total Score")
 54 | plt.xlabel("Year")
 55 | print(plt.show())
 56 | 
 57 | # Scatter Plot: Research vs. Teaching
 58 | plt.figure(figsize=(8, 6))
 59 | sns.scatterplot(data=df_cleaned, x='research', y='teaching', hue='country', alpha=0.6, legend=False)
 60 | plt.title("Research vs. Teaching Scores")
 61 | plt.xlabel("Research Score")
 62 | plt.ylabel("Teaching Score")
 63 | print(plt.show())
 64 | 
 65 | # Yearly trend: Average total score
 66 | plt.figure(figsize=(10, 5))
 67 | yearly_avg = df_cleaned.groupby("year")["total_score"].mean()
 68 | sns.lineplot(x=yearly_avg.index, y=yearly_avg.values, marker='o', color='green')
 69 | plt.title("Average Total Score Over the Years")
 70 | plt.ylabel("Average Total Score")
 71 | plt.xlabel("Year")
 72 | print(plt.show())
 73 | 
 74 | # Correlation between research and citations
 75 | plt.figure(figsize=(6, 6))
 76 | sns.regplot(data=df_cleaned, x="research", y="citations", scatter_kws={"alpha":0.3})
 77 | plt.title("Correlation Between Research and Citations")
 78 | plt.xlabel("Research Score")
 79 | plt.ylabel("Citations Score")# Top 5 universities in 3 selected countries
 80 | latest_year = df_cleaned['year'].max()
 81 | countries = ['United States of America', 'United Kingdom', 'Canada']
 82 | for country in countries:
 83 |     top_uni = df_cleaned[(df_cleaned['country'] == country) & (df_cleaned['year'] == latest_year)].nlargest(5, 'total_score')
 84 |     plt.figure(figsize=(9, 5))
 85 |     sns.barplot(data=top_uni, x='total_score', y='university_name', palette='magma')
 86 |     plt.title(f"Top 5 Universities in {country} ({latest_year})")
 87 |     plt.xlabel("Total Score")
 88 |     plt.ylabel("University")
 89 |     print(plt.show())
 90 | 
 91 | # Top 5 universities in 3 selected countries
 92 | countries = ['United States of America', 'United Kingdom', 'Canada']
 93 | for country in countries:
 94 |     top_uni = df_cleaned[(df_cleaned['country'] == country) & (df_cleaned['year'] == latest_year)].nlargest(5, 'total_score')
 95 |     plt.figure(figsize=(9, 5))
 96 |     sns.barplot(data=top_uni, x='total_score', y='university_name', palette='magma')
 97 |     plt.title(f"Top 5 Universities in {country} ({latest_year})")
 98 |     plt.xlabel("Total Score")
 99 |     plt.ylabel("University")
100 |     print(plt.show())
101 | 
# Count of universities per country in the most recent year
# (top 10 countries by count)
plt.figure(figsize=(12, 6))
# Filter to the latest year once and reuse it for both the data and the ordering
latest_year_df = df_cleaned[df_cleaned['year'] == latest_year]
top_country_order = latest_year_df['country'].value_counts().head(10).index
sns.countplot(data=latest_year_df, y='country', order=top_country_order, palette='Set2')
# f-string so the year is interpolated instead of printing "{latest_year}" literally
plt.title(f"University Count by Country ({latest_year})")
plt.xlabel("Count")
plt.ylabel("Country")
# plt.show() returns None; call it directly instead of printing "None"
plt.show()
109 | 


--------------------------------------------------------------------------------
/report_format.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/INDERJEET-KAUR-25/EDA-Python-project/07a1e6ee599b5f60299f8b443ab6b82512796662/report_format.doc


--------------------------------------------------------------------------------