"""Exploratory data analysis of the university-ranking data in timesData.csv.

The script loads the CSV, converts the text-encoded numeric columns into real
numbers, prints a preview and summary statistics, and then displays a series
of matplotlib/seaborn charts.

Repository layout this file was taken from:
    ├── python_project_eda.py
    └── report_format.doc
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the timesData.csv file
times_df = pd.read_csv("timesData.csv")

# Display the first few rows
print(times_df.head(30))

# Make a copy of the dataframe to clean, so the raw data stays untouched
df_cleaned = times_df.copy()

# Replace thousands separators and convert 'num_students' to numeric
df_cleaned['num_students'] = (
    df_cleaned['num_students'].str.replace(',', '', regex=False).astype(float)
)

# Convert 'international_students' percentage strings to floats;
# anything that is not a number after stripping '%' becomes NaN
df_cleaned['international_students'] = (
    df_cleaned['international_students'].str.replace('%', '', regex=False)
)
df_cleaned['international_students'] = pd.to_numeric(
    df_cleaned['international_students'], errors='coerce'
)


def extract_female_ratio(ratio_str):
    """Return the female share (0-100) encoded in a "female : male" string.

    Args:
        ratio_str: Text with two numbers separated by a colon,
            e.g. ``"33 : 67"``.

    Returns:
        ``100 * female / (female + male)`` as a float, or ``None`` when the
        value is not a string (e.g. a missing value), does not consist of
        exactly two numeric parts separated by ``:``, or both parts are zero.
    """
    try:
        female, male = ratio_str.split(':')
        female = float(female.strip())
        male = float(male.strip())
        return 100 * female / (female + male)
    except (AttributeError, TypeError, ValueError, ZeroDivisionError):
        # AttributeError/TypeError: input is not a str (NaN, None, ...)
        # ValueError: wrong number of parts or a non-numeric part
        # ZeroDivisionError: "0 : 0"
        return None


# Convert 'female_male_ratio' to a numerical female percentage
df_cleaned['female_percentage'] = df_cleaned['female_male_ratio'].apply(extract_female_ratio)

# Convert relevant score columns to numeric (handle '-' as NaN)
score_columns = ['teaching', 'international', 'research', 'citations', 'income', 'total_score']
df_cleaned[score_columns] = df_cleaned[score_columns].replace('-', pd.NA).apply(pd.to_numeric)

# Summary statistics
summary = df_cleaned.describe(include='all')
print(summary)

# Plot: Top 10 countries by number of universities
plt.figure(figsize=(10, 6))
top_countries = df_cleaned['country'].value_counts().head(10)
sns.barplot(
    x=top_countries.values,
    y=top_countries.index,
    hue=top_countries.index,
    palette="cubehelix",
    legend=False,
)
plt.title("Top 10 Countries by Number of Universities in the Dataset")
plt.xlabel("Number of Universities")
plt.ylabel("Country")
plt.show()

# Bar Plot: Score distributions over years
plt.figure(figsize=(10, 6))
sns.barplot(data=df_cleaned, x="year", y="total_score")
plt.title("Distribution of Total Scores Over the Years")
plt.ylabel("Total Score")
plt.xlabel("Year")
plt.show()

# Scatter Plot: Research vs. Teaching
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_cleaned, x='research', y='teaching', hue='country', alpha=0.6, legend=False)
plt.title("Research vs. Teaching Scores")
plt.xlabel("Research Score")
plt.ylabel("Teaching Score")
plt.show()

# Yearly trend: Average total score
plt.figure(figsize=(10, 5))
yearly_avg = df_cleaned.groupby("year")["total_score"].mean()
sns.lineplot(x=yearly_avg.index, y=yearly_avg.values, marker='o', color='green')
plt.title("Average Total Score Over the Years")
plt.ylabel("Average Total Score")
plt.xlabel("Year")
plt.show()

# Correlation between research and citations
plt.figure(figsize=(6, 6))
sns.regplot(data=df_cleaned, x="research", y="citations", scatter_kws={"alpha": 0.3})
plt.title("Correlation Between Research and Citations")
plt.xlabel("Research Score")
plt.ylabel("Citations Score")
plt.show()

# Rows of the most recent year; reused by the remaining plots
latest_year = df_cleaned['year'].max()
latest_df = df_cleaned[df_cleaned['year'] == latest_year]

# Top 5 universities in 3 selected countries
countries = ['United States of America', 'United Kingdom', 'Canada']
for country in countries:
    top_uni = latest_df[latest_df['country'] == country].nlargest(5, 'total_score')
    plt.figure(figsize=(9, 5))
    # hue mirrors the y variable so the palette is applied per bar
    sns.barplot(
        data=top_uni,
        x='total_score',
        y='university_name',
        hue='university_name',
        palette='magma',
        legend=False,
    )
    plt.title(f"Top 5 Universities in {country} ({latest_year})")
    plt.xlabel("Total Score")
    plt.ylabel("University")
    plt.show()

# Count of universities by region (basic example using 'country')
top_latest_countries = latest_df['country'].value_counts().head(10).index.tolist()
plt.figure(figsize=(12, 6))
# hue_order matches order so colours follow the displayed bars
sns.countplot(
    data=latest_df,
    y='country',
    hue='country',
    order=top_latest_countries,
    hue_order=top_latest_countries,
    palette='Set2',
    legend=False,
)
plt.title(f"University Count by Country ({latest_year})")
plt.xlabel("Count")
plt.ylabel("Country")
plt.show()

# The repository also contains report_format.doc:
# https://raw.githubusercontent.com/INDERJEET-KAUR-25/EDA-Python-project/07a1e6ee599b5f60299f8b443ab6b82512796662/report_format.doc