├── README.md
├── obj1.py
├── project.py
├── obj4.py
├── obj3.py
├── obj5.py
└── obj2.py

/README.md:
--------------------------------------------------------------------------------
# Public-Libraries-Dataset-Analysis
This repository contains a detailed analysis of the public libraries dataset using Python and its libraries NumPy, Pandas, Matplotlib & Seaborn.
--------------------------------------------------------------------------------
/obj1.py:
--------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the cleaned dataset produced by project.py
df = pd.read_csv('Cleaned_Public_Libraries.csv')
print(df)

df = df.rename(columns={
    'Population of Service Area': 'Population',
    'Total Library Visits': 'Visits',
    'Total Registered Borrowers': 'Borrowers'
})

# Scatter plot: library visits vs population served
plt.figure(figsize=(8, 5))
plt.scatter(df['Population'], df['Visits'], alpha=0.5, s=20, color='blue')
plt.title('Library Visits vs Population Served')
plt.xlabel('Population')
plt.ylabel('Total Library Visits')
plt.grid(True)
plt.tight_layout()
plt.show()
--------------------------------------------------------------------------------
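The scatter plot in obj1.py shows the raw relationship between population served and total visits but does not quantify it. A minimal sketch (not part of the repository) that adds a Pearson correlation and a least-squares trend line, assuming the same cleaned CSV and renamed columns as above:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('Cleaned_Public_Libraries.csv').rename(columns={
    'Population of Service Area': 'Population',
    'Total Library Visits': 'Visits'
})
# Drop any rows missing either value so the fit is well defined
df = df.dropna(subset=['Population', 'Visits'])

# Pearson correlation between population served and total visits
print("Correlation:", round(df['Population'].corr(df['Visits']), 3))

# Least-squares trend line overlaid on the scatter plot
slope, intercept = np.polyfit(df['Population'], df['Visits'], deg=1)
plt.scatter(df['Population'], df['Visits'], alpha=0.5, s=20, color='blue')
plt.plot(df['Population'], slope * df['Population'] + intercept, color='red', label='Trend line')
plt.xlabel('Population')
plt.ylabel('Total Library Visits')
plt.legend()
plt.show()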
/project.py:
--------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

df = pd.read_csv('Public_libraries.csv')
# print(df)
# df.info()
# df.describe()
# print(df.isnull().sum())
# df.head()

# Step 1: Basic overview of the raw data
basic_info = {
    "shape": df.shape,
    "null_values": df.isnull().sum(),
    "data_types": df.dtypes,
    "duplicates": df.duplicated().sum()
}

# Step 2: Data Cleaning & Preprocessing
# Drop columns with over 25% missing data (i.e. keep columns with at least 75% non-null values)
threshold = len(df) * 0.75
df_cleaned = df.dropna(thresh=threshold, axis=1)

# Drop rows where essential fields are missing
essential_columns = ['Population of Service Area', 'Total Library Visits', 'Fiscal Year']
df_cleaned = df_cleaned.dropna(subset=essential_columns)

# Fill remaining numeric NaNs with the column median
numeric_cols = df_cleaned.select_dtypes(include=['float64', 'int64']).columns
df_cleaned[numeric_cols] = df_cleaned[numeric_cols].fillna(df_cleaned[numeric_cols].median())

# Standardize column names (remove leading/trailing spaces)
df_cleaned.columns = df_cleaned.columns.str.strip()

df_cleaned.to_csv("Cleaned_Public_Libraries.csv", index=False)
print(df_cleaned.head())
--------------------------------------------------------------------------------
/obj4.py:
--------------------------------------------------------------------------------
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv("Cleaned_Public_Libraries.csv")

# Remove outliers within each county using the 1.5 * IQR rule
def remove_outliers_iqr(df, column, group_by):
    cleaned_df = pd.DataFrame()
    for group, subset in df.groupby(group_by):
        Q1 = subset[column].quantile(0.25)
        Q3 = subset[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        filtered = subset[(subset[column] >= lower_bound) & (subset[column] <= upper_bound)]
        cleaned_df = pd.concat([cleaned_df, filtered], axis=0)
    return cleaned_df

df_no_outliers = remove_outliers_iqr(df, "Library Visits Per Capita Served", "County")

# Set seaborn style
sns.set(style="whitegrid")

# Box plot without outliers
plt.figure(figsize=(12, 6))
sns.boxplot(
    x="County",
    y="Library Visits Per Capita Served",
    data=df_no_outliers,
    palette="Set2"
)
plt.title("Library Visits Per Capita by County (Outliers Removed)")
plt.xlabel("County")
plt.ylabel("Library Visits Per Capita")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
--------------------------------------------------------------------------------
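The IQR filter in obj4.py drops rows silently, so it can be worth checking how many observations each county loses before interpreting the box plot. A small follow-up sketch (not in the repository), intended to run after df_no_outliers has been built in obj4.py:

# Compare row counts per county before and after IQR filtering
before = df.groupby("County").size()
after = df_no_outliers.groupby("County").size().reindex(before.index, fill_value=0)
summary = pd.DataFrame({"rows_before": before, "rows_after": after, "rows_removed": before - after})
print(summary.sort_values("rows_removed", ascending=False))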
Visits Per Capita") 32 | axes[1].set_xlabel("Library Visits Per Capita") 33 | plt.tight_layout() 34 | plt.show() 35 | -------------------------------------------------------------------------------- /obj5.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | # Load the dataset 4 | df = pd.read_csv("Cleaned_Public_Libraries.csv") 5 | # ===== Bar Chart: Percent with Library Cards vs Reference Questions ===== 6 | plt.figure(figsize=(12, 6)) 7 | plt.bar(df["Library"], df["Percent of Residents with Library Cards"], label="Library Card Holders (%)", color='skyblue') 8 | plt.plot(df["Library"], df["Reference Questions"], label="Reference Questions", color='coral', marker='o') 9 | plt.xticks(rotation=90) 10 | plt.title("Library Card Holders vs Reference Questions") 11 | plt.xlabel("Library") 12 | plt.ylabel("Engagement / Questions") 13 | plt.legend() 14 | plt.tight_layout() 15 | plt.show() 16 | # ===== Stacked Bar: Programs, Reference Questions, Program Views ===== 17 | stacked_df = df[["Library", "Total Programs (Synchronous + Prerecorded)", "Reference Questions", "Total Program Attendance & Views"]] 18 | stacked_df.set_index("Library", inplace=True) 19 | stacked_df = stacked_df.head(10) # Limit for visualization 20 | stacked_df.plot(kind="bar", stacked=True, figsize=(14, 6), colormap="Paired") 21 | plt.title("Breakdown of User Engagement by Library (Top 10)") 22 | plt.xlabel("Library") 23 | plt.ylabel("Total Interactions") 24 | plt.xticks(rotation=45) 25 | plt.tight_layout() 26 | plt.show() 27 | # ===== Pie Chart: Total Proportions of Engagement Types ===== 28 | total_programs = df["Total Programs (Synchronous + Prerecorded)"].sum() 29 | total_questions = df["Reference Questions"].sum() 30 | total_views = df["Total Program Attendance & Views"].sum() 31 | labels = ["Programs", "Reference Questions", "Program Attendance & Views"] 32 | sizes = [total_programs, total_questions, total_views] 33 | colors = ['lightgreen', 'gold', 'lightskyblue'] 34 | plt.figure(figsize=(7, 7)) 35 | plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140) 36 | plt.title("Overall Breakdown of Engagement Types") 37 | plt.tight_layout() 38 | plt.show() 39 | -------------------------------------------------------------------------------- /obj2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | # Load your dataset 5 | df = pd.read_csv("Cleaned_Public_Libraries.csv") 6 | # Set visual style 7 | sns.set(style="whitegrid") 8 | # --- EDA for Trends Over Time --- 9 | print("Unique Fiscal Years:", df["Fiscal Year"].nunique()) 10 | print("Fiscal Year Range:", df["Fiscal Year"].min(), "to", df["Fiscal Year"].max()) 11 | # Aggregate key metrics by Fiscal Year 12 | agg_df = df.groupby("Fiscal Year")[[ 13 | "Total Circulation", 14 | "Total Program Attendance & Views" 15 | ]].sum().reset_index() 16 | # Summary Statistics 17 | print("\nSummary Statistics:") 18 | print(agg_df.describe()) 19 | # Line Plots 20 | plt.figure(figsize=(14, 6)) 21 | plt.plot(agg_df["Fiscal Year"], agg_df["Total Circulation"], marker='o', label="Total Circulation") 22 | plt.plot(agg_df["Fiscal Year"], agg_df["Total Program Attendance & Views"], marker='s', label="Program Attendance & Views") 23 | plt.title("Library Services Trends Over Time", fontsize=16) 24 | plt.xlabel("Fiscal Year") 25 | plt.ylabel("Total Metrics") 26 | 
/obj2.py:
--------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load your dataset
df = pd.read_csv("Cleaned_Public_Libraries.csv")

# Set visual style
sns.set(style="whitegrid")

# --- EDA for Trends Over Time ---
print("Unique Fiscal Years:", df["Fiscal Year"].nunique())
print("Fiscal Year Range:", df["Fiscal Year"].min(), "to", df["Fiscal Year"].max())

# Aggregate key metrics by Fiscal Year
agg_df = df.groupby("Fiscal Year")[[
    "Total Circulation",
    "Total Program Attendance & Views"
]].sum().reset_index()

# Summary Statistics
print("\nSummary Statistics:")
print(agg_df.describe())

# Line Plots
plt.figure(figsize=(14, 6))
plt.plot(agg_df["Fiscal Year"], agg_df["Total Circulation"], marker='o', label="Total Circulation")
plt.plot(agg_df["Fiscal Year"], agg_df["Total Program Attendance & Views"], marker='s', label="Program Attendance & Views")
plt.title("Library Services Trends Over Time", fontsize=16)
plt.xlabel("Fiscal Year")
plt.ylabel("Total Metrics")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Percent Change Analysis
pct_change = agg_df.set_index("Fiscal Year").pct_change() * 100
print("\nYear-over-Year Percentage Change:")
print(pct_change)

# Per capita metrics
df["Circulation Per Capita"] = df["Total Circulation"] / df["Population of Service Area"]
df["Programs Per Capita"] = df["Total Program Attendance & Views"] / df["Population of Service Area"]
per_capita_df = df.groupby("Fiscal Year")[["Circulation Per Capita", "Programs Per Capita"]].mean().reset_index()

# Line plot for per capita trends
plt.figure(figsize=(14, 6))
plt.plot(per_capita_df["Fiscal Year"], per_capita_df["Circulation Per Capita"], marker='o', label="Circulation Per Capita")
plt.plot(per_capita_df["Fiscal Year"], per_capita_df["Programs Per Capita"], marker='s', label="Programs Per Capita")
plt.title("Per Capita Library Service Trends", fontsize=16)
plt.xlabel("Fiscal Year")
plt.ylabel("Average Per Capita Value")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


# Total Circulation increased during early years but declined in recent years, suggesting reduced reliance on physical materials.
# Total Program Attendance & Views saw a sharp rise post-2020, reflecting the shift to virtual programming during and after the pandemic.
# The contrast in trends indicates a transition in library services from traditional circulation to engagement through programs and digital content.
# Per capita analysis shows declining circulation rates but steady or rising program participation, pointing to changing user preferences.
# Libraries appear to be adapting effectively to modern demands by investing more in community-focused and digital services.
--------------------------------------------------------------------------------