├── README.md
├── obj1.py
├── project.py
├── obj4.py
├── obj3.py
├── obj5.py
└── obj2.py

/README.md:
--------------------------------------------------------------------------------
# Public-Libraries-Dataset-Analysis
This repository contains a detailed analysis of the public libraries dataset using Python and its libraries NumPy, Pandas, Matplotlib & Seaborn.
--------------------------------------------------------------------------------
/obj1.py:
--------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the cleaned dataset produced by project.py
df = pd.read_csv('Cleaned_Public_Libraries.csv')
print(df)

df = df.rename(columns={
    'Population of Service Area': 'Population',
    'Total Library Visits': 'Visits',
    'Total Registered Borrowers': 'Borrowers'
})

# Scatter plot: library visits vs population served
plt.figure(figsize=(8, 5))
plt.scatter(df['Population'], df['Visits'], alpha=0.5, s=20, color='blue')
plt.title('Library Visits vs Population Served')
plt.xlabel('Population')
plt.ylabel('Total Library Visits')
plt.grid(True)
plt.tight_layout()
plt.show()
--------------------------------------------------------------------------------
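The scatter plot in obj1.py shows the raw relationship between population served and total visits but does not quantify it. A minimal sketch (not part of the repository) that adds a Pearson correlation and a least-squares trend line, assuming the same cleaned CSV and renamed columns as above:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('Cleaned_Public_Libraries.csv').rename(columns={
    'Population of Service Area': 'Population',
    'Total Library Visits': 'Visits'
})
# Drop any rows missing either value so the fit is well defined
df = df.dropna(subset=['Population', 'Visits'])

# Pearson correlation between population served and total visits
print("Correlation:", round(df['Population'].corr(df['Visits']), 3))

# Least-squares trend line overlaid on the scatter plot
slope, intercept = np.polyfit(df['Population'], df['Visits'], deg=1)
plt.scatter(df['Population'], df['Visits'], alpha=0.5, s=20, color='blue')
plt.plot(df['Population'], slope * df['Population'] + intercept, color='red', label='Trend line')
plt.xlabel('Population')
plt.ylabel('Total Library Visits')
plt.legend()
plt.show()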
/project.py:
--------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

df = pd.read_csv('Public_libraries.csv')
# print(df)
# df.info()
# df.describe()
# print(df.isnull().sum())
# df.head()

# Step 1: Basic overview of the raw data
basic_info = {
    "shape": df.shape,
    "null_values": df.isnull().sum(),
    "data_types": df.dtypes,
    "duplicates": df.duplicated().sum()
}

# Step 2: Data Cleaning & Preprocessing
# Drop columns with over 25% missing data (i.e. keep columns with at least 75% non-null values)
threshold = len(df) * 0.75
df_cleaned = df.dropna(thresh=threshold, axis=1)

# Drop rows where essential fields are missing
essential_columns = ['Population of Service Area', 'Total Library Visits', 'Fiscal Year']
df_cleaned = df_cleaned.dropna(subset=essential_columns)

# Fill remaining numeric NaNs with the column median
numeric_cols = df_cleaned.select_dtypes(include=['float64', 'int64']).columns
df_cleaned[numeric_cols] = df_cleaned[numeric_cols].fillna(df_cleaned[numeric_cols].median())

# Standardize column names (remove leading/trailing spaces)
df_cleaned.columns = df_cleaned.columns.str.strip()

df_cleaned.to_csv("Cleaned_Public_Libraries.csv", index=False)
print(df_cleaned.head())
--------------------------------------------------------------------------------
/obj4.py:
--------------------------------------------------------------------------------
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv("Cleaned_Public_Libraries.csv")

# Remove outliers within each county using the 1.5 * IQR rule
def remove_outliers_iqr(df, column, group_by):
    cleaned_df = pd.DataFrame()
    for group, subset in df.groupby(group_by):
        Q1 = subset[column].quantile(0.25)
        Q3 = subset[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        filtered = subset[(subset[column] >= lower_bound) & (subset[column] <= upper_bound)]
        cleaned_df = pd.concat([cleaned_df, filtered], axis=0)
    return cleaned_df

df_no_outliers = remove_outliers_iqr(df, "Library Visits Per Capita Served", "County")

# Set seaborn style
sns.set(style="whitegrid")

# Box plot without outliers
plt.figure(figsize=(12, 6))
sns.boxplot(
    x="County",
    y="Library Visits Per Capita Served",
    data=df_no_outliers,
    palette="Set2"
)
plt.title("Library Visits Per Capita by County (Outliers Removed)")
plt.xlabel("County")
plt.ylabel("Library Visits Per Capita")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
--------------------------------------------------------------------------------
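The IQR filter in obj4.py drops rows silently, so it can be worth checking how many observations each county loses before interpreting the box plot. A small follow-up sketch (not in the repository), intended to run after df_no_outliers has been built in obj4.py:

# Compare row counts per county before and after IQR filtering
before = df.groupby("County").size()
after = df_no_outliers.groupby("County").size().reindex(before.index, fill_value=0)
summary = pd.DataFrame({"rows_before": before, "rows_after": after, "rows_removed": before - after})
print(summary.sort_values("rows_removed", ascending=False))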
Visits Per Capita") 32 | axes[1].set_xlabel("Library Visits Per Capita") 33 | plt.tight_layout() 34 | plt.show() 35 | -------------------------------------------------------------------------------- /obj5.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | # Load the dataset 4 | df = pd.read_csv("Cleaned_Public_Libraries.csv") 5 | # ===== Bar Chart: Percent with Library Cards vs Reference Questions ===== 6 | plt.figure(figsize=(12, 6)) 7 | plt.bar(df["Library"], df["Percent of Residents with Library Cards"], label="Library Card Holders (%)", color='skyblue') 8 | plt.plot(df["Library"], df["Reference Questions"], label="Reference Questions", color='coral', marker='o') 9 | plt.xticks(rotation=90) 10 | plt.title("Library Card Holders vs Reference Questions") 11 | plt.xlabel("Library") 12 | plt.ylabel("Engagement / Questions") 13 | plt.legend() 14 | plt.tight_layout() 15 | plt.show() 16 | # ===== Stacked Bar: Programs, Reference Questions, Program Views ===== 17 | stacked_df = df[["Library", "Total Programs (Synchronous + Prerecorded)", "Reference Questions", "Total Program Attendance & Views"]] 18 | stacked_df.set_index("Library", inplace=True) 19 | stacked_df = stacked_df.head(10) # Limit for visualization 20 | stacked_df.plot(kind="bar", stacked=True, figsize=(14, 6), colormap="Paired") 21 | plt.title("Breakdown of User Engagement by Library (Top 10)") 22 | plt.xlabel("Library") 23 | plt.ylabel("Total Interactions") 24 | plt.xticks(rotation=45) 25 | plt.tight_layout() 26 | plt.show() 27 | # ===== Pie Chart: Total Proportions of Engagement Types ===== 28 | total_programs = df["Total Programs (Synchronous + Prerecorded)"].sum() 29 | total_questions = df["Reference Questions"].sum() 30 | total_views = df["Total Program Attendance & Views"].sum() 31 | labels = ["Programs", "Reference Questions", "Program Attendance & Views"] 32 | sizes = [total_programs, total_questions, total_views] 33 | colors = ['lightgreen', 'gold', 'lightskyblue'] 34 | plt.figure(figsize=(7, 7)) 35 | plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140) 36 | plt.title("Overall Breakdown of Engagement Types") 37 | plt.tight_layout() 38 | plt.show() 39 | -------------------------------------------------------------------------------- /obj2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | # Load your dataset 5 | df = pd.read_csv("Cleaned_Public_Libraries.csv") 6 | # Set visual style 7 | sns.set(style="whitegrid") 8 | # --- EDA for Trends Over Time --- 9 | print("Unique Fiscal Years:", df["Fiscal Year"].nunique()) 10 | print("Fiscal Year Range:", df["Fiscal Year"].min(), "to", df["Fiscal Year"].max()) 11 | # Aggregate key metrics by Fiscal Year 12 | agg_df = df.groupby("Fiscal Year")[[ 13 | "Total Circulation", 14 | "Total Program Attendance & Views" 15 | ]].sum().reset_index() 16 | # Summary Statistics 17 | print("\nSummary Statistics:") 18 | print(agg_df.describe()) 19 | # Line Plots 20 | plt.figure(figsize=(14, 6)) 21 | plt.plot(agg_df["Fiscal Year"], agg_df["Total Circulation"], marker='o', label="Total Circulation") 22 | plt.plot(agg_df["Fiscal Year"], agg_df["Total Program Attendance & Views"], marker='s', label="Program Attendance & Views") 23 | plt.title("Library Services Trends Over Time", fontsize=16) 24 | plt.xlabel("Fiscal Year") 25 | plt.ylabel("Total Metrics") 26 | 
/obj2.py:
--------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load your dataset
df = pd.read_csv("Cleaned_Public_Libraries.csv")

# Set visual style
sns.set(style="whitegrid")

# --- EDA for Trends Over Time ---
print("Unique Fiscal Years:", df["Fiscal Year"].nunique())
print("Fiscal Year Range:", df["Fiscal Year"].min(), "to", df["Fiscal Year"].max())

# Aggregate key metrics by Fiscal Year
agg_df = df.groupby("Fiscal Year")[[
    "Total Circulation",
    "Total Program Attendance & Views"
]].sum().reset_index()

# Summary Statistics
print("\nSummary Statistics:")
print(agg_df.describe())

# Line Plots
plt.figure(figsize=(14, 6))
plt.plot(agg_df["Fiscal Year"], agg_df["Total Circulation"], marker='o', label="Total Circulation")
plt.plot(agg_df["Fiscal Year"], agg_df["Total Program Attendance & Views"], marker='s', label="Program Attendance & Views")
plt.title("Library Services Trends Over Time", fontsize=16)
plt.xlabel("Fiscal Year")
plt.ylabel("Total Metrics")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Percent Change Analysis
pct_change = agg_df.set_index("Fiscal Year").pct_change() * 100
print("\nYear-over-Year Percentage Change:")
print(pct_change)

# Per capita metrics
df["Circulation Per Capita"] = df["Total Circulation"] / df["Population of Service Area"]
df["Programs Per Capita"] = df["Total Program Attendance & Views"] / df["Population of Service Area"]
per_capita_df = df.groupby("Fiscal Year")[["Circulation Per Capita", "Programs Per Capita"]].mean().reset_index()

# Line plot for per capita trends
plt.figure(figsize=(14, 6))
plt.plot(per_capita_df["Fiscal Year"], per_capita_df["Circulation Per Capita"], marker='o', label="Circulation Per Capita")
plt.plot(per_capita_df["Fiscal Year"], per_capita_df["Programs Per Capita"], marker='s', label="Programs Per Capita")
plt.title("Per Capita Library Service Trends", fontsize=16)
plt.xlabel("Fiscal Year")
plt.ylabel("Average Per Capita Value")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


# Total Circulation increased during early years but declined in recent years, suggesting reduced reliance on physical materials.
# Total Program Attendance & Views saw a sharp rise post-2020, reflecting the shift to virtual programming during and after the pandemic.
# The contrast in trends indicates a transition in library services from traditional circulation to engagement through programs and digital content.
# Per capita analysis shows declining circulation rates but steady or rising program participation, pointing to changing user preferences.
# Libraries appear to be adapting effectively to modern demands by investing more in community-focused and digital services.
--------------------------------------------------------------------------------