"""Clean the public-library statistics dataset and report its structure.

Project files:
    Public_Libraries.csv  - raw input data, read from the working directory
    python project.py     - this script

Running the script loads the CSV, prints the frame summary, cleans the data
(type coercion, median imputation, removal of incomplete rows, text
normalisation, de-duplication, IQR outlier filtering) and prints the summary
again.  The visualisation / export steps at the end of ``main`` are kept
commented out, exactly as in the original script; uncomment the ones needed.
"""
import pandas as pd
import numpy as np


# Columns that must be numeric; unparseable cells become NaN and are then
# replaced by the column median.
numeric_columns = [
    'Population of Service Area', 'Total Library Visits', 'Library Visits Per Capita Served',
    'Total Registered Borrowers', 'Percent of Residents with Library Cards', 'Total Circulation',
    'Circulation Per Capita Served', 'Total Program Attendance & Views',
    'Total Program Attendance & Views Per Capita Served', 'Total Operating Income',
    'Operating Income Per Capita', 'Library Materials Expenditures',
    'Reference Questions', 'Reference Questions Per Capita Served',
]

# Columns filtered with the IQR rule, applied one after another.
outlier_cols = [
    'Library Visits Per Capita Served', 'Circulation Per Capita Served',
    'Total Program Attendance & Views Per Capita Served', 'Operating Income Per Capita',
    'Percent of Residents with Library Cards', 'Library Materials Expenditures',
    'Reference Questions Per Capita Served',
]


def remove_outliers(df, column, factor=1.5):
    """Return the rows of *df* whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of *column*.

    Args:
        df: Frame to filter; it is not modified.
        column: Name of the numeric column the fences are computed on.
        factor: IQR multiplier; the default 1.5 is the value the script has
            always used.

    Returns:
        A frame containing only the rows within the fences.  Rows whose value
        is NaN are dropped because NaN never compares inside a range.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    # Series.between is inclusive on both ends by default, i.e. the same
    # test as (value >= lower) & (value <= upper).
    return df[df[column].between(lower, upper)]


def clean_data(df):
    """Return a cleaned copy of the raw library frame.

    Steps, in order:
      1. Treat empty strings as missing values.
      2. Coerce ``numeric_columns`` to numbers and fill gaps with the median.
      3. Drop rows missing 'Library', 'County', 'Fiscal Year' or
         'Principal Public?'.
      4. Drop the 'Registrations Per Capita Served' column if present.
      5. Strip and title-case the text key columns; cast 'Fiscal Year' to int.
      6. Drop duplicate rows.  This runs after step 5 so that rows differing
         only in case or surrounding whitespace are recognised as duplicates.
      7. Apply ``remove_outliers`` to every column in ``outlier_cols``.

    Args:
        df: Raw frame as read from the CSV.  It is not modified, because
            ``replace`` in step 1 already returns a new frame.

    Returns:
        The cleaned frame.
    """
    df = df.replace('', np.nan)

    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        df[col] = df[col].fillna(df[col].median())

    # Drop rows with essential missing data
    df = df.dropna(subset=['Library', 'County', 'Fiscal Year', 'Principal Public?'])

    # This column carries no data in the source file; ignore it if absent.
    df = df.drop(columns=['Registrations Per Capita Served'], errors='ignore')

    # Standardize text formatting in key columns
    for col in ('Library', 'County', 'Principal Public?'):
        df[col] = df[col].str.strip().str.title()

    df['Fiscal Year'] = df['Fiscal Year'].astype(int)

    # Drop duplicate rows (after normalisation, see docstring)
    df = df.drop_duplicates()

    # --- Step 2: IQR Method ---
    for col in outlier_cols:
        df = remove_outliers(df, col)

    return df


def main():
    """Load the CSV, print its summary, clean it and print the summary again."""
    # The plotting stack is only needed by the visualisation steps below, so
    # it is imported here rather than at module level; this keeps the
    # cleaning helpers importable without matplotlib/seaborn.
    import matplotlib.pyplot as plt
    import seaborn as sns

    df = pd.read_csv('Public_Libraries.csv')

    print("Initial Data Info:")
    # DataFrame.info() prints by itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    df.info()

    df = clean_data(df)

    print("\nAfter Cleaning Data Info:")
    df.info()

    ###------------------------------ OBJ 1 ------------------------------
    ### Objective 1: Library Visits Trends Over Time
    ##
    ### 1.1: Area Plot
    ##plt.figure(figsize=(12, 6))
    ##visits_trend = df.groupby('Fiscal Year')['Library Visits Per Capita Served'].mean()
    ##plt.fill_between(visits_trend.index, visits_trend.values, color='skyblue', alpha=0.6)
    ##plt.plot(visits_trend.index, visits_trend.values, color='navy', linewidth=2)
    ##plt.title('Average Library Visits Per Capita (1996-2023)', fontsize=14, pad=15)
    ##plt.xlabel('Fiscal Year', fontsize=12)
    ##plt.ylabel('Visits Per Capita', fontsize=12)
    ##plt.grid(True, linestyle='--', alpha=0.7)
    ##plt.show()
    ##
    ### Subpart 2: Library-Level Patterns
    ### 1.2: Heatmap
    ##plt.figure(figsize=(12, 8))
    ### Focus on top 20 libraries to avoid overcrowding
    ##top_libraries = df.groupby('Library')['Library Visits Per Capita Served'].mean().nlargest(20).index
    ##pivot_library = df[df['Library'].isin(top_libraries)].pivot_table(
    ##    values='Library Visits Per Capita Served', index='Library', columns='Fiscal Year', aggfunc='mean')
    ##sns.heatmap(pivot_library, cmap='YlGnBu', annot=False, cbar_kws={'label': 'Visits Per Capita'})
    ##plt.title('Library Visits Per Capita by Library and Year', fontsize=14, pad=15)
    ##plt.xlabel('Fiscal Year', fontsize=12)
    ##plt.ylabel('Library', fontsize=12)
    ##plt.show()
    ##
    ###------------------------------ OBJ 2 ------------------------------
    ### Objective 2: Circulation vs Population
    ### 2.1: Scatter Plot:
    ##plt.figure(figsize=(10, 6))
    ##sns.scatterplot(
    ##    x='Population of Service Area', y='Circulation Per Capita Served',
    ##    size='Total Circulation', hue='County', data=df, alpha=0.6
    ##)
    ##plt.title('Circulation Per Capita vs Population Served')
    ##plt.tight_layout()
    ##plt.savefig('circulation_scatter.png')
    ##plt.show()
    ##
    ##
    ##
    ### Subpart 2: Circulation by Library Type
    ### 2.2: Bar Plot with Error Bars
    ##plt.figure(figsize=(10, 6))
    ##means = df.groupby('Principal Public?')['Circulation Per Capita Served'].mean()
    ##stds = df.groupby('Principal Public?')['Circulation Per Capita Served'].std()
    ##plt.bar(means.index, means.values, yerr=stds.values, color=['skyblue', 'salmon'], capsize=5)
    ##plt.title('Average Circulation Per Capita by Library Type', fontsize=14, pad=15)
    ##plt.xlabel('Principal Public?', fontsize=12)
    ##plt.ylabel('Circulation Per Capita', fontsize=12)
    ##plt.show()
    ##
    ###------------------------------ OBJ 3 ------------------------------
    ### 3. Bar Plot: Program Attendance Per Capita by County
    ### 3.1
    ##plt.figure(figsize=(12, 6))
    ##attendance = df.groupby('County')['Total Program Attendance & Views Per Capita Served'].mean().sort_values()
    ##sns.barplot(x=attendance.values, y=attendance.index, hue=attendance.index, palette='viridis', legend=False)
    ##plt.title('Average Program Attendance Per Capita by County')
    ##plt.tight_layout()
    ##plt.savefig('program_attendance_bar.png')
    ##plt.show()
    ##
    ###------------------------------ OBJ 4 ------------------------------
    ### Objective 4: Funding Impact
    ### Subpart 1: Funding vs. Circulation
    ### 4.1: Binned Scatter Plot
    ##plt.figure(figsize=(10, 6))
    ##df['Income Quartile'] = pd.qcut(df['Operating Income Per Capita'], 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
    ##sns.scatterplot(x='Income Quartile', y='Circulation Per Capita Served',
    ##                hue='Income Quartile', data=df, palette='Blues', alpha=0.6, legend=False)
    ##plt.title('Circulation Per Capita by Income Quartile', fontsize=14, pad=15)
    ##plt.xlabel('Operating Income Quartile', fontsize=12)
    ##plt.ylabel('Circulation Per Capita', fontsize=12)
    ##plt.show()
    ##
    ### 4.3 Pair Plot: Income vs Performance
    ##corr_cols = [
    ##    'Operating Income Per Capita', 'Library Visits Per Capita Served',
    ##    'Circulation Per Capita Served', 'Total Program Attendance & Views Per Capita Served'
    ##]
    ##sns.pairplot(df[corr_cols], diag_kind='kde', plot_kws={'alpha': 0.6, 'color': 'purple'})
    ##plt.suptitle('Funding vs Performance Metrics', y=1.02)
    ##plt.savefig('funding_pairplot.png')
    ##plt.show()
    ###------------------------------ OBJ 5 ------------------------------
    ### 5. Box Plot: Top 10 Libraries by Circulation
    ##top_libraries = df.groupby('Library')['Circulation Per Capita Served'].mean().nlargest(10).index
    ##top_df = df[df['Library'].isin(top_libraries)]
    ##plt.figure(figsize=(12, 6))
    ##sns.boxplot(x='Library', y='Circulation Per Capita Served', hue='Library', data=top_df, palette='Set2', legend=False)
    ##plt.title('Top 10 Libraries: Circulation Per Capita')
    ##plt.xticks(rotation=45)
    ##plt.show()
    ##
    ###------------------------------ OBJ 6 ------------------------------
    ### 6. Pie Chart: County Share of Total Circulation
    ##plt.figure(figsize=(10, 8))
    ##county_circulation = df.groupby('County')['Total Circulation'].sum()
    ##plt.pie(county_circulation, labels=county_circulation.index, autopct='%1.1f%%', startangle=140,
    ##        colors=sns.color_palette('pastel'))
    ##plt.title('Total Circulation Share by County')
    ##plt.show()
    ##
    ### 7. KDE Plot: Principal vs Non-Principal Library Performance
    ##plt.figure(figsize=(12, 6))
    ##for status in df['Principal Public?'].unique():
    ##    sns.kdeplot(
    ##        data=df[df['Principal Public?'] == status],
    ##        x='Circulation Per Capita Served',
    ##        label=status, fill=True, alpha=0.4
    ##    )
    ##plt.title('Principal vs Non-Principal Libraries: Circulation Per Capita')
    ##plt.legend()
    ##plt.tight_layout()
    ##plt.savefig('principal_vs_nonprincipal_kde.png')
    ##plt.show()
    ##
    ### Summary
    ##print("\nSummary Statistics:")
    ##print(df[
    ##    ['Library Visits Per Capita Served', 'Circulation Per Capita Served',
    ##     'Total Program Attendance & Views Per Capita Served', 'Operating Income Per Capita',
    ##     'Percent of Residents with Library Cards', 'Reference Questions Per Capita Served']
    ##].describe())
    ##
    ### --- Step 5: Save Cleaned Data ---
    ##df.to_csv('cleaned_library_data.csv', index=False)
    ##print("\nCleaned dataset saved as 'cleaned_library_data.csv'")


if __name__ == "__main__":
    main()