"""Exploratory analysis of literacy in Jharkhand by age group and gender.

Project files: ``DDW-2000C-08.xlsx`` (data), ``README.md`` and this script
(``mainFile.py``).

From the README: the dataset comes from the Census of India and provides
detailed information about educational attainment levels across different
age groups and genders in Jharkhand state. The data is broken down by
rural/urban areas and includes categories from illiteracy through graduate
education.

Project log (from the README):
    Day 2: changed 5th column to header column.
    Day 3: renamed columns and did simple EDA.

Run as a script to print a few summaries and show the charts, optionally
passing the workbook path as the first command-line argument.
"""

import sys

import numpy as np
import pandas as pd

# NOTE: machine-specific default kept from the original script; pass another
# path on the command line (or to ``main``) when running elsewhere.
DATA_PATH = "C:\\Python CA2\\DDW-2000C-08.xlsx"

ID_COLUMNS = [
    'Table_Name', 'State_Code', 'Distt_Code', 'Area_Name',
    'Total_Rural_Urban', 'Age_group',
]
GENDERS = ('Persons', 'Males', 'Females')
CATEGORIES = [
    'Total', 'Illiterate', 'Literate', 'Literate_wo_Edu', 'Below_Primary',
    'Primary', 'Middle', 'Matric_Secondary', 'Higher_Secondary',
    'NonTech_Diploma', 'Tech_Diploma', 'Graduate', 'Unclassified',
]
# Six identifier columns followed by a Persons/Males/Females triple per
# category, e.g. 'Persons_Total', 'Males_Total', 'Females_Total', ...
COLUMNS = ID_COLUMNS + [f'{g}_{c}' for c in CATEGORIES for g in GENDERS]
NUMERIC_COLUMNS = COLUMNS[len(ID_COLUMNS):]

ALL_AGES = 'All ages'
EDU_COLS = [
    'Persons_Below_Primary', 'Persons_Primary', 'Persons_Middle',
    'Persons_Matric_Secondary', 'Persons_Higher_Secondary', 'Persons_Graduate',
]
GENDER_GAP_COLS = [
    'Males_Literate', 'Females_Literate',
    'Males_Graduate', 'Females_Graduate',
    'Males_Matric_Secondary', 'Females_Matric_Secondary',
]
LINE_AGE_GROUPS = [
    '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
    '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59',
    '60-64', '65-69',
]


def normalize_age_group(value):
    """Return an age-group cell as the text label used for comparisons.

    Strings are stripped; whole numbers (``7`` or ``7.0``) become ``'7'`` so
    they match the string labels in ``LINE_AGE_GROUPS``. Anything else
    (including NaN) is returned unchanged.
    """
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, (int, np.integer)):
        return str(int(value))
    if isinstance(value, (float, np.floating)) and float(value).is_integer():
        return str(int(value))
    return value


def extract_area_type(area_name):
    """Return the second ``' - '``-separated part of an area label, or NaN.

    NOTE(review): for a label such as 'District - X' this yields 'X' (the
    name rather than the kind of area). Behaviour is kept from the original
    script; the resulting column is not used by any chart - confirm intent.
    """
    text = str(area_name)
    return text.split(' - ')[1] if ' - ' in text else np.nan


def extract_district(area_name):
    """Return the last ``' - '``-separated part of an area label, or NaN.

    Labels without ``' - '`` and the state-level label 'State - JHARKHAND'
    give NaN, so they drop out of every ``groupby('District')``.
    """
    text = str(area_name)
    if ' - ' not in text or text == 'State - JHARKHAND':
        return np.nan
    return text.split(' - ')[-1]


def clean_data(raw):
    """Turn the raw worksheet into a tidy frame with named columns.

    Steps: drop fully-empty rows/columns, apply ``COLUMNS``, coerce the count
    columns to numbers, drop rows whose counts are all zero/non-numeric, and
    add the derived ``Area_Type`` and ``District`` columns.

    Raises:
        ValueError: if the sheet does not have ``len(COLUMNS)`` columns left
            after empty ones are dropped (raised by the column assignment).
    """
    # .copy() so the assignments below act on an independent frame rather
    # than a slice of *raw* (avoids SettingWithCopy issues).
    df = raw.dropna(how='all').dropna(axis=1, how='all').copy()
    df.columns = COLUMNS

    df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].apply(pd.to_numeric, errors='coerce')
    print(df['Age_group'])

    # Only the count columns are zero-filled; filling the text columns with
    # the integer 0 would leave them with mixed types.
    df[NUMERIC_COLUMNS] = df[NUMERIC_COLUMNS].fillna(0)
    # Rows with no non-zero count carry no data (e.g. leftover header rows
    # whose text was coerced to NaN above).
    df = df[(df[NUMERIC_COLUMNS] != 0).any(axis=1)].copy()

    # Single ages may be read from Excel as numbers; compare them as text.
    df['Age_group'] = df['Age_group'].map(normalize_age_group)
    df['Area_Type'] = df['Area_Name'].apply(extract_area_type)
    df['District'] = df['Area_Name'].apply(extract_district)
    return df


def load_data(path=DATA_PATH):
    """Read the workbook at *path*, print quick summaries, return clean data."""
    raw = pd.read_excel(path)
    print(raw.head(15))
    print(raw.describe())
    raw.info()  # info() prints its report itself and returns None
    return clean_data(raw)


def _is_total(df):
    """Boolean mask of rows whose ``Total_Rural_Urban`` label is 'Total'."""
    labels = df['Total_Rural_Urban'].astype(str).str.strip().str.casefold()
    return labels == 'total'


def district_totals(df, columns):
    """Sum *columns* per district over the 'All ages' / 'Total' rows.

    Restricting to 'Total' rows matters: the table also carries 'Rural' and
    'Urban' rows (see ``plot_rural_urban_share``), and adding those to the
    'Total' row would count people more than once.

    *columns* may be one column name (returns a Series) or a list of names
    (returns a DataFrame).
    """
    mask = (df['Age_group'] == ALL_AGES) & _is_total(df)
    return df[mask].groupby('District')[columns].sum()


def literacy_rate_by_age(df, age_groups=LINE_AGE_GROUPS):
    """Return mean male/female literacy rate (%) per age group.

    Uses 'Total' rows only and averages the per-row rates within each age
    group; the result is indexed by *age_groups* in the given order (groups
    absent from the data come back as NaN). Rows with a zero denominator
    are ignored instead of contributing inf/NaN to the mean.
    """
    mask = df['Age_group'].isin(age_groups) & _is_total(df)
    subset = df[mask].copy()
    for gender in ('Males', 'Females'):
        total = subset[f'{gender}_Total']
        # where() blanks zero totals so the division cannot produce inf.
        subset[f'Literacy_Rate_{gender}'] = (
            subset[f'{gender}_Literate'] / total.where(total != 0) * 100
        )
    rate_cols = ['Literacy_Rate_Males', 'Literacy_Rate_Females']
    return subset.groupby('Age_group')[rate_cols].mean().reindex(age_groups)


# Plotting libraries are imported inside the plotting functions so the
# data-preparation helpers above stay importable without them.

def _bar_chart(data, title, ylabel, figsize):
    """Show *data* (indexed by district) as a bar chart."""
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=figsize)
    data.plot(kind='bar', ax=ax, rot=45)
    ax.set_title(title)
    ax.set_xlabel('District')
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    plt.show()


def plot_rural_urban_share(df):
    """Show the rural vs urban split of the 'All ages' population as a pie."""
    import matplotlib.pyplot as plt

    not_total = ~df['Total_Rural_Urban'].str.contains('Total', case=False, na=False)
    # NOTE(review): this sums every non-'Total' row, state-level and
    # district-level alike; the shares should be unaffected if the state row
    # equals the sum of its districts - confirm against the data.
    share = (
        df[(df['Age_group'] == ALL_AGES) & not_total]
        .groupby('Total_Rural_Urban')['Persons_Total']
        .sum()
    )
    plt.figure(figsize=(12, 6))
    share.plot(kind='pie', autopct='%1.1f%%')
    plt.title('Population Distribution: Rural vs Urban')
    plt.ylabel('')
    plt.show()


def plot_gender_gap_heatmap(df):
    """Show the correlation matrix of male/female education counts."""
    import matplotlib.pyplot as plt
    import seaborn as sns

    plt.figure(figsize=(12, 8))
    sns.heatmap(df[GENDER_GAP_COLS].corr(), annot=True, cmap='coolwarm', center=0)
    plt.title('Gender Gap in Education (Correlation)')
    plt.tight_layout()
    plt.show()


def plot_literacy_by_age(df):
    """Show male and female literacy rates across age groups as lines."""
    import matplotlib.pyplot as plt

    rates = literacy_rate_by_age(df)
    plt.figure(figsize=(12, 6))
    plt.plot(rates.index, rates['Literacy_Rate_Males'],
             marker='o', label='Males', color='blue')
    plt.plot(rates.index, rates['Literacy_Rate_Females'],
             marker='o', label='Females', color='red')
    plt.title('Literacy Rate by Age Group (Males vs. Females)')
    plt.xlabel('Age Group')
    plt.ylabel('Literacy Rate (%)')
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()


def main(path=DATA_PATH):
    """Load the workbook at *path* and show every chart in order."""
    df = load_data(path)

    population = district_totals(df, 'Persons_Total').sort_values(ascending=False)
    _bar_chart(population, 'Total Population by District',
               'Population Count', (14, 6))
    _bar_chart(district_totals(df, ['Males_Total', 'Females_Total']),
               'Gender Distribution by District', 'Population Count', (12, 6))
    _bar_chart(district_totals(df, ['Persons_Illiterate', 'Persons_Literate']),
               'Literacy Status by District', 'Population Count', (12, 6))
    _bar_chart(district_totals(df, EDU_COLS),
               'Education Level Distribution by District',
               'Population Count', (14, 8))

    plot_rural_urban_share(df)
    plot_gender_gap_heatmap(df)

    graduates = district_totals(df, 'Persons_Graduate')
    _bar_chart(graduates.sort_values(ascending=False).head(10),
               'Top Districts by Graduate Population', 'Graduate Count', (12, 6))

    plot_literacy_by_age(df)


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else DATA_PATH)