├── Cleaned_Suicide_Data.csv ├── Death_rates_for_suicide__by_sex__race__Hispanic_origin__and_age__United_States.csv ├── Figure_1.png ├── Figure_2.png ├── Figure_3.png ├── Figure_4.png ├── Figure_5.png ├── Figure_6.png ├── Figure_7.png ├── Objective5_Multivariate_Outliers.csv ├── python report.docx └── sneha ca2.py /Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snehasuman019/PythonProject/b006584f99d8cf153dd7318a73c7c99214e9b0d0/Figure_1.png -------------------------------------------------------------------------------- /Figure_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snehasuman019/PythonProject/b006584f99d8cf153dd7318a73c7c99214e9b0d0/Figure_2.png -------------------------------------------------------------------------------- /Figure_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snehasuman019/PythonProject/b006584f99d8cf153dd7318a73c7c99214e9b0d0/Figure_3.png -------------------------------------------------------------------------------- /Figure_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snehasuman019/PythonProject/b006584f99d8cf153dd7318a73c7c99214e9b0d0/Figure_4.png -------------------------------------------------------------------------------- /Figure_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snehasuman019/PythonProject/b006584f99d8cf153dd7318a73c7c99214e9b0d0/Figure_5.png -------------------------------------------------------------------------------- /Figure_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snehasuman019/PythonProject/b006584f99d8cf153dd7318a73c7c99214e9b0d0/Figure_6.png 
-------------------------------------------------------------------------------- /Figure_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snehasuman019/PythonProject/b006584f99d8cf153dd7318a73c7c99214e9b0d0/Figure_7.png -------------------------------------------------------------------------------- /Objective5_Multivariate_Outliers.csv: -------------------------------------------------------------------------------- 1 | indicator,unit,unit_num,stub_name,stub_name_num,stub_label,stub_label_num,year,year_num,age,age_num,estimate,flag,gender,age_group 2 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 75-84 years,3.152,1950,1,75-84 years,5.2,58.3,,Male,75-84 years 3 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 75-84 years,3.152,1985,9,75-84 years,5.2,53.1,,Male,75-84 years 4 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 75-84 years,3.152,1986,10,75-84 years,5.2,54.9,,Male,75-84 years 5 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 75-84 years,3.152,1987,11,75-84 years,5.2,57.2,,Male,75-84 years 6 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 75-84 years,3.152,1988,12,75-84 years,5.2,57.3,,Male,75-84 years 7 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 75-84 years,3.152,1990,14,75-84 years,5.2,56.1,,Male,75-84 years 8 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 75-84 years,3.152,1991,15,75-84 years,5.2,53.0,,Male,75-84 years 9 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1950,1,85 years and over,6.0,58.3,,Male,85 years and over 10 | Death rates for suicide,"Deaths per 100,000 
resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1960,2,85 years and over,6.0,57.4,,Male,85 years and over 11 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1983,7,85 years and over,6.0,54.2,,Male,85 years and over 12 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1985,9,85 years and over,6.0,56.2,,Male,85 years and over 13 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1986,10,85 years and over,6.0,62.1,,Male,85 years and over 14 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1987,11,85 years and over,6.0,67.5,,Male,85 years and over 15 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1988,12,85 years and over,6.0,61.5,,Male,85 years and over 16 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1989,13,85 years and over,6.0,68.2,,Male,85 years and over 17 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1990,14,85 years and over,6.0,65.9,,Male,85 years and over 18 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1991,15,85 years and over,6.0,69.5,,Male,85 years and over 19 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1992,16,85 years and over,6.0,62.4,,Male,85 years and over 20 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1993,17,85 years and over,6.0,67.9,,Male,85 years and over 21 | Death rates for suicide,"Deaths per 100,000 
resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1994,18,85 years and over,6.0,66.2,,Male,85 years and over 22 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1995,19,85 years and over,6.0,62.7,,Male,85 years and over 23 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1996,20,85 years and over,6.0,60.5,,Male,85 years and over 24 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1997,21,85 years and over,6.0,61.0,,Male,85 years and over 25 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1998,22,85 years and over,6.0,59.8,,Male,85 years and over 26 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,1999,23,85 years and over,6.0,57.1,,Male,85 years and over 27 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,2000,24,85 years and over,6.0,57.5,,Male,85 years and over 28 | Death rates for suicide,"Deaths per 100,000 resident population, crude",2,Sex and age,3,Male: 85 years and over,3.153,2002,26,85 years and over,6.0,54.5,,Male,85 years and over 29 | -------------------------------------------------------------------------------- /python report.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/snehasuman019/PythonProject/b006584f99d8cf153dd7318a73c7c99214e9b0d0/python report.docx -------------------------------------------------------------------------------- /sneha ca2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | # Load the 
# dataset
# NOTE: the path is absolute and specific to one Windows machine; it must be
# edited before the script can run anywhere else.
df_raw = pd.read_csv("C:\\Users\\Sneha\\Desktop\\death rate py dataset.csv", encoding="Latin1")
print(df_raw)

# -------------------- NULL CHECKS --------------------
print("\n--- Null Values Before Cleaning ---")
print(df_raw.isnull().sum())

# Clean a copy so df_raw keeps the file contents exactly as loaded.
df = df_raw.copy()
# Normalise headers (trim + lowercase) so the column names used below match.
df.columns = df.columns.str.strip().str.lower()
df = df[df['estimate'].notnull()]
# Drop rows whose estimate is a text marker instead of a number. The explicit
# .copy() makes the column assignment below operate on an independent frame
# (avoids pandas' SettingWithCopyWarning).
df = df[~df['estimate'].astype(str).str.contains('Suppressed|Unreliable', na=False)].copy()
# Any value that still is not numeric becomes NaN and is removed right after.
df['estimate'] = pd.to_numeric(df['estimate'], errors='coerce')
df = df.dropna(subset=['estimate'])
print("\n--- Null Values After Cleaning ---")
print(df.isnull().sum())

# -------------------- EDA --------------------
print("\n--- Dataset Info ---")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
df.info()

print("\n--- First 5 Rows ---")
print(df.head())

print("\n--- Summary Statistics ---")
print(df['estimate'].describe())

print("\n--- Unique Values ---")
for col in ['year', 'age', 'stub_name', 'stub_label']:
    print(f"{col}: {df[col].nunique()} unique values")

print("\n--- Value Counts for 'stub_name' ---")
print(df['stub_name'].value_counts())

print("\n--- Correlation Matrix (Numerical Columns) ---")
print(df.select_dtypes(include='number').corr())

# -------------------- 1.
Suicide Rates by Age Group (Horizontal Bar Chart) -------------------- 46 | # age_df = df[(df['stub_name'].str.contains('Age')) & (df['age'] != 'All ages')] 47 | # age_grouped = age_df.groupby('age')['estimate'].mean().reset_index() 48 | 49 | # age_order = ['<1 year', '1-4 years', '5-14 years', '15-19 years', '20-24 years', '25-34 years', 50 | # '35-44 years', '45-54 years', '55-64 years', '65-74 years', '75-84 years', '85 years and over'] 51 | # age_grouped['age'] = pd.Categorical(age_grouped['age'], categories=age_order, ordered=True) 52 | # age_grouped = age_grouped.sort_values('age') 53 | 54 | # plt.figure(figsize=(10, 8)) 55 | # sns.barplot(data=age_grouped, y='age', x='estimate', hue='age', palette='viridis', legend=True) 56 | # plt.title("1a. Average Suicide Rates by Age Group (Horizontal Bar Chart)") 57 | # plt.xlabel("Suicide Rate per 100,000") 58 | # plt.ylabel("Age Group") 59 | # plt.legend(title="Age Group", bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.) 60 | # plt.grid(False) # Remove gridlines 61 | # plt.subplots_adjust(right=0.75) 62 | # plt.show() 63 | 64 | # # -------------------- 1b. Suicide Rates by Age Group (Histogram) -------------------- 65 | # plt.figure(figsize=(10, 6)) 66 | # sns.histplot(data=age_df, x='estimate', hue='age', multiple='stack', bins=30, palette='viridis') 67 | # plt.title("1b. Distribution of Suicide Rates by Age Group (Histogram)") 68 | # plt.xlabel("Suicide Rate per 100,000") 69 | # plt.ylabel("Count") 70 | # plt.legend(title="Age Group", bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.) 71 | # plt.grid(False) # Remove gridlines 72 | # plt.subplots_adjust(right=0.75) # Adjust plot to make room for legend 73 | # plt.show() 74 | 75 | 76 | 77 | 78 | 79 | # -------------------- 1. 
# Suicide Rates by Age Group (With Outlier Detection + Heatmap) --------------------
# Keep rows whose stub_name contains 'Age' (case-sensitive match) and drop the
# 'All ages' aggregate so only individual age groups remain.
age_df = df[(df['stub_name'].str.contains('Age')) & (df['age'] != 'All ages')]

# Outlier detection using IQR (Tukey fences at 1.5 * IQR), computed over the
# pooled estimates of all age groups together.
Q1 = age_df['estimate'].quantile(0.25)
Q3 = age_df['estimate'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Print how many outliers
num_outliers = ((age_df['estimate'] < lower_bound) | (age_df['estimate'] > upper_bound)).sum()
print(f"\n[Outlier Detection] Number of outliers removed from 'estimate': {num_outliers}")

# Remove outliers
age_df_clean = age_df[(age_df['estimate'] >= lower_bound) & (age_df['estimate'] <= upper_bound)]

# Group and order by age
age_grouped = age_df_clean.groupby('age')['estimate'].mean().reset_index()
age_order = ['<1 year', '1-4 years', '5-14 years', '15-19 years', '20-24 years', '25-34 years',
             '35-44 years', '45-54 years', '55-64 years', '65-74 years', '75-84 years', '85 years and over']
# NOTE(review): any age label that is not listed in age_order becomes NaN in
# the categorical and will not be drawn -- confirm the list covers the data.
age_grouped['age'] = pd.Categorical(age_grouped['age'], categories=age_order, ordered=True)
age_grouped = age_grouped.sort_values('age')

# 1a. Horizontal Bar Chart
plt.figure(figsize=(10, 8))
sns.barplot(data=age_grouped, y='age', x='estimate', hue='age', palette='viridis', legend=True)
plt.title("1a. Average Suicide Rates by Age Group (Outliers Removed)")
plt.xlabel("Suicide Rate per 100,000")
plt.ylabel("Age Group")
plt.legend(title="Age Group", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(False)
plt.subplots_adjust(right=0.75)  # leave room for the legend outside the axes
plt.show()

# 1b. Histogram (with cleaned data)
plt.figure(figsize=(10, 6))
hist_ax = sns.histplot(data=age_df_clean, x='estimate', hue='age', multiple='stack', bins=30, palette='viridis')
plt.title("1b. Distribution of Suicide Rates by Age Group (Outliers Removed)")
plt.xlabel("Suicide Rate per 100,000")
plt.ylabel("Count")
# histplot builds its hue legend from handles that are not labelled artists on
# the axes, so a bare plt.legend(...) would replace it with an empty legend.
# sns.move_legend re-creates the existing legend with the new placement/title.
if hist_ax.get_legend() is not None:
    sns.move_legend(hist_ax, "upper left", bbox_to_anchor=(1.05, 1), title="Age Group")
plt.grid(False)
plt.subplots_adjust(right=0.75)  # leave room for the legend outside the axes
plt.show()

# 1c. Heatmap: Correlation Between Numerical Features (if any)
# First ensure the relevant numerical columns are selected
numerical_cols = age_df_clean.select_dtypes(include='number')
correlation_matrix = numerical_cols.corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("1c. Correlation Heatmap of Numerical Features (Age-Related Data)")
plt.show()

# -------------------- 2. Compare Suicide Rates Across Genders --------------------
# NOTE(review): 'Sex' is a substring match, so every stub_name containing it
# (e.g. 'Sex and age') is included and each distinct stub_label gets its own
# line -- confirm whether only the plain Male/Female series was intended.
gender_df = df[df['stub_name'].str.contains('Sex') & df['stub_label'].str.contains('Female|Male')]
gender_grouped = gender_df.groupby(['year', 'stub_label'])['estimate'].mean().reset_index()

plt.figure(figsize=(10, 6))
sns.lineplot(data=gender_grouped, x='year', y='estimate', hue='stub_label', marker='o')
plt.title("2. Suicide Rates Over Time by Gender")
plt.xlabel("Year")
plt.ylabel("Suicide Rate per 100,000")
plt.legend(title="Gender", bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.grid(False)  # Remove gridlines
plt.subplots_adjust(right=0.75)  # Adjust plot to make room for legend
plt.show()

# -------------------- 3.
# Impact of Race and Ethnicity --------------------
# Rows whose stub_name mentions race or Hispanic origin (case-insensitive;
# missing stub_name values are treated as non-matching).
race_df = df[df['stub_name'].str.contains('Race|Hispanic', case=False, na=False)]
race_grouped = race_df.groupby(['year', 'stub_label'])['estimate'].mean().reset_index()

plt.figure(figsize=(14, 8))
# Limit the plot to the ten labels that appear in the most years so the
# legend stays readable.
top_races = race_grouped['stub_label'].value_counts().head(10).index
sns.lineplot(data=race_grouped[race_grouped['stub_label'].isin(top_races)],
             x='year', y='estimate', hue='stub_label', marker='o')
plt.title("3. Suicide Rates by Race and Ethnicity Over Time")
plt.xlabel("Year")
plt.ylabel("Suicide Rate per 100,000")
# The hue here is the race/ethnicity label, so the legend is titled to match
# (it previously read "Age Group", copied from the age section).
plt.legend(title="Race / Ethnicity", bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
plt.grid(False)  # Remove gridlines
plt.subplots_adjust(right=0.75)  # Adjust plot to make room for legend
plt.show()

# -------------------- 4. Time Series Analysis of National Suicide Trends --------------------
# 'All persons' rows only; averaged per year in case a year has several rows.
total_df = df[df['stub_label'] == 'All persons']
national_trend = total_df.groupby('year')['estimate'].mean().reset_index()
plt.figure(figsize=(10, 6))
sns.lineplot(data=national_trend, x='year', y='estimate', marker='o')
plt.title("4. National Suicide Rate Trend Over Time")
plt.xlabel("Year")
plt.ylabel("Suicide Rate per 100,000")
plt.grid(False)  # Remove gridlines
plt.tight_layout()
plt.show()
# -------------------- 5.
# Multivariate Interaction: Gender vs Age --------------------
multi_df = df[df['stub_name'].str.contains('Sex and age')].copy()
# Split labels of the form "<Male|Female>: <age group>" into two columns;
# labels that do not match the pattern yield NaN and are dropped below.
extracted = multi_df['stub_label'].str.extract(r'(Male|Female):\s*(.*)')
multi_df['gender'] = extracted[0]
multi_df['age_group'] = extracted[1]
multi_df = multi_df.dropna(subset=['gender', 'age_group'])
multi_df = multi_df[multi_df['age_group'] != 'All ages']
plt.figure(figsize=(16, 8))
sns.lineplot(data=multi_df, x='year', y='estimate', hue='gender', style='age_group', markers=True)
plt.title("5. Multivariate Interaction: Gender vs Age Over Time")
plt.xlabel("Year")
plt.ylabel("Suicide Rate per 100,000")
plt.grid(False)  # Remove gridlines
plt.subplots_adjust(right=0.75)  # Adjust plot to make room for legend

# Build two separate legends (gender, age group) from the plot's entries.
# Entries are selected by label value rather than by position, so any extra
# section-title entries seaborn may insert are ignored.
ax = plt.gca()
handles, labels = ax.get_legend_handles_labels()
age_levels = set(multi_df['age_group'].unique())
gender_handles = [h for h, label in zip(handles, labels) if label in ('Male', 'Female')]
gender_labels = [label for label in labels if label in ('Male', 'Female')]
age_handles = [h for h, label in zip(handles, labels) if label in age_levels]
age_labels = [label for label in labels if label in age_levels]

# An Axes holds only one "current" legend: creating the second one would
# discard the first. Registering the first as a plain artist keeps it drawn.
gender_legend = ax.legend(gender_handles, gender_labels, title="Gender",
                          bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
ax.add_artist(gender_legend)
ax.legend(age_handles, age_labels, title="Age Group",
          bbox_to_anchor=(1.05, 0.7), loc='upper left', borderaxespad=0.)
plt.show()
# Save cleaned DataFrame to CSV
df.to_csv("Cleaned_Suicide_Data.csv", index=False)
print("Cleaned data saved as Cleaned_Suicide_Data.csv")
# --------------------------------------------------------------------------------