├── README.md
└── data


/README.md:
--------------------------------------------------------------------------------
 1 | # Hospital Dataset Visualization & Analysis
 2 | 
 3 | This is a **Python-based data visualization and statistical analysis project** that explores a hospital dataset.
 4 | I used pandas, seaborn, matplotlib, and SciPy to perform data cleaning, generate visualizations, and run statistical tests
 5 | to uncover insights about patient demographics, medical conditions, billing, and hospital stay patterns.
 6 | 
 7 | # Features
 8 | * "Data Cleaning & Preprocessing"
 9 | 
10 |   * Converts admission and discharge dates to 'datetime' format.
11 |   * Calculates "length of stay" for each patient.
12 |   * Handles categorical encoding for correlation analysis.
13 | 
14 | * "Data Exploration"
15 | 
16 |   * Displays basic dataset information, statistics, and missing values.
17 | 
18 | * "Visualizations"
19 | 
20 |   1. Blood Type Distribution – Bar chart of patient blood types.
21 |   2. Medical Condition Frequency – Count plot of the most common conditions.
22 |   3. Average Billing Over Time – Line chart showing yearly trends.
23 |   4. Billing by Admission Type – Box plot comparison.
24 |   5. Age Distribution by Gender – Histogram with FacetGrid.
25 |   6. Correlation Matrix – Heatmap of numeric & encoded categorical variables.
26 |   7. Pairplots – Scatter matrix of numerical features (with and without gender hue).
27 |   8. Blood Type Pie Chart – Patient percentage distribution.
28 | 
29 | # Statistical Analysis
30 | 
31 |   * Performs a Welch's t-test (unequal variances) to compare billing amounts between male and female patients.
32 | 
33 | # Technologies Used
34 | 
35 | * "Python 3.x"
36 | * Pandas – Data loading and preprocessing
37 | * NumPy – Numerical operations
38 | * Seaborn – Statistical visualizations
39 | * Matplotlib – Plotting
40 | * SciPy – Statistical tests
41 | 
42 | # Project Structure
43 | 
44 | hospital_analysis.py   # Main script with all visualizations and analysis
45 | hospital.csv           # Dataset file
46 | 
47 | # Installation & Setup
48 | 
49 | 1. Clone the repository
50 | 
51 |    ```bash
52 |    git clone https://github.com/hemanthcharano/hospital-data-visualization.git
53 |    cd hospital-data-visualization
54 |    ```
55 | 2. Install dependencies
56 | 
57 |    ```bash
58 |    pip install pandas numpy seaborn matplotlib scipy
59 |    ```
60 | 3. Place your dataset
61 | 
62 |    * Save 'hospital.csv' in the same directory as the script.
63 |    * Ensure it contains columns like:
64 |      Date of Admission, Discharge Date, Blood Type, Medical Condition,
65 |      Age, Billing Amount, Admission Type, Gender, Room Number
66 |      
67 | 4. Run the script
68 |    ```bash
69 |    python hospital_analysis.py
70 |    ```
71 | # How It Works
72 | 
73 | 1. Loads the CSV dataset into a pandas DataFrame.
74 | 2. Preprocesses dates and computes the stay length.
75 | 3. Generates multiple visualizations for different insights.
76 | 4. Performs a T-test to check if billing amounts differ significantly by gender.
77 | 5. Displays results and plots sequentially.
78 |    
79 | #  Example Output
80 | 
81 | * "Blood Type Distribution"
82 |   A pastel bar chart showing the count of patients per blood type.
83 | * "Average Billing Over Years"
84 |   A line chart highlighting billing trends over time.
85 | * "T-test Result"
86 | 
87 |   T-test result: t = -1.45, p = 0.1482
88 | 
89 |   Interpretation: No statistically significant difference in billing between genders (at 5% significance level).
90 | 
91 | # Notes
92 | 
93 | * The dataset file 'hospital.csv' is **not included** in this repository for privacy reasons.
94 | * Ensure date columns in the dataset are in a format recognizable by 'pandas.to_datetime()'.
95 | * Some visualizations may require enough data diversity (e.g., at least two genders or multiple years).
96 | 


--------------------------------------------------------------------------------
/data:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import seaborn as sns
 4 | import matplotlib.pyplot as plt
 5 | from scipy import stats
 6 | 
 7 | # Load the dataset
 8 | df = pd.read_csv("C:\\Users\\heman\\Downloads\\hospital.csv")
 9 | # Set base theme
10 | sns.set_theme(style="whitegrid")
11 | # Data preprocessing
12 | df['Date of Admission'] = pd.to_datetime(df['Date of Admission'])
13 | df['Discharge Date'] = pd.to_datetime(df['Discharge Date'])
14 | df['Stay Length'] = (df['Discharge Date'] - df['Date of Admission']).dt.days
# 1. Display basic info: first rows, schema, summary statistics, missing values
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())
print(df.isnull().sum())
# 2. Bar plot: number of patients per blood type
blood_fig, blood_ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    data=df,
    x='Blood Type',
    hue='Blood Type',  # hue repeats x so the palette colours each bar; legend is hidden
    palette='pastel',
    legend=False,
    ax=blood_ax,
)
blood_ax.set_title("Blood Type Distribution", fontsize=14, fontweight='bold')
blood_ax.set_xlabel("Blood Type")
blood_ax.set_ylabel("Count")
blood_fig.tight_layout()
plt.show()
28 | 
# 3. Medical conditions as horizontal bars, ordered by frequency (most common first)
condition_order = df['Medical Condition'].value_counts().index
cond_fig, cond_ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    data=df,
    y='Medical Condition',
    hue='Medical Condition',
    order=condition_order,
    palette='crest',
    legend=False,
    ax=cond_ax,
)
cond_ax.set_title("Most Common Medical Conditions", fontsize=14)
cond_fig.tight_layout()
plt.show()
# 4. Line graph - average billing amount per admission year
df['Year'] = df['Date of Admission'].dt.year
mean_billing_by_year = df.groupby('Year')['Billing Amount'].mean()
# Turn the grouped Series back into a two-column frame (Year, Billing Amount)
yearly_avg = mean_billing_by_year.reset_index()
year_fig, year_ax = plt.subplots(figsize=(8, 5))
sns.lineplot(data=yearly_avg, x='Year', y='Billing Amount', marker='o', ax=year_ax)
year_ax.set_title("Avg Billing Amount Over Years")
plt.show()
# 5. Box plot comparing billing amounts across admission types
adm_fig, adm_ax = plt.subplots(figsize=(8, 5))
sns.boxplot(
    data=df,
    x='Admission Type',
    y='Billing Amount',
    hue='Admission Type',
    palette='flare',
    legend=False,
    ax=adm_ax,
)
adm_ax.set_title("Billing Amount by Admission Type", fontsize=14)
adm_fig.tight_layout()
plt.show()
48 | 
# 6. FacetGrid: age histogram (with KDE overlay), one column per gender value
g = sns.FacetGrid(df, col="Gender", height=5, aspect=1.2)
# hue is passed here as well (carried over from the original script) to avoid
# warnings about supplying a palette without a hue variable.
g.map_dataframe(sns.histplot, x="Age", kde=True, hue="Gender", multiple="stack", palette="Set2", edgecolor="black")
g.set_axis_labels("Age", "Frequency")
# Use the grid's own `figure` attribute: `FacetGrid.fig` is a deprecated alias,
# and addressing the figure directly avoids relying on pyplot's "current figure".
g.figure.suptitle("Age Distribution by Gender", fontsize=16, fontweight='bold')
g.tight_layout()
# Reserve head-room at the top so the suptitle does not collide with facet titles.
g.figure.subplots_adjust(top=0.85)
plt.show()
57 | 
# 7. Correlation matrix over numeric columns plus integer-encoded categoricals
# Each categorical column is converted to its pandas category codes and stored
# under a new "<name>_Code" column.
encoded_columns = {
    'Gender_Code': 'Gender',
    'Admission Type_Code': 'Admission Type',
}
for code_col, source_col in encoded_columns.items():
    df[code_col] = df[source_col].astype('category').cat.codes

corr_columns = ['Age', 'Billing Amount', 'Stay Length', 'Gender_Code', 'Admission Type_Code']
corr_matrix = df[corr_columns].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu", fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix (Encoded)", fontsize=14)
plt.tight_layout()
plt.show()
# 8. Pairplot of the numerical features (histograms on the diagonal)
numeric_features = ['Age', 'Billing Amount', 'Room Number']
sns.pairplot(df[numeric_features], diag_kind='hist')
plt.suptitle('Pairplot of Numerical Features', y=1.02)
plt.show()

# 9. Same features again, this time coloured by gender
features_with_gender = numeric_features + ['Gender']
sns.pairplot(df[features_with_gender], hue='Gender', palette='husl')
plt.show()
# Pie chart: percentage of patients per blood type.
# (The unused `gender_counts` computation was removed; only blood-type counts
# feed this chart.)
blood_counts = df['Blood Type'].value_counts()
plt.figure(figsize=(7, 7))
plt.pie(
    blood_counts,
    labels=blood_counts.index,
    autopct='%1.1f%%',  # label each wedge with its percentage, one decimal place
    startangle=140,
    colors=sns.color_palette('Set3'),
)
plt.title("Patient Distribution by Blood Group")
plt.axis('equal')  # equal axis scaling keeps the pie circular
plt.show()
88 | 
# 10. T-test for billing: compare billing amounts of male vs. female patients.
# Missing billing values are dropped before testing: ttest_ind propagates NaN
# by default, so a single missing amount would turn both t and p into NaN.
# `.loc` selects rows and column in one step instead of chained indexing.
male_bills = df.loc[df['Gender'] == 'Male', 'Billing Amount'].dropna()
female_bills = df.loc[df['Gender'] == 'Female', 'Billing Amount'].dropna()
# equal_var=False selects Welch's t-test, which does not assume equal variances.
t_stat, p_val = stats.ttest_ind(male_bills, female_bills, equal_var=False)
print(f"\nT-test result: t = {t_stat:.2f}, p = {p_val:.4f}")
94 | 


--------------------------------------------------------------------------------