# ├── Project.py
# ├── README.md
# └── railway.csv
#
# /Project.py:
# --------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

data = pd.read_csv("D:/UK+Train+Rides/railway.csv")

# Display basic info.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
data.info()
print(data.head())

# Check for missing values (count of nulls per column).
missing_values = data.isnull().sum()
print(missing_values)

# Handling the missing values

# Drop rows with critical missing values: a record without a price or without
# either endpoint cannot contribute to any of the analyses below.
data.dropna(subset=['Price', 'Departure Station', 'Arrival Destination'], inplace=True)

# Replacement values for the remaining nullable columns.
# NOTE(review): filling 'Actual Arrival Time' with the most common arrival
# time invents an arrival for journeys that have none recorded, and those
# invented times feed into any later delay calculation -- confirm this is
# really wanted.
fill_values = {
    'Reason for Delay': 'No Delay',
    'Refund Request': 'No',
    'Railcard': 'No Railcard',
    'Actual Arrival Time': data['Actual Arrival Time'].mode()[0],
}

# Assign the filled column back instead of calling
# data[col].fillna(..., inplace=True): that form is chained assignment on a
# temporary Series, which warns on pandas 2.x and has no effect once
# Copy-on-Write is enabled.
for column, replacement in fill_values.items():
    data[column] = data[column].fillna(replacement)

#1. Objective Ticket Sales Trends by Line Plot Graph

data['Date of Purchase'] = pd.to_datetime(data['Date of Purchase'])

# Total revenue per purchase date.
sales_over_time = data.groupby('Date of Purchase')['Price'].sum()
plt.figure(figsize=(10, 6))
sales_over_time.plot(kind='line')
plt.title('Total Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Total Sales (£)')
plt.grid(True)
plt.show()

#2.
# Objective Delay Patterns on Heatmap

# Hour of day taken from the HH:MM:SS departure time strings.
data['Departure Hour'] = pd.to_datetime(data['Departure Time'], format='%H:%M:%S').dt.hour

delayed_data = data[data['Journey Status'] == 'Delayed']

# Count of delayed journeys for every (station, hour) pair; pairs with no
# delayed journey show 0 instead of NaN.
heatmap_data = delayed_data.pivot_table(
    index='Departure Station',
    columns='Departure Hour',
    aggfunc='size',
    fill_value=0,
)

plt.figure(figsize=(14, 8))
sns.heatmap(heatmap_data, cmap='Reds', annot=True, fmt='d')
plt.title('Number of Delays by Station and Hour')
plt.xlabel('Hour of Day')
plt.ylabel('Departure Station')
plt.show()

#3. Objective Ticket Pricing Insights on Box plot

plt.figure(figsize=(10, 6))
# NOTE(review): recent seaborn releases warn when `palette` is given without
# `hue`; left unchanged here because the replacement spelling depends on the
# installed seaborn version.
sns.boxplot(data=data, x='Ticket Type', y='Price', palette='magma')
plt.title('Ticket Prices by Ticket Type')
plt.xlabel('Ticket Type')
plt.ylabel('Price (£)')
plt.grid(True)
plt.show()

#4. Objective Journey Delays on Bar Chart

data['Scheduled Arrival'] = pd.to_datetime(data['Arrival Time'], format='%H:%M:%S')

data['Actual Arrival'] = pd.to_datetime(data['Actual Arrival Time'], format='%H:%M:%S')

# Both columns hold only a time of day, so they are parsed onto the same
# placeholder date. A train scheduled for 23:50 that arrives at 00:10 would
# therefore come out as -1420 minutes instead of +20. Wrapping the difference
# into the half-open range [-720, 720) minutes repairs journeys that cross
# midnight in either direction.
# Assumes no genuine delay (or early arrival) exceeds 12 hours.
minutes_per_day = 24 * 60
half_day = minutes_per_day / 2
raw_delay = (data['Actual Arrival'] - data['Scheduled Arrival']).dt.total_seconds() / 60
data['Delay Minutes'] = (raw_delay + half_day) % minutes_per_day - half_day

data['Route'] = data['Departure Station'] + ' → ' + data['Arrival Destination']

# Mean delay per route; routes whose delays are all missing are dropped.
route_delays = data.groupby('Route')['Delay Minutes'].mean().dropna()

plt.figure(figsize=(14, 8))
route_delays.sort_values(ascending=False).plot(kind='bar')
plt.title('Average Delay by Route')
plt.xlabel('Route')
plt.ylabel('Average Delay (minutes)')
plt.grid(True)
plt.show()

#5.
# Objective Price vs Journey Distance on Scatter Plot

# The distance column is synthetic: one random integer per row, drawn from
# 50 up to (but not including) 500, with a fixed seed so that every run
# produces the same values.
np.random.seed(0)
row_count = len(data)
data['Estimated Distance (km)'] = np.random.randint(50, 500, size=row_count)

# Scatter of ticket price against the generated distance, drawn through the
# Axes object rather than the pyplot state machine.
_, axis = plt.subplots(figsize=(10, 6))
axis.scatter(data['Estimated Distance (km)'], data['Price'], color='crimson')
axis.set_title('Ticket Price vs Estimated Journey Distance')
axis.set_xlabel('Estimated Distance (km)')
axis.set_ylabel('Price (£)')
axis.grid(True)
plt.show()

# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# # EDA-For-Python
# This project performs Exploratory Data Analysis on UK railway ticket data. It includes visual insights into ticket sales trends, delay patterns by station and time, ticket pricing distributions, route-based journey delays, and the relationship between price and estimated travel distance.
# --------------------------------------------------------------------------------