├── README.md ├── PYTHON PROJECT REPORT.docx └── Python Project ├── railway_data_dictionary.csv └── project.py /README.md: -------------------------------------------------------------------------------- 1 | # UK-Train-Rides 2 | Python Project With Its Modules 3 | -------------------------------------------------------------------------------- /PYTHON PROJECT REPORT.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ankitkr16/UK-Train-Rides/HEAD/PYTHON PROJECT REPORT.docx -------------------------------------------------------------------------------- /Python Project/railway_data_dictionary.csv: -------------------------------------------------------------------------------- 1 | Field,Description 2 | Transaction ID,Unique identifier for an individual train ticket purchase 3 | Date of Purchase,Date the ticket was purchased 4 | Time of Purchase,Time the ticket was purchased 5 | Purchase Type,Whether the ticket was purchased online or directly at a train station 6 | Payment Method,"Payment method used to purchase the ticket (Contactless, Credit Card, or Debit Card)" 7 | Railcard,"Whether the passenger is a National Railcard holder (Adult, Senior, or Disabled) or not (None). Railcard holders get 1/3 off their ticket purchases." 8 | Ticket Class,Seat class for the ticket (Standard or First) 9 | Ticket Type,When you bought or can use the ticket. Advance tickets are 1/2 off and must be purchased at least a day prior to departure. Off-Peak tickets are 1/4 off and must be used outside of peak hours (weekdays between 6-8am and 4-6pm). Anytime tickets are full price and can be bought and used at any time during the day. 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


def _explode_slice(n_slices, which, offset=0.1):
    """Return a pie ``explode`` sequence of length ``n_slices``.

    Only the wedge at position ``which`` is pulled out by ``offset``.
    Sizing the sequence from the data keeps ``plt.pie`` from raising when a
    column has more or fewer categories than a hard-coded tuple expects.
    """
    return [offset if i == which else 0 for i in range(n_slices)]


# Load the raw ticket transactions and print a quick overview.
df = pd.read_csv("railway.csv")
print(df)

# Bare expressions are discarded when run as a script, so print them explicitly.
print(df.shape)
print(df.describe())
df.info()

# Count by payment method and show on a pie chart.
payment_counts = df["Payment Method"].value_counts()

plt.figure(figsize=(4, 4))
plt.pie(payment_counts, shadow=True, labels=payment_counts.index, autopct="%1.1f%%")
plt.title("Payment Method Distribution")
plt.show()

# Ticket types among tickets purchased at a station, shown as a pie chart
# with the most common type pulled out.
print(df["Purchase Type"].value_counts())
df_station = df[df["Purchase Type"] == "Station"]
station_ticket_counts = df_station["Ticket Type"].value_counts()
plt.figure(figsize=(4, 4))
plt.pie(
    station_ticket_counts,
    shadow=True,
    labels=station_ticket_counts.index,
    autopct="%1.1f%%",
    colors=["#c85d46", "#467fc8", "#c8466b"],
    explode=_explode_slice(len(station_ticket_counts), 0),
)
plt.title("Ticket In Advanced Booked From Station")
plt.show()

# Railcard types among tickets purchased online (pie, then bar chart).
df_online = df[df["Purchase Type"] == "Online"]
online_railcard_counts = df_online["Railcard"].value_counts()
plt.figure(figsize=(4, 4))
plt.pie(
    online_railcard_counts,
    shadow=True,
    labels=online_railcard_counts.index,
    autopct="%1.1f%%",
    colors=["g", "r", "teal"],
    explode=_explode_slice(len(online_railcard_counts), 1),
)
plt.title("Purchase Ticket from Online by Railcard Type")
plt.show()

plt.figure(figsize=(4, 3))
plt.bar(
    online_railcard_counts.index,
    online_railcard_counts.values,
    color=["#c85d46", "#467fc8", "#c8466b"],
    width=0.4,
)
plt.title("Railcard Usage in Online Purchases")
plt.ylabel("Count")
plt.show()

# Histogram of ticket prices with a line through the bin centres.
print(df.Price)
bin_counts, bin_edges, _ = plt.hist(
    df.Price, color="#467fc8", edgecolor="black", alpha=0.7, label="Hist"
)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
plt.plot(
    bin_centers,
    bin_counts,
    marker='o',
    linestyle='-',
    color="red",
    markersize=6,
    label="Price Trend",
)
plt.xticks(np.arange(0, 250, 20).tolist())
plt.title("Price Distribution")
plt.xlabel("Price")
plt.ylabel("Frequency")
plt.legend()
plt.show()

# Visualize the count of each "Journey Status" using a simple bar graph.
journey_counts = df["Journey Status"].value_counts()

plt.figure(figsize=(8, 5))
plt.bar(journey_counts.index, journey_counts.values, edgecolor="black", alpha=0.8, width=0.4)

plt.title("Journey Status Count", fontsize=14, color="darkblue")
plt.xlabel("Journey Status", fontsize=12, color="purple")
plt.ylabel("Count", fontsize=12, color="purple")
plt.xticks(fontsize=10)
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.show()


#Fill missing values in the "Reason for Delay" column using the most frequent reason (mode) and visualize the distribution using a line or scatter plot.
# Replace missing delay reasons with the modal reason, then count each reason.
# NOTE(review): rows with no reason are presumably journeys that were not
# delayed, so this relabels them with the commonest cause - confirm that is
# the intended cleaning rule.
delay_modes = df["Reason for Delay"].mode()
if not delay_modes.empty:  # an all-missing column has no mode; leave it as is
    df["Reason for Delay"] = df["Reason for Delay"].fillna(delay_modes.iloc[0])
delay_counts = df["Reason for Delay"].value_counts()


def _decorate_delay_plot():
    """Apply the title, labels, tick rotation and grid shared by both delay plots."""
    plt.title("Counts of Reasons for Delay (After Filling Missing Values)", fontsize=14, color="darkblue")
    plt.xlabel("Reason for Delay", fontsize=12, color="purple")
    plt.ylabel("Count", fontsize=12, color="purple")
    plt.xticks(rotation=45, ha="right", fontsize=10)
    plt.grid(axis="y", linestyle="--", alpha=0.6)


# Line plot of the counts per reason.
plt.figure(figsize=(10, 5))
plt.plot(delay_counts.index, delay_counts.values, marker="o", linestyle="-", color="b", markerfacecolor="red")
_decorate_delay_plot()
plt.show()

# Scatter plot of the same counts.
plt.figure(figsize=(10, 5))
plt.scatter(delay_counts.index, delay_counts.values, color="red", edgecolor="black")
_decorate_delay_plot()
plt.show()


#Clean the "Railcard" column by filling missing values with the most frequent Railcard type (mode) and visualize its distribution using appropriate graphs.
TIME_FORMAT = "%H:%M:%S"


def _clean_time_column(times):
    """Parse ``HH:MM:SS`` text and fill unparseable entries with the modal time.

    Values are stripped and parsed with ``errors="coerce"``, so anything that
    does not match the format becomes missing before the fill. When nothing
    parses, midnight is used; it is parsed with the same format so that it
    carries the same placeholder date as every other value in the column.
    """
    parsed = pd.to_datetime(times.astype(str).str.strip(), format=TIME_FORMAT, errors="coerce")
    modes = parsed.mode()
    if modes.empty:
        fallback = pd.to_datetime("00:00:00", format=TIME_FORMAT)
    else:
        fallback = modes.iloc[0]
    return parsed.fillna(fallback)


# Fill missing Railcard values with the most frequent type.
# NOTE(review): the data dictionary gives "None" as the value for passengers
# without a railcard, and pandas' default NA parsing may read that text as
# missing; if so, this fill relabels non-holders - confirm this is intended.
railcard_modes = df["Railcard"].mode()
if not railcard_modes.empty:  # an all-missing column has no mode
    df["Railcard"] = df["Railcard"].fillna(railcard_modes.iloc[0])
railcard_counts = df["Railcard"].value_counts()

# Pie chart
plt.figure(figsize=(4, 4))
plt.pie(
    railcard_counts,
    labels=railcard_counts.index,
    autopct="%1.1f%%",
    colors=["#c85d46", "#467fc8", "#46c87a", "#c8466b"],
    startangle=140,
)
plt.title("Railcard Type Distribution (After Cleaning)")
plt.show()

# Horizontal bar graph
plt.figure(figsize=(6, 3))
plt.barh(railcard_counts.index, railcard_counts.values, color="#467fc8", edgecolor="black", alpha=0.8)
plt.xlabel("Count")
plt.ylabel("Railcard Type")
plt.title("Railcard Count After Filling Missing Values")
plt.grid(axis="x", linestyle="--", alpha=0.6)
plt.show()

# Standardize both arrival-time columns and fill gaps with the most frequent time.
for time_column in ("Arrival Time", "Actual Arrival Time"):
    df[time_column] = _clean_time_column(df[time_column])

# Final cleaned data. date_format keeps the parser's placeholder date out of
# the file, so the cleaned times are written as plain HH:MM:SS.
df.to_csv("final_data.csv", index=False, date_format=TIME_FORMAT)

print("Cleaned data saved successfully as 'final_data.csv'.")
df.info()

# Every chart below reads the cleaned export; final_df itself is never
# modified, so one read serves all of them.
final_df = pd.read_csv("final_data.csv")

# Correlation between Price, Refund Request and Ticket Class. The label
# encoding lives in its own frame so the text labels stay available later.
corr_df = pd.DataFrame({
    "Price": final_df["Price"],
    "Refund Request": final_df["Refund Request"].map({"Yes": 1, "No": 0}),
    "Ticket Class": final_df["Ticket Class"].astype("category").cat.codes,
})

plt.figure(figsize=(4, 3))
sns.heatmap(corr_df.corr(), annot=True, cmap="coolwarm", linewidths=0.5)
plt.title("Heatmap of Price, Refund Request & Ticket Class")
plt.show()

# Box plot of Price to check whether there are outliers.
plt.figure(figsize=(6, 4))
sns.boxplot(y=final_df["Price"], color="teal")
plt.title("Box Plot of Price")
plt.ylabel("Price")
plt.show()

# Donut chart for the Railcard distribution.
railcard_counts = final_df["Railcard"].value_counts()
plt.figure(figsize=(6, 6))
plt.pie(
    railcard_counts,
    labels=railcard_counts.index,
    autopct='%1.1f%%',
    wedgeprops={"edgecolor": "black"},
    startangle=140,
    pctdistance=0.85,
)
# A white circle over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
plt.gca().add_artist(centre_circle)

plt.title("Donut Chart of Railcard Distribution")
plt.show()

# KDE plot for Price.
plt.figure(figsize=(8, 5))
sns.kdeplot(final_df["Price"], fill=True, color="blue")
plt.title("KDE Plot of Price")
plt.xlabel("Price")
plt.ylabel("Density")
plt.show()

# Pair plot of Price against departure time in seconds since midnight.
# Rows with an unparseable departure time are dropped from this frame only,
# so the charts further down still see every row.
pair_df = final_df[["Price", "Departure Time"]].copy()
pair_df["Departure Time"] = pd.to_datetime(pair_df["Departure Time"], format=TIME_FORMAT, errors="coerce")
pair_df = pair_df.dropna(subset=["Departure Time"])
departure = pair_df["Departure Time"].dt
pair_df["Departure Time (Seconds)"] = departure.hour * 3600 + departure.minute * 60 + departure.second

sns.pairplot(
    pair_df[["Price", "Departure Time (Seconds)"]],
    diag_kind="kde",
    plot_kws={'color': 'purple'},
    diag_kws={'color': 'green'},
)
plt.show()

# Stacked bar chart for Ticket Type and Payment Method distribution.
# DataFrame.plot opens its own figure, so the size is passed to it directly
# rather than through a separate plt.figure() that would be left blank.
ticket_payment_counts = pd.crosstab(final_df["Ticket Type"], final_df["Payment Method"])
ticket_payment_counts.plot(
    kind="bar",
    stacked=True,
    figsize=(8, 5),
    color=["#467fc8", "#46c87a", "#c85d46", "#c8466b"],
)
plt.title("Stacked Bar Chart of Ticket Type and Payment Method Distribution")
plt.xlabel("Ticket Type")
plt.ylabel("Count")
plt.legend(title="Payment Method")
plt.show()

# Count plot for Ticket Type and Ticket Class.
plt.figure(figsize=(8, 5))
sns.countplot(x="Ticket Type", hue="Ticket Class", data=final_df, palette=["#8E44AD", "#3498DB"])
plt.title("Count Plot of Ticket Type and Ticket Class")
plt.xlabel("Ticket Type")
plt.ylabel("Count")
plt.legend(title="Ticket Class")
plt.show()

# Remaining missing values per column after cleaning (printed so a script run shows it).
print(df.isnull().sum())