├── README.md └── python project.py /README.md: -------------------------------------------------------------------------------- 1 | # Python-project-on-Border_Crossing_Data 2 | This project applies the concepts of DATA SCIENCE TOOLBOX: PYTHON PROGRAMMING. 3 | 🔍 Data Loading & Inspection 4 | 📥 Loaded CSV into a DataFrame using pandas. 5 | 6 | 🧾 Checked info of the dataset: data types, non-null counts. 7 | 8 | 📊 Summary statistics using .describe(). 9 | 10 | 👀 Previewed data with .head() and .tail(). 11 | 12 | ❌ Dropped NA values and checked for missing data. 13 | 14 | 📌 Checked unique values in the 'Border' column. 15 | 16 | 🧹 Data Cleaning & Transformation 17 | 🧼 Filled missing values with "Unknown". 18 | 19 | 🏷️ Renamed columns: 'Port Name' → 'Port_Name', 'Measure' → 'Entry_Type'. 20 | 21 | 🔍 Filtered rows: 22 | 23 | Only US-Mexico Border crossings. 24 | 25 | US-Canada Border crossings with Value > 1000. 26 | 27 | 📈 Sorting, Aggregation, and Outlier Detection 28 | 📆 Sorted data by Date in descending order. 29 | 30 | ➕ Grouped data by 'Border' and summed 'Value'. 31 | 32 | 🚨 Detected outliers using IQR method. 33 | 34 | 🔗 Correlation & Covariance 35 | 📈 Computed correlation and covariance for numeric columns. 36 | 37 | 🧊 Plotted a heatmap to visualize correlations. 38 | 39 | 📅 Time Series Analysis 40 | 🧮 Grouped data by Date and Border to plot total crossings over time. 41 | 42 | 🗓️ Converted 'Date' column to datetime format. 43 | 44 | 📆 Created monthly aggregated border crossing totals. 45 | 46 | 📉 Plotted: 47 | 48 | Line chart for crossings over time by border. 49 | 50 | Monthly trends using line plots. 51 | 52 | 🛻 Trucks Analysis 53 | 🚚 Filtered data where Measure == 'Trucks'. 54 | 55 | 🧠 Calculated mean, median, std deviation for trucks. 56 | 57 | ⚖️ Performed a t-test comparing trucks between US-Canada and US-Mexico borders. 58 | 59 | 🧪 A/B Testing: January vs April 60 | 🧬 Simulated A/B test comparing crossings in Jan-24 vs Apr-24. 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind

# ---------------------------------------------------------------------------
# Data loading & inspection
# ---------------------------------------------------------------------------
# NOTE: absolute, machine-specific path; point it at your own copy of the CSV.
CSV_PATH = "C:\\Users\\junai\\OneDrive\\Desktop\\Border_Crossing_Entry_Data.csv"

dic = pd.read_csv(CSV_PATH)
df = pd.DataFrame(dic)

# DataFrame.info() prints its report and returns None, so this name only
# ever holds None; the call is kept for its printed output.
infromation = df.info()

describe = df.describe()
h_ead = df.head()
t_ail = df.tail()

# Missing-data checks. dropna() returns a new frame; df itself is unchanged.
drop_na = df.dropna()
is_null_sum = df.isnull().sum()            # missing count per column
is_null_sum_sum = df.isnull().sum().sum()  # missing count over the whole frame
is_null = df.isnull()

unique_value = df['Border'].unique()

# ---------------------------------------------------------------------------
# Cleaning & transformation
# ---------------------------------------------------------------------------
us_border_df = df[df['Border'] == 'US-Mexico Border']

# Copy with every missing cell replaced by the literal string "Unknown".
df_filled = df.fillna("Unknown")

# In-place rename: from here on the columns are 'Port_Name' and 'Entry_Type',
# and the old names 'Port Name' / 'Measure' no longer exist on df.
df.rename(columns={'Port Name': 'Port_Name', 'Measure': 'Entry_Type'}, inplace=True)

# ---------------------------------------------------------------------------
# Sorting, aggregation and outlier detection
# ---------------------------------------------------------------------------
summary = df.groupby('Border')['Value'].sum()

# 'Date' is still text here (this script parses it elsewhere with '%b-%y',
# e.g. 'Jan-24'). Sorting the raw strings would order them alphabetically
# ('Apr-..' before 'Jan-..'), so sort through a datetime key to get a true
# newest-first ordering without modifying the column itself.
sorted_df = df.sort_values(
    by='Date',
    key=lambda col: pd.to_datetime(col, format='%b-%y'),
    ascending=False,
)

filtered_df = df[(df['Border'] == 'US-Canada Border') & (df['Value'] > 1000)]

# IQR rule: a row is an outlier when its Value lies more than 1.5 * IQR
# below the first quartile or above the third quartile.
q1 = df['Value'].quantile(0.25)
q3 = df['Value'].quantile(0.75)
IQR = q3 - q1
lower_bond = q1 - 1.5 * IQR
upper_bond = q3 + 1.5 * IQR
Outlier = df[(df['Value'] < lower_bond) | (df['Value'] > upper_bond)]

# ---------------------------------------------------------------------------
# Correlation
# ---------------------------------------------------------------------------
# numeric_only=True restricts the matrix to numeric columns.
correlation = df.corr(numeric_only=True)
# Covariance matrix of the numeric columns.
covariance = df.cov(numeric_only=True)


## Plot a time series line graph of total border crossings over time using the
## Date and Value columns. Customize the plot with a title, axis labels, rotated
## x-ticks, and a legend showing the border type.

# Parse 'Date' BEFORE grouping. Grouping on the raw 'Mon-YY' strings sorts the
# groups alphabetically, which puts the x-axis out of chronological order.
# Calling to_datetime again later on the already-converted column is harmless.
df["Date"] = pd.to_datetime(df["Date"], format='%b-%y')

df_grouped = df.groupby(['Date', 'Border'], as_index=False)['Value'].sum()
plt.figure(figsize=(12, 6))
sns.lineplot(data=df_grouped, x='Date', y='Value', hue='Border', marker="o")
plt.title('Total Border Crossings Over Time by Border Type')
plt.xlabel('Date')
plt.ylabel('Number of Crossings')
plt.xticks(rotation=45)
plt.legend(title='Border Type')
plt.tight_layout()
plt.show()


## Perform an EDA on the dataset:
## Show the summary statistics (describe()) of numerical features.
## Compute and visualize the correlation between Value (number of crossings)
## and other relevant numeric features (e.g., time-based aggregation if applicable).

correlation = df.corr(numeric_only=True)
print(correlation)
print("\n")

# Start a fresh figure so the heatmap never draws onto a previous plot's axes
# (plt.show() does not close the figure on every backend).
plt.figure()
sns.heatmap(correlation, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap - Value vs Other Features")
plt.show()

# Total crossings per calendar month (year + month). The period index is
# converted back to timestamps so it can be used as a plot axis.
monthly_agg = df.groupby(df['Date'].dt.to_period('M'))['Value'].sum().reset_index()
monthly_agg['Date'] = monthly_agg['Date'].dt.to_timestamp()

plt.figure()
sns.lineplot(data=monthly_agg, x='Date', y='Value')
plt.title('Monthly Total Border Crossings Over Time')
plt.xlabel('Date')
plt.ylabel('Total Crossings')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## Analyze the monthly trend of border crossings by creating a time series plot of total crossings per month.
# Already imported at the top of the script; re-importing is a harmless no-op.
import matplotlib.pyplot as plt

# Make sure 'Date' is datetime before using the .dt accessor. This is a no-op
# when the column has already been converted.
df['Date'] = pd.to_datetime(df['Date'], format='%b-%y')

# Month number 1-12. Grouping on it sums each calendar month across ALL years,
# so this plot shows a seasonal profile rather than a timeline.
df['Month'] = df['Date'].dt.month
monthly_trends = df.groupby('Month')['Value'].sum()

plt.figure(figsize=(10, 6))
monthly_trends.plot(kind='line', marker='o')
plt.title('Monthly Border Crossing Trends')
plt.xlabel('Month')
plt.ylabel('Total Crossings')
plt.grid(True)
plt.show()


## Calculate the mean, median, and standard deviation of the Value column for the measure Trucks in the dataset.
## Perform a t-test to compare the mean Value of Trucks between the US-Canada Border and the US-Mexico Border.

# Earlier in this script 'Measure' is renamed in place to 'Entry_Type', so
# df['Measure'] raises KeyError at this point. Resolve the name at run time
# so the section works whether or not the rename has been applied.
measure_col = 'Entry_Type' if 'Entry_Type' in df.columns else 'Measure'

trucks_data = df[df[measure_col] == 'Trucks']
mean_value = trucks_data['Value'].mean()
median_value = trucks_data['Value'].median()
std_value = trucks_data['Value'].std()
print(f"Mean: {mean_value}, Median: {median_value}, Std Dev: {std_value}")

canada_trucks = trucks_data.loc[trucks_data['Border'] == 'US-Canada Border', 'Value']
mexico_trucks = trucks_data.loc[trucks_data['Border'] == 'US-Mexico Border', 'Value']

# equal_var=False selects Welch's t-test, which does not assume the two
# borders have the same variance.
t_stat, p_value = ttest_ind(canada_trucks, mexico_trucks, equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")


## Compare mean crossing values between January and April to see if there's a significant difference (simulated A/B test).
# By this point 'Date' has been converted to datetime, so comparing it with
# the raw strings 'Jan-24' / 'Apr-24' is unreliable: a bare 'Jan-24' is not
# guaranteed to be read as January 2024. Parse the column and both month
# labels with the same explicit format so the comparison is exact.
# (to_datetime is a no-op on a column that is already datetime.)
dates = pd.to_datetime(df['Date'], format='%b-%y')
jan_start = pd.to_datetime('Jan-24', format='%b-%y')
apr_start = pd.to_datetime('Apr-24', format='%b-%y')

jan_data = df.loc[dates == jan_start, 'Value']
apr_data = df.loc[dates == apr_start, 'Value']

alpha = 0.05

if min(len(jan_data), len(apr_data)) < 2:
    # Welch's t-test needs at least two observations per group; otherwise the
    # p-value is NaN and would be misreported as "no significant difference".
    print("Cannot run t-test - fewer than two rows found for Jan-24 and/or Apr-24")
else:
    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    t_stat, p_val = ttest_ind(jan_data, apr_data, equal_var=False)

    print(f"T-statistic: {t_stat:.2f}")
    print(f"P-value: {p_val:.4f}")
    if p_val < alpha:
        print("Reject null hypothesis - significant difference between months")
    else:
        print("Fail to reject null hypothesis - no significant difference")