├── LICENSE ├── README.md ├── Sub_Division_IMD_2017.csv └── rainfall.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 gobarihimanshu071 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rainfall Analysis and Prediction System 2 | 3 | A comprehensive Python-based system for analyzing and predicting rainfall patterns across different subdivisions of India using machine learning techniques. 
4 | 5 | ## Overview 6 | 7 | This project provides tools for: 8 | - Loading and cleaning rainfall data 9 | - Visualizing rainfall patterns over time 10 | - Analyzing monthly rainfall distributions 11 | - Training machine learning models to predict rainfall patterns 12 | - Generating rainfall forecasts using ARIMA models 13 | - Creating various visualizations including heatmaps and 3D plots 14 | 15 | ## Features 16 | 17 | - **Data Processing** 18 | - Automated data cleaning and preprocessing 19 | - Handling missing values using median imputation 20 | - Feature engineering for rainfall classification 21 | 22 | - **Visualization** 23 | - Yearly rainfall trends 24 | - Monthly rainfall distribution analysis 25 | - Correlation heatmaps 26 | - 3D rainfall visualization 27 | - Average rainfall heatmaps 28 | 29 | - **Machine Learning** 30 | - Multiple model implementations (Random Forest, Gradient Boosting, XGBoost) 31 | - Model evaluation and comparison 32 | - Feature importance analysis 33 | - Rainfall classification 34 | 35 | - **Forecasting** 36 | - ARIMA-based rainfall forecasting 37 | - Long-term rainfall predictions 38 | 39 | ## Requirements 40 | 41 | - Python 3.x 42 | - pandas 43 | - numpy 44 | - matplotlib 45 | - seaborn 46 | - scikit-learn 47 | - xgboost 48 | - statsmodels 49 | 50 | ## Installation 51 | 52 | 1. Clone this repository 53 | 2. 
Install the required packages: 54 | ```bash 55 | pip install pandas numpy matplotlib seaborn scikit-learn xgboost statsmodels 56 | ``` 57 | 58 | ## Usage 59 | 60 | The main script `rainfall.py` contains several functions for different analyses: 61 | 62 | ```python 63 | # Load and clean data 64 | data = load_and_clean("Sub_Division_IMD_2017.csv") 65 | 66 | # Visualize yearly trends 67 | plot_yearly_trend(data, "KERALA") 68 | 69 | # Analyze monthly patterns 70 | plot_monthly_spread(data) 71 | 72 | # Train machine learning models 73 | train_rain_model(data) 74 | 75 | # Generate forecasts 76 | forecast_annual_rainfall_arima(data, forecast_years=10) 77 | ``` 78 | 79 | ## Data 80 | 81 | The project uses the `Sub_Division_IMD_2017.csv` dataset, which contains: 82 | - Monthly rainfall data 83 | - Annual rainfall totals 84 | - Subdivision information 85 | - Yearly records 86 | 87 | ## Contributing 88 | 89 | Contributions are welcome! Please feel free to submit a Pull Request. 90 | 91 | ## License 92 | 93 | This project is open source and available under the MIT License. 
# rainfall.py
"""Rainfall analysis and prediction for Indian meteorological subdivisions.

The module loads a subdivision-level rainfall CSV, draws exploratory plots,
trains three tree-based classifiers that flag high-rainfall records from the
monthly totals, and forecasts the mean annual rainfall with an ARIMA model.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn  # noqa: F401  (unused here; kept from the original import list)
# Kept on purpose: older matplotlib releases only register the "3d"
# projection once this module has been imported.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA
from xgboost import XGBClassifier

# Monthly rainfall columns of the CSV, in calendar order.  Kept as a list
# (not a tuple) so that ``frame[MONTHS]`` selects twelve columns.
MONTHS = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN",
          "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]

# Display names of the three classifiers, in the order every function in
# this module receives or returns them.
_MODEL_NAMES = ("Random Forest", "Gradient Boosting", "XGBoost")


def _select_area(data, area):
    """Return the rows of *data* whose SUBDIVISION matches *area*.

    The comparison ignores case and surrounding whitespace.

    Raises:
        ValueError: if no row matches, so that a misspelt name fails loudly
            instead of silently producing an empty plot.
    """
    names = data["SUBDIVISION"].astype(str).str.strip()
    wanted = str(area).strip().casefold()
    subset = data[names.str.casefold() == wanted]
    if subset.empty:
        known = ", ".join(sorted(names.unique()))
        raise ValueError(f"Unknown subdivision {area!r}; available: {known}")
    return subset


def load_and_clean(file_path, high_rainfall_quantile=0.75):
    """Read the rainfall CSV, impute missing months and label wet records.

    Args:
        file_path: Path of the CSV file.  The literal string ``"NA"`` is
            read as a missing value.
        high_rainfall_quantile: Quantile of ``ANNUAL`` above which a record
            is labelled as high rainfall.

    Returns:
        The cleaned DataFrame.  Rows without ``ANNUAL`` are dropped, missing
        monthly values are filled, and a 0/1 ``High_Rainfall`` column is
        added.
    """
    data = pd.read_csv(file_path, na_values=["NA"])
    print("Data size:", data.shape)
    print("Columns: ", data.columns.tolist())

    data.dropna(subset=["ANNUAL"], inplace=True)
    print("Rows after dropping missing Annual: ", len(data))

    # Overall medians are taken from the observed values only, i.e. before
    # any imputation has happened.
    overall_medians = data[MONTHS].median()
    subdivision_medians = data.groupby("SUBDIVISION")[MONTHS].transform("median")
    data[MONTHS] = data[MONTHS].fillna(subdivision_medians)
    # A subdivision with no reading at all for a month has a NaN median and
    # would keep its gaps; fall back to the median over every subdivision.
    data[MONTHS] = data[MONTHS].fillna(overall_medians)

    threshold = data["ANNUAL"].quantile(high_rainfall_quantile)
    data["High_Rainfall"] = (data["ANNUAL"] > threshold).astype(int)
    return data


def plot_yearly_trend(data, area):
    """Plot annual rainfall against year for one subdivision.

    Raises:
        ValueError: if *area* matches no subdivision in *data*.
    """
    # Sort so the line is drawn chronologically whatever the row order is.
    subset = _select_area(data, area).sort_values("YEAR")
    plt.figure(figsize=(10, 5))
    plt.plot(subset["YEAR"], subset["ANNUAL"], color="blue")
    plt.title(f"Rainfall over time- {area}")
    plt.xlabel("Year")
    plt.ylabel("Rainfall (mm)")
    plt.grid(True)
    plt.show()


def plot_monthly_spread(data):
    """Draw one box plot per month over all rows of *data*."""
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=data[MONTHS])
    plt.title("How rainfall varies by month")
    plt.xlabel("Month")
    plt.ylabel("Rainfall (mm)")
    plt.xticks(rotation=45)
    plt.show()


def train_rain_model(data):
    """Train and report three classifiers predicting ``High_Rainfall``.

    The twelve monthly columns are the features.  An 80/20 train/test split
    with a fixed seed is used, and accuracy and precision on the test part
    are printed for each model.

    Returns:
        ``(rf_model, gb_model, xgb_model, X_test, y_test)``.
    """
    X = data[MONTHS]
    y = data["High_Rainfall"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    classifiers = (
        RandomForestClassifier(n_estimators=100, random_state=42),
        GradientBoostingClassifier(n_estimators=100, random_state=42),
        XGBClassifier(n_estimators=100, random_state=42, eval_metric="logloss"),
    )
    for label, model in zip(_MODEL_NAMES, classifiers):
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        accuracy = accuracy_score(y_test, predicted)
        # Precision is undefined when a model predicts no positive at all;
        # report 0 for that case instead of emitting a warning.
        precision = precision_score(y_test, predicted, zero_division=0)
        print(f"{label} Results")
        print(f"Accuracy: {accuracy:.2f}")
        print(f"Precision: {precision:.2f}")

    rf_model, gb_model, xgb_model = classifiers
    return rf_model, gb_model, xgb_model, X_test, y_test


def plot_confusion_matrix(rf_model, gb_model, xgb_model, X_test, y_test):
    """Show the confusion matrices of the three models side by side."""
    class_names = ["Low", "High"]
    _, axes = plt.subplots(1, 3, figsize=(18, 6))

    fitted = (rf_model, gb_model, xgb_model)
    for ax, label, model in zip(axes, _MODEL_NAMES, fitted):
        # Pinning the labels keeps the matrix 2x2 (and aligned with the tick
        # labels) even if the test split contains a single class.
        matrix = confusion_matrix(y_test, model.predict(X_test), labels=[0, 1])
        sns.heatmap(
            matrix,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax,
        )
        ax.set_title(f"{label} Confusion Matrix", fontsize=12, pad=10)
        ax.set_xlabel("Predicted", fontsize=10)
        ax.set_ylabel("Actual", fontsize=10)

    plt.tight_layout(pad=3.0)
    plt.show()


def plot_what_matters(rf_model, gb_model, xgb_model, months):
    """Show each model's feature importances as horizontal bar charts.

    Args:
        rf_model, gb_model, xgb_model: Fitted models exposing
            ``feature_importances_``.
        months: Feature labels, in the column order used for training.
    """
    feature_labels = list(months)
    _, axes = plt.subplots(1, 3, figsize=(20, 6))

    fitted = (rf_model, gb_model, xgb_model)
    for ax, label, model in zip(axes, _MODEL_NAMES, fitted):
        sns.barplot(x=model.feature_importances_, y=feature_labels, ax=ax)
        ax.set_title(f"{label} Feature Importance", fontsize=12, pad=10)
        ax.set_xlabel("Importance", fontsize=10)
        ax.set_ylabel("Month", fontsize=10)
        ax.tick_params(axis="y", labelsize=8)

    plt.tight_layout(pad=3.0)
    plt.show()


def plot_correlation_heatmap(data):
    """Draw an annotated heatmap of the month-to-month correlations."""
    corr_matrix = data[MONTHS].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Heatmap of Monthly Rainfall")
    plt.show()


def plot_avg_rainfall_heatmap(data):
    """Draw a heatmap of the mean monthly rainfall of every subdivision."""
    avg_rainfall = data.groupby("SUBDIVISION")[MONTHS].mean()
    _, ax = plt.subplots(figsize=(18, 14))
    sns.heatmap(avg_rainfall, cmap="YlGnBu", annot=False, ax=ax)
    ax.set_title("Average Monthly Rainfall by Subdivision", fontsize=12, pad=20)
    ax.set_xlabel("Month", fontsize=8, labelpad=10)
    ax.set_ylabel("Subdivision", fontsize=8, labelpad=10)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=12)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=12)

    # Extra left/bottom margin leaves room for the tick labels.
    plt.subplots_adjust(left=0.2, right=0.95, top=0.95, bottom=0.25)
    plt.show()


def plot_3d_rainfall(data, area):
    """Scatter year x month x rainfall for one subdivision in 3D.

    Raises:
        ValueError: if *area* matches no subdivision in *data*.
    """
    subset = _select_area(data, area)

    # Flatten the (records x 12) table row by row: each year is repeated
    # once per month and the month numbers 1..12 cycle within every year.
    month_numbers = np.arange(1, len(MONTHS) + 1)
    years = np.repeat(subset["YEAR"].to_numpy(), len(MONTHS))
    months = np.tile(month_numbers, len(subset))
    rainfall = subset[MONTHS].to_numpy(dtype=float).ravel()

    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection="3d")

    scatter = ax.scatter(years, months, rainfall, c=rainfall, cmap="viridis", s=50)

    ax.set_xlabel("Year")
    ax.set_ylabel("Month")
    ax.set_zlabel("Rainfall (mm)")
    ax.set_title(f"3D Rainfall Plot - {area}", fontsize=12, pad=20)

    ax.set_yticks(month_numbers)
    ax.set_yticklabels(MONTHS)

    fig.colorbar(scatter, ax=ax, label="Rainfall (mm)")

    plt.show()


def forecast_annual_rainfall_arima(df, forecast_years=10):
    """Forecast the mean annual rainfall with an ARIMA(2, 1, 2) model.

    The series being modelled is the mean of ``ANNUAL`` over all rows of
    each year.  The history and the forecast are plotted and the forecast
    is printed.

    Args:
        df: DataFrame with ``YEAR`` and ``ANNUAL`` columns.
        forecast_years: Number of years to forecast past the last year in
            the series.

    Returns:
        A Series of forecast values indexed by the forecast year.

    Raises:
        ValueError: if *forecast_years* is smaller than 1.
    """
    if forecast_years < 1:
        raise ValueError("forecast_years must be a positive integer")

    rainfall_series = df.groupby("YEAR")["ANNUAL"].mean().sort_index()
    rainfall_series.index = pd.Index(rainfall_series.index.astype(int))

    model_fit = ARIMA(rainfall_series, order=(2, 1, 2)).fit()
    # Work with plain values so plotting and printing do not depend on
    # whatever index the model attaches to its forecast.
    forecast = np.asarray(model_fit.forecast(steps=forecast_years), dtype=float)

    # Derive the future years from the integer series index; the raw YEAR
    # column may be a float column, which range() would reject.
    last_year = int(rainfall_series.index[-1])
    future_years = list(range(last_year + 1, last_year + forecast_years + 1))

    plt.figure(figsize=(10, 5))
    plt.plot(
        rainfall_series.index,
        rainfall_series.values,
        label="Historical Rainfall",
        marker="o",
    )
    plt.plot(
        future_years,
        forecast,
        label="Forecasted Rainfall",
        linestyle="dashed",
        marker="o",
        color="red",
    )
    plt.xlabel("Year")
    plt.ylabel("Annual Rainfall (mm)")
    plt.title(f"Annual Rainfall Forecast (Next {forecast_years} Years)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    print("\nForecasted Rainfall:")
    for year, rain in zip(future_years, forecast):
        print(f"{year}: {rain:.2f} mm")

    return pd.Series(forecast, index=pd.Index(future_years, name="YEAR"), name="ANNUAL")


def main():
    """Run the whole analysis on the bundled CSV file."""
    print("Project starting...")
    file_path = "Sub_Division_IMD_2017.csv"
    df = load_and_clean(file_path)
    print("Data preview")
    print(df.head())

    plot_yearly_trend(df, "Andaman & Nicobar Islands")
    plot_monthly_spread(df)
    plot_3d_rainfall(df, "Andaman & Nicobar Islands")

    rf_model, gb_model, xgb_model, X_test, y_test = train_rain_model(df)
    plot_confusion_matrix(rf_model, gb_model, xgb_model, X_test, y_test)

    plot_what_matters(rf_model, gb_model, xgb_model, MONTHS)

    plot_correlation_heatmap(df)
    plot_avg_rainfall_heatmap(df)

    forecast_annual_rainfall_arima(df, forecast_years=10)


if __name__ == "__main__":
    main()