├── LICENSE
├── README.md
├── Sub_Division_IMD_2017.csv
└── rainfall.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 gobarihimanshu071
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Rainfall Analysis and Prediction System
 2 | 
 3 | A comprehensive Python-based system for analyzing and predicting rainfall patterns across different subdivisions of India using machine learning techniques.
 4 | 
 5 | ## Overview
 6 | 
 7 | This project provides tools for:
 8 | - Loading and cleaning rainfall data
 9 | - Visualizing rainfall patterns over time
10 | - Analyzing monthly rainfall distributions
11 | - Training machine learning models to predict rainfall patterns
12 | - Generating rainfall forecasts using ARIMA models
13 | - Creating various visualizations including heatmaps and 3D plots
14 | 
15 | ## Features
16 | 
17 | - **Data Processing**
18 |   - Automated data cleaning and preprocessing
19 |   - Handling missing values using median imputation
20 |   - Feature engineering for rainfall classification
21 | 
22 | - **Visualization**
23 |   - Yearly rainfall trends
24 |   - Monthly rainfall distribution analysis
25 |   - Correlation heatmaps
26 |   - 3D rainfall visualization
27 |   - Average rainfall heatmaps
28 | 
29 | - **Machine Learning**
30 |   - Multiple model implementations (Random Forest, Gradient Boosting, XGBoost)
31 |   - Model evaluation and comparison
32 |   - Feature importance analysis
33 |   - Rainfall classification
34 | 
35 | - **Forecasting**
36 |   - ARIMA-based rainfall forecasting
37 |   - Long-term rainfall predictions
38 | 
39 | ## Requirements
40 | 
41 | - Python 3.x
42 | - pandas
43 | - numpy
44 | - matplotlib
45 | - seaborn
46 | - scikit-learn
47 | - xgboost
48 | - statsmodels
49 | 
50 | ## Installation
51 | 
52 | 1. Clone this repository
53 | 2. Install the required packages:
54 | ```bash
55 | pip install pandas numpy matplotlib seaborn scikit-learn xgboost statsmodels
56 | ```
57 | 
58 | ## Usage
59 | 
60 | The main script `rainfall.py` contains several functions for different analyses:
61 | 
62 | ```python
63 | # Load and clean data
64 | data = load_and_clean("Sub_Division_IMD_2017.csv")
65 | 
66 | # Visualize yearly trends
 67 | plot_yearly_trend(data, "Andaman & Nicobar Islands")
68 | 
69 | # Analyze monthly patterns
70 | plot_monthly_spread(data)
71 | 
72 | # Train machine learning models
73 | train_rain_model(data)
74 | 
75 | # Generate forecasts
76 | forecast_annual_rainfall_arima(data, forecast_years=10)
77 | ```
78 | 
79 | ## Data
80 | 
81 | The project uses the `Sub_Division_IMD_2017.csv` dataset, which contains:
82 | - Monthly rainfall data
83 | - Annual rainfall totals
84 | - Subdivision information
85 | - Yearly records
86 | 
87 | ## Contributing
88 | 
89 | Contributions are welcome! Please feel free to submit a Pull Request.
90 | 
91 | ## License
92 | 
93 | This project is open source and available under the MIT License.
94 | 


--------------------------------------------------------------------------------
/rainfall.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | import seaborn as sns
  5 | from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
  6 | from sklearn.model_selection import train_test_split
  7 | from sklearn.metrics import accuracy_score,precision_score
  8 | from sklearn.metrics import confusion_matrix
  9 | import sklearn
 10 | from xgboost import XGBClassifier
 11 | from mpl_toolkits.mplot3d import Axes3D
 12 | from statsmodels.tsa.arima.model import ARIMA
 13 | 
 14 | def load_and_clean(file_path):
 15 |     data=pd.read_csv(file_path,na_values=["NA"])
 16 |     print("Data size:", data.shape)
 17 |     print("Columns: ",data.columns.tolist())
 18 | 
 19 |     data.dropna(subset=["ANNUAL"],inplace=True)
 20 |     print("Rows after dropping missing Annual: ", len(data))
 21 | 
 22 |     months = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
 23 |     for  month in months:
 24 |         data[month]=data.groupby("SUBDIVISION")[month].transform(lambda x: x.fillna(x.median()))
 25 | 
 26 |     data["High_Rainfall"]=(data["ANNUAL"]>data["ANNUAL"].quantile(0.75)).astype(int)
 27 |     return data   
 28 | 
 29 | def plot_yearly_trend(data,area):
 30 |     subset = data[data["SUBDIVISION"]== area]
 31 |     plt.figure(figsize=(10,5))
 32 |     plt.plot(subset["YEAR"], subset["ANNUAL"], color="blue")
 33 |     plt.title(f"Rainfall over time- {area}")
 34 |     plt.xlabel("Year")
 35 |     plt.ylabel("Rainfall (mm)")
 36 |     plt.grid(True)
 37 |     plt.show()
 38 | 
 39 | def plot_monthly_spread(data):
 40 |     months= data[["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]]
 41 |     plt.figure(figsize=(12,6))
 42 |     sns.boxplot(data=months)
 43 |     plt.title("How rainfall varies by month")
 44 |     plt.xlabel("Month")
 45 |     plt.ylabel("Rainfall (mm)")
 46 |     plt.xticks(rotation=45)
 47 |     plt.show()
 48 | 
 49 | def train_rain_model(data):
 50 |     months = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
 51 |     X=data[months]
 52 |     y=data["High_Rainfall"]
 53 | 
 54 |     X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2 , random_state=42)
 55 | 
 56 |     rf_model=RandomForestClassifier(n_estimators=100, random_state=42)
 57 |     rf_model.fit(X_train, y_train)
 58 |     rf_pred=rf_model.predict(X_test)
 59 |     rf_accuracy=accuracy_score(y_test,rf_pred)
 60 |     rf_precision=precision_score(y_test,rf_pred)
 61 |     print("Random Forest Results")
 62 |     print(f"Accuracy: {rf_accuracy:.2f}")
 63 |     print(f"Precision: {rf_precision:.2f}")
 64 | 
 65 |     gb_model=GradientBoostingClassifier(n_estimators=100,random_state=42)
 66 |     gb_model.fit(X_train,y_train)
 67 |     gb_pred=gb_model.predict(X_test)
 68 |     gb_accuracy=accuracy_score(y_test,gb_pred)
 69 |     gb_precision=precision_score(y_test,gb_pred)
 70 | 
 71 |     print("Gradient Boosting Results")
 72 |     print(f"Accuracy: {gb_accuracy:.2f}")
 73 |     print(f"Precision: {gb_precision:.2f}")
 74 | 
 75 |     xgb_model=XGBClassifier(n_estimators=100,random_state=42,eval_metric='logloss')
 76 |     xgb_model.fit(X_train,y_train)
 77 |     xgb_pred=xgb_model.predict(X_test)
 78 |     xgb_accuracy=accuracy_score(y_test,xgb_pred)
 79 |     xgb_precision = precision_score(y_test,xgb_pred)
 80 |     print("XGBBoost Results:")
 81 |     print(f" Accuracy:{xgb_accuracy:.2f}")
 82 |     print(f" Precision:{xgb_precision:.2f}")
 83 | 
 84 |     return rf_model,gb_model,xgb_model, X_test, y_test
 85 | 
 86 | def plot_confusion_matrix(rf_model,gb_model,xgb_model, X_test, y_test):
 87 |     rf_pred= rf_model.predict(X_test)
 88 |     gb_pred= gb_model.predict(X_test)
 89 |     xgb_pred=xgb_model.predict(X_test)
 90 | 
 91 |     rf_cm=confusion_matrix(y_test, rf_pred)
 92 |     gb_cm=confusion_matrix(y_test, gb_pred)
 93 |     xgb_cm=confusion_matrix(y_test, xgb_pred)
 94 | 
 95 |     fig, (ax1,ax2,ax3)=plt.subplots(1,3,figsize=(18,6))
 96 |     sns.heatmap(rf_cm,annot=True,fmt="d",cmap="Blues",xticklabels=["Low","High"],yticklabels=["Low","High"],ax=ax1)
 97 |     ax1.set_title("Random Forest Confusion Matrix", fontsize=12, pad=10)
 98 |     ax1.set_xlabel("Predicted", fontsize=10)
 99 |     ax1.set_ylabel("Actual", fontsize=10)
100 | 
101 |     sns.heatmap(gb_cm,annot=True,fmt="d",cmap="Blues",xticklabels=["Low","High"],yticklabels=["Low","High"],ax=ax2)
102 |     ax2.set_title("Gradient Boosting Confusion Matrix", fontsize=12, pad=10)
103 |     ax2.set_xlabel("Predicted", fontsize=10)
104 |     ax2.set_ylabel("Actual", fontsize=10)
105 | 
106 |     sns.heatmap(xgb_cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Low", "High"], yticklabels=["Low", "High"], ax=ax3)
107 |     ax3.set_title("XGBoost Confusion Matrix", fontsize=12, pad=10)
108 |     ax3.set_xlabel("Predicted", fontsize=10)
109 |     ax3.set_ylabel("Actual", fontsize=10)
110 | 
111 |     plt.tight_layout(pad=3.0)
112 |     plt.show()
113 | 
def plot_what_matters(rf_model,gb_model,xgb_model, months):
    """Plot the feature importances of the three models as horizontal bars.

    Args:
        rf_model: Fitted random forest exposing ``feature_importances_``.
        gb_model: Fitted gradient boosting model exposing ``feature_importances_``.
        xgb_model: Fitted XGBoost model exposing ``feature_importances_``.
        months: Feature labels, one per importance value, shown on the y axis.
    """
    # Read all three importance vectors up front, before any figure is created.
    importances = [
        ("Random Forest", rf_model.feature_importances_),
        ("Gradient Boosting", gb_model.feature_importances_),
        ("XGBoost", xgb_model.feature_importances_),
    ]

    _, axes = plt.subplots(1, 3, figsize=(20, 6))
    for ax, (label, scores) in zip(axes, importances):
        sns.barplot(x=scores, y=months, ax=ax)
        ax.set_title(f"{label} Feature Importance", fontsize=12, pad=10)
        ax.set_xlabel("Importance", fontsize=10)
        ax.set_ylabel("Month", fontsize=10)
        ax.tick_params(axis='y', labelsize=8)

    plt.tight_layout(pad=3.0)
    plt.show()
def plot_correlation_heatmap(data):
    """Show an annotated heatmap of the correlations between the monthly columns.

    Args:
        data: DataFrame containing the ``JAN``..``DEC`` columns.
    """
    month_columns = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                     "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
    correlations = data[month_columns].corr()

    _, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlations, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
    ax.set_title("Correlation Heatmap of Monthly Rainfall")
    plt.show()
148 | 
def plot_avg_rainfall_heatmap(data):
    """Show a heatmap of mean monthly rainfall, one row per subdivision.

    Args:
        data: DataFrame with ``SUBDIVISION`` and the ``JAN``..``DEC`` columns.
    """
    month_columns = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                     "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
    mean_by_subdivision = data.groupby("SUBDIVISION")[month_columns].mean()

    figure, axis = plt.subplots(figsize=(18, 14))
    sns.heatmap(mean_by_subdivision, cmap="YlGnBu", annot=False, ax=axis)

    axis.set_title("Average Monthly Rainfall by Subdivision", fontsize=12, pad=20)
    axis.set_xlabel("Month", fontsize=8, labelpad=10)
    axis.set_ylabel("Subdivision", fontsize=8, labelpad=10)

    # Re-apply the labels seaborn generated, only changing their appearance.
    axis.set_xticklabels(axis.get_xticklabels(), rotation=45, ha="right", fontsize=12)
    axis.set_yticklabels(axis.get_yticklabels(), rotation=0, fontsize=12)

    # Explicit margins so the tick labels fit inside the figure.
    plt.subplots_adjust(left=0.2, right=0.95, top=0.95, bottom=0.25)
    plt.show()
162 | 
def plot_3d_rainfall(data, area):
    """Show a 3D scatter of rainfall by year and month for one subdivision.

    Every (year, month) pair of the selected rows becomes one point whose
    height and colour are the rainfall value of that month.

    Args:
        data: DataFrame with ``SUBDIVISION``, ``YEAR`` and ``JAN``..``DEC`` columns.
        area: Value of ``SUBDIVISION`` to select (exact match).
    """
    month_names = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                   "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
    month_map = {name: number for number, name in enumerate(month_names, start=1)}

    subset = data[data["SUBDIVISION"] == area].copy()

    years = []
    months = []
    rainfall = []

    # Walk the rows by zipping the columns together; points are emitted row by
    # row, January to December within each row.
    month_columns = [subset[name] for name in month_names]
    for year, *monthly_values in zip(subset["YEAR"], *month_columns):
        years.extend([year] * len(month_names))
        months.extend(month_map.values())
        rainfall.extend(monthly_values)

    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')

    points = ax.scatter(years, months, rainfall, c=rainfall, cmap='viridis', s=50)

    ax.set_title(f"3D Rainfall Plot - {area}", fontsize=12, pad=20)
    ax.set_xlabel("Year")
    ax.set_ylabel("Month")
    ax.set_zlabel("Rainfall (mm)")

    # Label the month axis with names instead of the numbers 1-12.
    ax.set_yticks(list(month_map.values()))
    ax.set_yticklabels(list(month_map.keys()))

    plt.colorbar(points, label="Rainfall (mm)")

    plt.show()
196 | 
197 | 
def forecast_annual_rainfall_arima(df, forecast_years=10):
    """Forecast mean annual rainfall with an ARIMA(2, 1, 2) model and plot it.

    ``ANNUAL`` is averaged over all rows sharing a ``YEAR`` to form one yearly
    series. The model is fitted on that series, ``forecast_years`` future
    values are predicted, history and forecast are plotted together, and the
    forecast is printed one line per year.

    Args:
        df: DataFrame with ``YEAR`` and ``ANNUAL`` columns.
        forecast_years: Number of years to forecast beyond the last year in
            the series.
    """
    rainfall_series = df.groupby("YEAR")["ANNUAL"].mean().sort_index()
    rainfall_series.index = pd.Index(rainfall_series.index.astype(int))

    model_fit = ARIMA(rainfall_series, order=(2, 1, 2)).fit()
    forecast = model_fit.forecast(steps=forecast_years)

    # Derive the future years from the integer-cast index rather than from
    # df["YEAR"].max(): if YEAR was parsed as float, range() would reject it.
    last_year = int(rainfall_series.index[-1])
    future_years = list(range(last_year + 1, last_year + forecast_years + 1))

    plt.figure(figsize=(10, 5))
    plt.plot(rainfall_series.index, rainfall_series.values, label="Historical Rainfall", marker="o")
    plt.plot(future_years, forecast, label="Forecasted Rainfall", linestyle="dashed", marker="o", color="red")
    plt.xlabel("Year")
    plt.ylabel("Annual Rainfall (mm)")
    plt.title(f"Annual Rainfall Forecast (Next {forecast_years} Years)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    print("\nForecasted Rainfall:")
    for year, rain in zip(future_years, forecast):
        print(f"{year}: {rain:.2f} mm")
227 | 
228 | 
def main():
    """Run the whole pipeline: load, explore, train, explain and forecast."""
    print("Project starting...")
    df = load_and_clean("Sub_Division_IMD_2017.csv")
    print("Data preview")
    print(df.head())

    # Exploratory plots; the per-area charts share one subdivision.
    region = "Andaman & Nicobar Islands"
    plot_yearly_trend(df, region)
    plot_monthly_spread(df)
    plot_3d_rainfall(df, region)

    # Classification models and their diagnostics.
    trained = train_rain_model(df)
    plot_confusion_matrix(*trained)

    rf_model, gb_model, xgb_model = trained[:3]
    months = "JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC".split()
    plot_what_matters(rf_model, gb_model, xgb_model, months)

    # Dataset-wide heatmaps.
    plot_correlation_heatmap(df)
    plot_avg_rainfall_heatmap(df)

    forecast_annual_rainfall_arima(df, forecast_years=10)


if __name__ == "__main__":
    main()
254 | 
255 | 


--------------------------------------------------------------------------------