├── Geolocation project.pptx
└── geolocation code.py

/Geolocation project.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MONIKAGUNASEELAN/Implementation-of-Geolocation-Extraction-Data-Analytics-Using-ML/f423ca70b884ff957d220b5fc1dc849cad9c8a2e/Geolocation project.pptx
--------------------------------------------------------------------------------

/geolocation code.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""Copy of Untitled38.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1I4cNQHW56fsjiD1VoajHuqzL8cOREu_V
"""

from geopy.geocoders import Nominatim

def geo_coder(zipcode):
    """Look up a zipcode with Nominatim and print/return the resolved address."""
    geolocator = Nominatim(user_agent="MyGeocoder/1.0")
    try:
        location = geolocator.geocode(zipcode)
        if location:
            print("Details of the given pincode:")
            print(location.address)
            return location.address
        print("Location not found.")
    except Exception as e:
        print("Error:", str(e))
    return None

zipcode = input("Enter the zipcode: ")
print("\nZipcode:", zipcode)
result = geo_coder(zipcode)

"""# **ML CODE**"""

# path to the zipcode dataset on Google Drive
CSV_PATH = "/content/drive/MyDrive/Zipcodes/allCountriesCSV.csv"

# suppress display of warnings
import warnings
warnings.filterwarnings("ignore")

# 'pandas' is used for data manipulation and analysis
import pandas as pd

# 'numpy' is used for mathematical operations on large, multi-dimensional arrays and matrices
import numpy as np

# 're' is used to strip non-numeric characters from postal codes
import re

# import the functions needed for the regression below
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# read the CSV data file
df = pd.read_csv(CSV_PATH)

# display the top 5 rows of the dataframe
# df.head()

# keep only the postal code and coordinate columns
data = df.drop(["COMMUNITY", "SHORT_COMMUNITY", "SHORT_STATE", "SHORT_COUNTY",
                "ACCURACY", "STATE", "COUNTY", "COUNTRY", "CITY"], axis=1)

def convert_to_int(value):
    """Strip non-digits from a postal code and coerce to a number (NaN on failure)."""
    cleaned_value = re.sub(r'\D', '', str(value))  # remove non-numeric characters
    return pd.to_numeric(cleaned_value, errors='coerce')

data['Converted_POSTAL_CODE'] = data['POSTAL_CODE'].apply(convert_to_int)

# Now the "Converted_POSTAL_CODE" column contains integers where possible
# and NaN for non-convertible values
print(data)
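# --- Added sketch (not in the original notebook): a quick sanity check of
# convert_to_int on hand-made values, to show why purely alphabetic codes
# collapse to NaN and are dropped in the next step. The sample values are
# illustrative, not taken from allCountriesCSV.csv.
sample = pd.Series(["637020", "SW1A 1AA", "75-008", None])
print(sample.apply(convert_to_int))
# -> 637020.0, 11.0 (only the digits survive), 75008.0, NaN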
# An alternative converter that keeps the original value when conversion fails:
# def convert_to_int_or_keep(value):
#     cleaned_value = re.sub(r'\D', '', str(value))  # remove non-numeric characters
#     try:
#         return int(cleaned_value)
#     except (ValueError, TypeError):
#         return value
#
# data['Converted_POSTAL_CODE'] = data['POSTAL_CODE'].apply(convert_to_int_or_keep)
#
# # With this version the column mixes integer values (where possible) and the
# # original non-integer values
# print(data)

# keep only the rows whose postal code could be converted
data = data.dropna(subset=['Converted_POSTAL_CODE'])
print(data)

data.isna().sum()

# check whether a particular zipcode survived the cleaning
# (note: `x in series` tests the *index*, so compare against the values instead)
zipcode_to_check = 637020
exists_in_data_cleaned = (data['Converted_POSTAL_CODE'] == zipcode_to_check).any()

if exists_in_data_cleaned:
    print(f"The value {zipcode_to_check} exists in the data DataFrame.")
else:
    print(f"The value {zipcode_to_check} does not exist in the data DataFrame.")

data_cleaned = data.copy()

data_cleaned.dtypes

data_cleaned.shape

# Split data into training and testing sets
X = data_cleaned[['Converted_POSTAL_CODE']]
y = data_cleaned[['LATITUDE', 'LONGITUDE']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # An imputer could fill any remaining NaN values with the column mean:
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy='mean')
# X_train = imputer.fit_transform(X_train)
# X_test = imputer.transform(X_test)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# The trained model can now be used to predict geolocations from zip codes.
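# --- Added sketch (not in the original notebook): querying the fitted
# LinearRegression for a single zip code. Wrapping the value in a one-row
# DataFrame with the same column name as X avoids the "X does not have valid
# feature names" warning that a bare array can trigger. 6025 is an
# illustrative value, as in the cells below.
query = pd.DataFrame({'Converted_POSTAL_CODE': [6025]})
lat_lon = model.predict(query)  # shape (1, 2): [[latitude, longitude]]
print("Predicted (lat, lon):", lat_lon[0])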
X_train

y_train

# predict the coordinates for a single zip code (sklearn expects 2-D input)
zip_code_2d = np.array([[6025]])
new_prediction = model.predict(zip_code_2d)
print(new_prediction)

import duckdb
duckdb.query("SELECT * FROM data_cleaned WHERE Converted_POSTAL_CODE = 637020")  # returns a result dataframe

"""Using Random forest"""

# import pandas as pd
# from sklearn.model_selection import train_test_split, RandomizedSearchCV
# from sklearn.ensemble import RandomForestRegressor

# # # Sample data
# # data = pd.DataFrame({'Converted_POSTAL_CODE': [1234, 5678, 9012, 3456, 7890],
# #                      'LATITUDE': [37.7749, 34.0522, 40.7128, 41.8781, 33.6846],
# #                      'LONGITUDE': [-122.4194, -118.2437, -74.0060, -87.6298, -117.8265]})

# # Split the data into X and y
# X = data_cleaned[['Converted_POSTAL_CODE']]
# y = data_cleaned[['LATITUDE', 'LONGITUDE']]

# # Split the data into a training set and a testing set
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define the model
# model = RandomForestRegressor()

# # Hyperparameter grid for RandomizedSearchCV
# param_dist = {
#     'n_estimators': [10, 50, 100],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2]
# }

# # Create RandomizedSearchCV with a limited number of iterations and fewer cross-validation folds
# random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=5,
#                                    scoring='neg_mean_squared_error', cv=3)

# # Fit the model with RandomizedSearchCV
# random_search.fit(X_train, y_train)

# # Get the best hyperparameters
# best_params = random_search.best_params_
# print("Best Hyperparameters:", best_params)

# # Evaluate the best model on the test set
# best_model = random_search.best_estimator_
# y_pred = best_model.predict(X_test)

# # Calculate evaluation metrics (other metrics could be used)
# from sklearn.metrics import mean_squared_error
# mse = mean_squared_error(y_test, y_pred)
# print("Mean Squared Error:", mse)
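# --- Added sketch (not in the original notebook): MSE on raw degrees mixes the
# latitude and longitude scales, so the mean great-circle distance in km is a
# more interpretable error for a geolocation model. Pure numpy, assuming y_test
# holds the [LATITUDE, LONGITUDE] columns and y_pred the matching predictions
# from the fitted model above.
def haversine_km(lat1, lon1, lat2, lon2):
    # great-circle distance between two points on a sphere of radius 6371 km
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371.0 * np.arcsin(np.sqrt(a))

errors_km = haversine_km(y_test['LATITUDE'].to_numpy(), y_test['LONGITUDE'].to_numpy(),
                         y_pred[:, 0], y_pred[:, 1])
print("Mean great-circle error: %.1f km" % errors_km.mean())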
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import GridSearchCV, train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error

# # Sample data
# # data = pd.DataFrame({
# #     'Converted_POSTAL_CODE': [1001, 2002, 3003, 4004, 5005],
# #     'LATITUDE': [42.3601, 34.0522, 40.7128, 29.7604, 39.9042],
# #     'LONGITUDE': [-71.0589, -118.2437, -74.0060, -95.3698, -75.1652]
# # })

# X = data_cleaned[['Converted_POSTAL_CODE']]
# y = data_cleaned[['LATITUDE', 'LONGITUDE']]

# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define the regression model (Random Forest in this example)
# model = RandomForestRegressor()

# # Define the hyperparameter grid for Grid Search
# param_grid = {
#     'n_estimators': [10, 50, 100, 200],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10]
# }

# # Create Grid Search with cross-validation
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
#                            scoring='neg_mean_squared_error', n_jobs=-1)

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

# # Get the best hyperparameters
# best_params = grid_search.best_params_

# # Train the model with the best hyperparameters
# best_model = RandomForestRegressor(**best_params)
# best_model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = best_model.predict(X_test)

# # Evaluate the model using Mean Squared Error (MSE)
# mse = mean_squared_error(y_test, y_pred)
# print("Best Hyperparameters:", best_params)
# print("Mean Squared Error:", mse)

"""# **Using Tensorflow**"""

import tensorflow as tf
from sklearn.preprocessing import StandardScaler

# Prepare the data
X = data[['Converted_POSTAL_CODE']]
y = data[['LATITUDE', 'LONGITUDE']]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Data preprocessing (standardization of the input feature)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define a neural network model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(2)  # two output neurons: latitude and longitude
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model
loss = model.evaluate(X_test, y_test)
print("Test Loss:", loss)

# Make predictions
predictions = model.predict(X_test)

# Note: only the inputs were standardized, so the predicted latitude/longitude
# are already on the original scale and need no inverse transform.

# New inputs must go through the same scaler the network was trained with
zip_code_2d = scaler.transform(np.array([[6025]]))
print(model.predict(zip_code_2d))

import pickle

# Specify the file path where you want to save the trained model
model_filename = "model.pkl"

# Save the model to the file
# (for Keras models, model.save("model.keras") is the more robust option)
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)
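# --- Added sketch (not in the original notebook): reading the pickled model
# back and querying it, assuming the pickle round-trip succeeds for this Keras
# version. New zip codes must pass through the same StandardScaler the network
# was trained with; the in-memory `scaler` is reused here for illustration (in
# practice it should be persisted alongside the model).
with open(model_filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

query = scaler.transform(np.array([[6025]]))  # 6025 is an illustrative zip code
print(loaded_model.predict(query))            # -> [[latitude, longitude]]
--------------------------------------------------------------------------------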