├── Geolocation project.pptx
└── geolocation code.py

/Geolocation project.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MONIKAGUNASEELAN/Implementation-of-Geolocation-Extraction-Data-Analytics-Using-ML/f423ca70b884ff957d220b5fc1dc849cad9c8a2e/Geolocation project.pptx
--------------------------------------------------------------------------------

/geolocation code.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""Copy of Untitled38.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1I4cNQHW56fsjiD1VoajHuqzL8cOREu_V
"""

from geopy.geocoders import Nominatim

def geo_coder(zipcode):
    """Look up a zipcode with Nominatim and print/return the resolved address."""
    geolocator = Nominatim(user_agent="MyGeocoder/1.0")
    try:
        location = geolocator.geocode(zipcode)
        if location:
            print("Details of the given pincode:")
            print(location.address)
            return location.address
        print("Location not found.")
    except Exception as e:
        print("Error:", str(e))
    return None

zipcode = input("Enter the zipcode: ")
print("\nZipcode:", zipcode)
result = geo_coder(zipcode)

"""# **ML CODE**"""

# path to the zipcode dataset on Google Drive
CSV_PATH = "/content/drive/MyDrive/Zipcodes/allCountriesCSV.csv"

# suppress display of warnings
import warnings
warnings.filterwarnings("ignore")

# 'pandas' is used for data manipulation and analysis
import pandas as pd

# 'numpy' is used for mathematical operations on large, multi-dimensional arrays and matrices
import numpy as np

# 're' is used to strip non-numeric characters from postal codes
import re

# import the functions needed for the regression below
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# read the CSV data file
df = pd.read_csv(CSV_PATH)

# display the top 5 rows of the dataframe
# df.head()

# keep only the postal code and coordinate columns
data = df.drop(["COMMUNITY", "SHORT_COMMUNITY", "SHORT_STATE", "SHORT_COUNTY",
                "ACCURACY", "STATE", "COUNTY", "COUNTRY", "CITY"], axis=1)

def convert_to_int(value):
    """Strip non-digits from a postal code and coerce to a number (NaN on failure)."""
    cleaned_value = re.sub(r'\D', '', str(value))  # remove non-numeric characters
    return pd.to_numeric(cleaned_value, errors='coerce')

data['Converted_POSTAL_CODE'] = data['POSTAL_CODE'].apply(convert_to_int)

# Now the "Converted_POSTAL_CODE" column contains integers where possible
# and NaN for non-convertible values
print(data)
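# --- Added sketch (not in the original notebook): a quick sanity check of
# convert_to_int on hand-made values, to show why purely alphabetic codes
# collapse to NaN and are dropped in the next step. The sample values are
# illustrative, not taken from allCountriesCSV.csv.
sample = pd.Series(["637020", "SW1A 1AA", "75-008", None])
print(sample.apply(convert_to_int))
# -> 637020.0, 11.0 (only the digits survive), 75008.0, NaN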
# An alternative converter that keeps the original value when conversion fails:
# def convert_to_int_or_keep(value):
#     cleaned_value = re.sub(r'\D', '', str(value))  # remove non-numeric characters
#     try:
#         return int(cleaned_value)
#     except (ValueError, TypeError):
#         return value
#
# data['Converted_POSTAL_CODE'] = data['POSTAL_CODE'].apply(convert_to_int_or_keep)
#
# # With this version the column mixes integer values (where possible) and the
# # original non-integer values
# print(data)

# keep only the rows whose postal code could be converted
data = data.dropna(subset=['Converted_POSTAL_CODE'])
print(data)

data.isna().sum()

# check whether a particular zipcode survived the cleaning
# (note: `x in series` tests the *index*, so compare against the values instead)
zipcode_to_check = 637020
exists_in_data_cleaned = (data['Converted_POSTAL_CODE'] == zipcode_to_check).any()

if exists_in_data_cleaned:
    print(f"The value {zipcode_to_check} exists in the data DataFrame.")
else:
    print(f"The value {zipcode_to_check} does not exist in the data DataFrame.")

data_cleaned = data.copy()

data_cleaned.dtypes

data_cleaned.shape

# Split data into training and testing sets
X = data_cleaned[['Converted_POSTAL_CODE']]
y = data_cleaned[['LATITUDE', 'LONGITUDE']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # An imputer could fill any remaining NaN values with the column mean:
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy='mean')
# X_train = imputer.fit_transform(X_train)
# X_test = imputer.transform(X_test)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# The trained model can now be used to predict geolocations from zip codes.
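# --- Added sketch (not in the original notebook): querying the fitted
# LinearRegression for a single zip code. Wrapping the value in a one-row
# DataFrame with the same column name as X avoids the "X does not have valid
# feature names" warning that a bare array can trigger. 6025 is an
# illustrative value, as in the cells below.
query = pd.DataFrame({'Converted_POSTAL_CODE': [6025]})
lat_lon = model.predict(query)  # shape (1, 2): [[latitude, longitude]]
print("Predicted (lat, lon):", lat_lon[0])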
X_train

y_train

# predict the coordinates for a single zip code (sklearn expects 2-D input)
zip_code_2d = np.array([[6025]])
new_prediction = model.predict(zip_code_2d)
print(new_prediction)

import duckdb
duckdb.query("SELECT * FROM data_cleaned WHERE Converted_POSTAL_CODE = 637020")  # returns a result dataframe

"""Using Random forest"""

# import pandas as pd
# from sklearn.model_selection import train_test_split, RandomizedSearchCV
# from sklearn.ensemble import RandomForestRegressor

# # # Sample data
# # data = pd.DataFrame({'Converted_POSTAL_CODE': [1234, 5678, 9012, 3456, 7890],
# #                      'LATITUDE': [37.7749, 34.0522, 40.7128, 41.8781, 33.6846],
# #                      'LONGITUDE': [-122.4194, -118.2437, -74.0060, -87.6298, -117.8265]})

# # Split the data into X and y
# X = data_cleaned[['Converted_POSTAL_CODE']]
# y = data_cleaned[['LATITUDE', 'LONGITUDE']]

# # Split the data into a training set and a testing set
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define the model
# model = RandomForestRegressor()

# # Hyperparameter grid for RandomizedSearchCV
# param_dist = {
#     'n_estimators': [10, 50, 100],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2]
# }

# # Create RandomizedSearchCV with a limited number of iterations and fewer cross-validation folds
# random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=5,
#                                    scoring='neg_mean_squared_error', cv=3)

# # Fit the model with RandomizedSearchCV
# random_search.fit(X_train, y_train)

# # Get the best hyperparameters
# best_params = random_search.best_params_
# print("Best Hyperparameters:", best_params)

# # Evaluate the best model on the test set
# best_model = random_search.best_estimator_
# y_pred = best_model.predict(X_test)

# # Calculate evaluation metrics (other metrics could be used)
# from sklearn.metrics import mean_squared_error
# mse = mean_squared_error(y_test, y_pred)
# print("Mean Squared Error:", mse)
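# --- Added sketch (not in the original notebook): MSE on raw degrees mixes the
# latitude and longitude scales, so the mean great-circle distance in km is a
# more interpretable error for a geolocation model. Pure numpy, assuming y_test
# holds the [LATITUDE, LONGITUDE] columns and y_pred the matching predictions
# from the fitted model above.
def haversine_km(lat1, lon1, lat2, lon2):
    # great-circle distance between two points on a sphere of radius 6371 km
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371.0 * np.arcsin(np.sqrt(a))

errors_km = haversine_km(y_test['LATITUDE'].to_numpy(), y_test['LONGITUDE'].to_numpy(),
                         y_pred[:, 0], y_pred[:, 1])
print("Mean great-circle error: %.1f km" % errors_km.mean())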
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import GridSearchCV, train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error

# # Sample data
# # data = pd.DataFrame({
# #     'Converted_POSTAL_CODE': [1001, 2002, 3003, 4004, 5005],
# #     'LATITUDE': [42.3601, 34.0522, 40.7128, 29.7604, 39.9042],
# #     'LONGITUDE': [-71.0589, -118.2437, -74.0060, -95.3698, -75.1652]
# # })

# X = data_cleaned[['Converted_POSTAL_CODE']]
# y = data_cleaned[['LATITUDE', 'LONGITUDE']]

# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define the regression model (Random Forest in this example)
# model = RandomForestRegressor()

# # Define the hyperparameter grid for Grid Search
# param_grid = {
#     'n_estimators': [10, 50, 100, 200],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10]
# }

# # Create Grid Search with cross-validation
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
#                            scoring='neg_mean_squared_error', n_jobs=-1)

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

# # Get the best hyperparameters
# best_params = grid_search.best_params_

# # Train the model with the best hyperparameters
# best_model = RandomForestRegressor(**best_params)
# best_model.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = best_model.predict(X_test)

# # Evaluate the model using Mean Squared Error (MSE)
# mse = mean_squared_error(y_test, y_pred)
# print("Best Hyperparameters:", best_params)
# print("Mean Squared Error:", mse)

"""# **Using Tensorflow**"""

import tensorflow as tf
from sklearn.preprocessing import StandardScaler

# Prepare the data
X = data[['Converted_POSTAL_CODE']]
y = data[['LATITUDE', 'LONGITUDE']]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Data preprocessing (standardization of the input feature)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define a neural network model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(2)  # two output neurons: latitude and longitude
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model
loss = model.evaluate(X_test, y_test)
print("Test Loss:", loss)

# Make predictions
predictions = model.predict(X_test)

# Note: only the inputs were standardized, so the predicted latitude/longitude
# are already on the original scale and need no inverse transform.

# New inputs must go through the same scaler the network was trained with
zip_code_2d = scaler.transform(np.array([[6025]]))
print(model.predict(zip_code_2d))

import pickle

# Specify the file path where you want to save the trained model
model_filename = "model.pkl"

# Save the model to the file
# (for Keras models, model.save("model.keras") is the more robust option)
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)
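# --- Added sketch (not in the original notebook): reading the pickled model
# back and querying it, assuming the pickle round-trip succeeds for this Keras
# version. New zip codes must pass through the same StandardScaler the network
# was trained with; the in-memory `scaler` is reused here for illustration (in
# practice it should be persisted alongside the model).
with open(model_filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

query = scaler.transform(np.array([[6025]]))  # 6025 is an illustrative zip code
print(loaded_model.predict(query))            # -> [[latitude, longitude]]
--------------------------------------------------------------------------------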