├── README.md ├── Load_data.ipynb └── kmeans.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Prediction of Future Location using Hidden Markov Model 2 | The aim was to predict a user's future location using past data: if we can predict the location of any particular user, we can inform them about the real-time situation at the predicted location. We used the Geolife data set, which contains the locations of 182 users along with dates and some other features. So initially we clustered the nearby locations, and then a Hidden Markov Model was used to fit the data and predict the most probable future location of a particular user. 3 | 4 | Research paper followed: Detecting Meaningful Places and Predicting Locations Using Varied K-Means and Hidden Markov Model 5 | by Neelabh Pant and Ramez Elmasri 6 | 7 | Dataset: Geolife_Trajectories 8 | -------------------------------------------------------------------------------- /Load_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Populating the interactive namespace from numpy and matplotlib\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import numpy as np\n", 20 | "import os\n", 21 | "import time, calendar, datetime\n", 22 | "from mpl_toolkits.basemap import Basemap\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "%matplotlib inline\n", 25 | "%pylab inline\n", 26 | "pylab.rcParams['figure.figsize'] = (15, 15)\n", 27 | "import glob\n", 28 | "import pyproj\n", 29 | "import imageio" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "## Remove all warning \n", 41 | "import warnings\n", 42 |
"warnings.filterwarnings('ignore')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "raw", 47 | "metadata": {}, 48 | "source": [ 49 | "Reading all subfolder in Data folder" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "os.chdir('/home/gaurav/Desktop/Research Paper/Geolife Trajectories 1.3/Data')\n", 61 | "## Getting list of all users (182 users)\n", 62 | "list_of_folder = glob.glob('*') \n", 63 | "list_of_folder.sort()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "CPU times: user 3min 36s, sys: 3.1 s, total: 3min 39s\n", 78 | "Wall time: 5min 24s\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "%%time\n", 84 | "data=[0]*182\n", 85 | "k=0\n", 86 | "\n", 87 | "for folder in list_of_folder:\n", 88 | " # Changing subfolder or user per loop\n", 89 | " folder_name='/home/gaurav/Desktop/Research Paper/Geolife Trajectories 1.3/Data/'+folder+'/Trajectory'\n", 90 | " # Change directory\n", 91 | " os.chdir(folder_name) \n", 92 | " a=[]\n", 93 | " # List of all files for a particular user \n", 94 | " list_of_files = glob.glob('*.plt')\n", 95 | " for file_name in list_of_files:\n", 96 | " a.append((np.genfromtxt(file_name,delimiter=',',skip_header=6,names=['Latitude','Longitude','Zero','Altitude',\n", 97 | " 'Duration','Date','Time'],\n", 98 | " dtype=[('Latitude','f8'),('Longitude','f8'),('Zero','i8'),('Altitude','f8'),('Duration','f8'),\n", 99 | " ('Date','S10'),('Time','S8')])))\n", 100 | " data[k]=a\n", 101 | " # Switching over next user\n", 102 | " \n", 103 | " os.chdir('/home/gaurav/Desktop/Research Paper/Geolife Trajectories 1.3/Data')\n", 104 | " k=k+1\n", 105 | " " 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | 
"collapsed": false 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "data[0][0][0]" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "#### We have read all the files in Data folder\n", 124 | "data[i][j][k] represent ith user , jth timestamp ,kth row " 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 6, 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "# Let's load files of first user that is in subfolder 000 or data[0][:][:]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 7, 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "171" 149 | ] 150 | }, 151 | "execution_count": 7, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "len(data[0]) # Number of observation for user one\n", 158 | "\n" 159 | ] 160 | }, 161 | { 162 | "cell_type": "raw", 163 | "metadata": { 164 | "collapsed": false 165 | }, 166 | "source": [ 167 | "a=6\n", 168 | "lat_user_one=[]\n", 169 | "lon_user_one=[]\n", 170 | "# Additional for loop here will give plot for entire 182 user data\n", 171 | "for i in range (len(data[a])):\n", 172 | " for j in range(len(data[a][i])):\n", 173 | " lat_user_one.append(data[a][i][j]['Latitude'])\n", 174 | " lon_user_one.append(data[a][i][j]['Longitude'])\n", 175 | "map = Basemap(projection='merc', lat_0 =np.mean(lat_user_one) , lon_0 = np.mean(lon_user_one),\n", 176 | " resolution = 'h', area_thresh = 0.01,\n", 177 | " llcrnrlon=min(lon_user_one), llcrnrlat=min(lat_user_one),\n", 178 | " urcrnrlon=max(lon_user_one), urcrnrlat=max(lat_user_one))\n", 179 | "map.drawcoastlines()\n", 180 | "map.drawcountries()\n", 181 | "map.fillcontinents(color = 'coral',lake_color='aqua')\n", 182 | "map.drawmapboundary()\n", 183 | "x,y=map(lon_user_one,lat_user_one)\n", 184 | "map.plot(y,x, 'bo', 
markersize=5)\n" 185 | ] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 3", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.6.0" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 2 209 | } 210 | -------------------------------------------------------------------------------- /kmeans.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Import necessary packages" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "import numpy as np\n", 21 | "import sys\n", 22 | "import os\n", 23 | "from sklearn.metrics import pairwise_distances" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Implement k-means" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "Let us implement the k-means algorithm. First, we choose an initial set of centroids. 
A common practice is to choose randomly from the data points.\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "def get_initial_centroids(data, k, seed=None):\n", 49 | " '''Randomly choose k data points as initial centroids'''\n", 50 | " if seed is not None: \n", 51 | " np.random.seed(seed)\n", 52 | " n = data.shape[0] \n", 53 | " rand_indices = np.random.randint(0, n, k)\n", 54 | "\n", 55 | " centroids = data[rand_indices,:].toarray()\n", 56 | " \n", 57 | " return centroids" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "\n", 69 | "def assign_clusters(data, centroids):\n", 70 | " \n", 71 | " \n", 72 | " distances_from_centroids = pairwise_distances(data,centroids)\n", 73 | " \n", 74 | " \n", 75 | " cluster_assignment = np.argmin(distances_from_centroids,axis=1)\n", 76 | " \n", 77 | " return cluster_assignment" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "In pseudocode, we iteratively do the following:\n", 85 | "```\n", 86 | "cluster_assignment = assign_clusters(data, centroids)\n", 87 | "centroids = revise_centroids(data, k, cluster_assignment)\n", 88 | "```" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "### Assigning clusters" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "data = np.array([[1., 2., 0.],\n", 107 | " [0., 0., 0.],\n", 108 | " [2., 2., 0.]])\n", 109 | "centroids = np.array([[0.5, 0.5, 0.],\n", 110 | " [0., -0.5, 0.]])" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "Let's assign these data points to the closest centroid." 
118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "[0 1 0]\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "cluster_assignment = assign_clusters(data, centroids)\n", 137 | "print (cluster_assignment)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 6, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "def revise_centroids(data, k, cluster_assignment):\n", 149 | " new_centroids = []\n", 150 | " for i in range(k):\n", 151 | " \n", 152 | " member_data_points = data.loc[data.cluster_assignment==i,:]\n", 153 | " \n", 154 | " centroid = np.mean(member_data_points)\n", 155 | " \n", 156 | " \n", 157 | " centroid = centroid.A1\n", 158 | " new_centroids.append(centroid)\n", 159 | " new_centroids = np.array(new_centroids)\n", 160 | " \n", 161 | " return new_centroids" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "**The Basic objective of k-means is to reduce overall euclidean distance inside the cluster **" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 7, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "def compute_heterogeneity(data, k, centroids, cluster_assignment):\n", 180 | " \n", 181 | " heterogeneity = 0.0\n", 182 | " for i in range(k):\n", 183 | " \n", 184 | " \n", 185 | " member_data_points = data[cluster_assignment==i, :]\n", 186 | " \n", 187 | " if member_data_points.shape[0] > 0: \n", 188 | " \n", 189 | " distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')\n", 190 | " squared_distances = distances**2\n", 191 | " heterogeneity += np.sum(squared_distances)\n", 192 | " \n", 193 | " return heterogeneity" 194 | ] 195 | }, 196 | { 197 | 
"cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "Let's apply the above algorithm in our GPS data. **Note :We can't use euclidean distance here **" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### Loading data\n", 208 | "** Here we will load only 10,000 rows of our 'all.csv' file After doing this we will generalize our result to whole data**" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 9, 214 | "metadata": { 215 | "collapsed": false 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "data = pd.read_csv('all.csv',nrows=10000)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "## Finding the number of cluster " 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "**If a user is spending matore than 10 minutes at a particular location . That location is assigned as centroid **" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 10, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/html": [ 246 | "
| \n", 251 | " | index | \n", 252 | "lat | \n", 253 | "long | \n", 254 | "altitude | \n", 255 | "trajectory_id | \n", 256 | "subfolder | \n", 257 | "labels | \n", 258 | "datetime | \n", 259 | "distance | \n", 260 | "timedelta | \n", 261 | "velocity | \n", 262 | "acceleration | \n", 263 | "
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", 268 | "0 | \n", 269 | "39.984702 | \n", 270 | "116.318417 | \n", 271 | "492.0 | \n", 272 | "20081023025304 | \n", 273 | "0 | \n", 274 | "NaN | \n", 275 | "2008-10-23 02:53:04 | \n", 276 | "3.520694 | \n", 277 | "0 days 00:00:06.000000000 | \n", 278 | "0.586782 | \n", 279 | "-0.003189 | \n", 280 | "
| 1 | \n", 283 | "1 | \n", 284 | "39.984683 | \n", 285 | "116.318450 | \n", 286 | "492.0 | \n", 287 | "20081023025304 | \n", 288 | "0 | \n", 289 | "NaN | \n", 290 | "2008-10-23 02:53:10 | \n", 291 | "2.838241 | \n", 292 | "0 days 00:00:05.000000000 | \n", 293 | "0.567648 | \n", 294 | "-0.003841 | \n", 295 | "
| 2 | \n", 298 | "2 | \n", 299 | "39.984686 | \n", 300 | "116.318417 | \n", 301 | "492.0 | \n", 302 | "20081023025304 | \n", 303 | "0 | \n", 304 | "NaN | \n", 305 | "2008-10-23 02:53:15 | \n", 306 | "2.742220 | \n", 307 | "0 days 00:00:05.000000000 | \n", 308 | "0.548444 | \n", 309 | "0.332144 | \n", 310 | "
| 3 | \n", 313 | "3 | \n", 314 | "39.984688 | \n", 315 | "116.318385 | \n", 316 | "492.0 | \n", 317 | "20081023025304 | \n", 318 | "0 | \n", 319 | "NaN | \n", 320 | "2008-10-23 02:53:20 | \n", 321 | "11.045822 | \n", 322 | "0 days 00:00:05.000000000 | \n", 323 | "2.209164 | \n", 324 | "0.391130 | \n", 325 | "
| 4 | \n", 328 | "4 | \n", 329 | "39.984655 | \n", 330 | "116.318263 | \n", 331 | "492.0 | \n", 332 | "20081023025304 | \n", 333 | "0 | \n", 334 | "NaN | \n", 335 | "2008-10-23 02:53:25 | \n", 336 | "20.824082 | \n", 337 | "0 days 00:00:05.000000000 | \n", 338 | "4.164816 | \n", 339 | "0.072513 | \n", 340 | "