├── figure
│   ├── 01.png
│   ├── 02.png
│   ├── 03.png
│   ├── 04.png
│   ├── m1.png
│   ├── t1.png
│   ├── t2.png
│   ├── t3.png
│   ├── alg1.png
│   ├── alg2.png
│   ├── alg3.png
│   └── system.png
├── code
│   ├── examples_bayes
│   │   ├── example_other
│   │   │   ├── stan-reference-2.6.0.pdf
│   │   │   ├── test_emcee.py
│   │   │   └── test_pystan.py
│   │   ├── example_pymc
│   │   │   ├── ldamc.py
│   │   │   └── disaster_model.py
│   │   └── bayes_model.py
│   ├── examples_sklearn
│   │   ├── plot_gmm_pdf.py
│   │   ├── plot_gmm.py
│   │   ├── plot_gmm_sin.py
│   │   ├── plot_gmm_selection.py
│   │   ├── plot_gmm_classifier.py
│   │   └── sk_gmm.py
│   ├── extract_trace.py
│   ├── model_baseline1.py
│   ├── stationary_detection.py
│   ├── model_baseline2.py
│   ├── stationary_segmentation.py
│   └── model_combine.py
└── README.md

/figure/01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/01.png
--------------------------------------------------------------------------------
/figure/02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/02.png
--------------------------------------------------------------------------------
/figure/03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/03.png
--------------------------------------------------------------------------------
/figure/04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/04.png
--------------------------------------------------------------------------------
/figure/m1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/m1.png
--------------------------------------------------------------------------------
/figure/t1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/t1.png
--------------------------------------------------------------------------------
/figure/t2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/t2.png
--------------------------------------------------------------------------------
/figure/t3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/t3.png
--------------------------------------------------------------------------------
/figure/alg1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/alg1.png
--------------------------------------------------------------------------------
/figure/alg2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/alg2.png
--------------------------------------------------------------------------------
/figure/alg3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/alg3.png
--------------------------------------------------------------------------------
/figure/system.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/system.png -------------------------------------------------------------------------------- /code/examples_bayes/example_other/stan-reference-2.6.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/code/examples_bayes/example_other/stan-reference-2.6.0.pdf -------------------------------------------------------------------------------- /code/examples_bayes/example_other/test_emcee.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import emcee 3 | 4 | def lnprob(x, ivar): 5 | return -0.5 * np.sum(ivar * x ** 2) 6 | 7 | ndim, nwalkers = 10, 100 8 | ivar = 1. / np.random.rand(ndim) 9 | p0 = [np.random.rand(ndim) for i in range(nwalkers)] 10 | sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=[ivar]) 11 | sampler.run_mcmc(p0, 1000) 12 | -------------------------------------------------------------------------------- /code/examples_bayes/example_other/test_pystan.py: -------------------------------------------------------------------------------- 1 | # import numpy as np 2 | # import emcee 3 | 4 | # def lnprob(x, ivar): 5 | # return -0.5 * np.sum(ivar * x ** 2) 6 | 7 | # ndim, nwalkers = 10, 100 8 | # ivar = 1. / np.random.rand(ndim) 9 | # p0 = [np.random.rand(ndim) for i in range(nwalkers)] 10 | # sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=[ivar]) 11 | # sampler.run_mcmc(p0, 1000) 12 | 13 | import pystan 14 | 15 | schools_code = """ 16 | data { 17 | int J; // number of schools 18 | real y[J]; // estimated treatment effects 19 | real sigma[J]; // s.e. 
of effect estimates 20 | } 21 | parameters { 22 | real mu; 23 | real tau; 24 | real eta[J]; 25 | } 26 | transformed parameters { 27 | real theta[J]; 28 | for (j in 1:J) 29 | theta[j] <- mu + tau * eta[j]; 30 | } 31 | model { 32 | eta ~ normal(0, 1); 33 | y ~ normal(theta, sigma); 34 | } 35 | """ 36 | 37 | schools_dat = {'J': 8, 38 | 'y': [28, 8, -3, 7, -1, 1, 18, 12], 39 | 'sigma': [15, 10, 16, 11, 9, 11, 10, 18]} 40 | 41 | fit = pystan.stan(model_code=schools_code, data=schools_dat, iter=1000, chains=4) 42 | -------------------------------------------------------------------------------- /code/examples_bayes/example_pymc/ldamc.py: -------------------------------------------------------------------------------- 1 | import time 2 | import fileinput 3 | import pymc as pm 4 | import numpy as np 5 | 6 | st = time.time() 7 | K = 2 # number of topics 8 | V = 3 # number of words 9 | D = 3 # number of documents 10 | data = np.array([[1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2], [2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) 11 | alpha = np.ones(K) 12 | beta = np.ones(V) 13 | theta = pm.Container([pm.CompletedDirichlet("theta_%s" % i, pm.Dirichlet("ptheta_%s" % i, theta=alpha)) for i in range(D)]) 14 | phi = pm.Container([pm.CompletedDirichlet("phi_%s" % k, pm.Dirichlet("pphi_%s" % k, theta=beta)) for k in range(K)]) 15 | Wd = [len(doc) for doc in data] 16 | z = pm.Container([pm.Categorical('z_%i' % d, p=theta[d], size=Wd[d], value=np.random.randint(K,size=Wd[d])) for d in range(D)]) 17 | w = pm.Container([pm.Categorical("w_%i_%i" % (d,i), p=pm.Lambda('phi_z_%i_%i' % (d,i), lambda z=z[d][i], phi=phi:phi[z]), value=data[d][i], observed=True) for d in range(D) for i in range(Wd[d])]) 18 | model = pm.Model([theta, phi, z, w]) 19 | mcmc = pm.MCMC(model) 20 | mcmc.sample(1000) 21 | ft = time.time() 22 | print ft-st 23 | print theta.value 24 | print phi.value -------------------------------------------------------------------------------- /code/examples_sklearn/plot_gmm_pdf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib.colors import LogNorm 4 | from sklearn import mixture 5 | 6 | n_samples = 300 7 | # generate random sample, two components 8 | np.random.seed(0) 9 | # generate spherical data centered on (20, 20) 10 | shifted_gaussian = np.random.randn(n_samples, 2) + np.array([20, 20]) 11 | # generate zero centered stretched Gaussian data 12 | C = np.array([[0., -0.7], [3.5, .7]]) 13 | stretched_gaussian = np.dot(np.random.randn(n_samples, 2), C) 14 | # concatenate the two datasets into the final training set 15 | X_train = np.vstack([shifted_gaussian, stretched_gaussian]) 16 | # fit a Gaussian Mixture Model with two components 17 | clf = mixture.GMM(n_components=2, covariance_type='full') 18 | clf.fit(X_train) 19 | 20 | # display predicted scores by the model as a contour plot 21 | x = np.linspace(-20.0, 30.0) 22 | y = np.linspace(-20.0, 40.0) 23 | X, Y = np.meshgrid(x, y) 24 | XX = np.array([X.ravel(), Y.ravel()]).T 25 | Z = -clf.score_samples(XX)[0] 26 | Z = Z.reshape(X.shape) 27 | 28 | CS = plt.contour(X, Y, Z, norm=LogNorm(vmin=1.0, vmax=1000.0), levels=np.logspace(0, 3, 10)) 29 | CB = plt.colorbar(CS, shrink=0.8, extend='both') 30 | plt.scatter(X_train[:, 0], X_train[:, 1], .8) 31 | 32 | plt.title('Negative log-likelihood predicted by a GMM') 33 | 
plt.axis('tight') 34 | plt.show() 35 | -------------------------------------------------------------------------------- /code/examples_bayes/example_pymc/disaster_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | A model for the disasters data with a changepoint 3 | 4 | changepoint ~ U(0, 110) 5 | early_mean ~ Exp(1.) 6 | late_mean ~ Exp(1.) 7 | disasters[t] ~ Po(early_mean if t <= switchpoint, late_mean otherwise) 8 | 9 | """ 10 | 11 | from pymc import * 12 | from numpy import array, empty 13 | from numpy.random import randint 14 | 15 | disasters_array = array([4, 5, 4, 0, 1, 4, 3, 4, 0, 6, 3, 3, 4, 0, 2, 6, 16 | 3, 3, 5, 4, 5, 3, 1, 4, 4, 1, 5, 5, 3, 4, 2, 5, 17 | 2, 2, 3, 4, 2, 1, 3, 2, 2, 1, 1, 1, 1, 3, 0, 0, 18 | 1, 0, 1, 1, 0, 0, 3, 1, 0, 3, 2, 2, 0, 1, 1, 1, 19 | 0, 1, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 0, 2, 20 | 3, 3, 1, 1, 2, 1, 1, 1, 1, 2, 4, 2, 0, 0, 1, 4, 21 | 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1]) 22 | 23 | # Define data and stochastics 24 | switchpoint = DiscreteUniform( 25 | 'switchpoint', 26 | lower=0, 27 | upper=110, 28 | doc='Switchpoint[year]') 29 | 30 | early_mean = Exponential('early_mean', beta=1.) 31 | late_mean = Exponential('late_mean', beta=1.) 32 | 33 | @deterministic(plot=False) 34 | def rate(s=switchpoint, e=early_mean, l=late_mean): 35 | ''' Concatenate Poisson means ''' 36 | out = empty(len(disasters_array)) 37 | out[:s] = e 38 | out[s:] = l 39 | return out 40 | 41 | disasters = Poisson('disasters', mu=rate, value=disasters_array, observed=True) 42 | 43 | # import disaster_model 44 | from pymc import MCMC 45 | # M = MCMC(disaster_model) 46 | M = MCMC([switchpoint,early_mean,late_mean,rate,disasters]) 47 | M.sample(iter=10000, burn=1000, thin=10) 48 | print switchpoint.value 49 | print rate.value 50 | print M.trace('switchpoint')[:] 51 | # from pymc.Matplot import plot 52 | # plot(M) 53 | -------------------------------------------------------------------------------- /code/examples_sklearn/plot_gmm.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | from scipy import linalg 4 | import matplotlib.pyplot as plt 5 | import matplotlib as mpl 6 | from sklearn import mixture 7 | 8 | # Number of samples per component 9 | n_samples = 500 10 | # Generate random sample, two components 11 | np.random.seed(0) 12 | C = np.array([[0., -0.1], [1.7, .4]]) 13 | X = np.r_[np.dot(np.random.randn(n_samples, 2), C), .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])] 14 | # Fit a mixture of Gaussians with EM using five components 15 | gmm = mixture.GMM(n_components=5, covariance_type='full') 16 | gmm.fit(X) 17 | # Fit a Dirichlet process mixture of Gaussians using five components 18 | dpgmm = mixture.DPGMM(n_components=5, covariance_type='full') 19 | dpgmm.fit(X) 20 | 21 | color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm']) 22 | for i, (clf, title) in enumerate([(gmm, 'GMM'), (dpgmm, 'Dirichlet Process GMM')]): 23 | splot = plt.subplot(2, 1, 1 + i) 24 | Y_ = clf.predict(X) 25 | for i, (mean, covar, color) in enumerate(zip(clf.means_, clf._get_covars(), color_iter)): 26 | v, w = linalg.eigh(covar) 27 | u = w[0] / linalg.norm(w[0]) 28 | # as the DP will not use every component it has access to unless it needs it, we shouldn't plot the redundant components. 
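
The examples_sklearn scripts (including plot_gmm.py shown here) target the pre-0.18 scikit-learn mixture API — `mixture.GMM`, `mixture.DPGMM` and the `_get_covars()` accessor — which was removed in scikit-learn 0.20. As a hedged sketch that is not part of the original repository (assuming scikit-learn >= 0.20), the same GMM versus Dirichlet-process comparison could be reproduced with the current classes roughly as follows:

```python
# Sketch only: modern scikit-learn (>= 0.20) equivalent of the legacy
# mixture.GMM / mixture.DPGMM comparison in plot_gmm.py; data generation
# mirrors the script above.
import numpy as np
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

np.random.seed(0)
n_samples = 500
C = np.array([[0., -0.1], [1.7, .4]])
X = np.r_[np.dot(np.random.randn(n_samples, 2), C),
          .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])]

# Plain EM-fitted mixture (replaces mixture.GMM)
gmm = GaussianMixture(n_components=5, covariance_type='full').fit(X)

# Variational mixture with a Dirichlet-process prior (replaces mixture.DPGMM);
# components that are not needed simply receive negligible weight.
dpgmm = BayesianGaussianMixture(
    n_components=5, covariance_type='full',
    weight_concentration_prior_type='dirichlet_process').fit(X)

for name, model in [('GaussianMixture', gmm), ('BayesianGaussianMixture', dpgmm)]:
    print(name, np.round(model.weights_, 3))
    print(model.covariances_.shape)  # covariances_ replaces _get_covars()
```
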
29 | if not np.any(Y_ == i): 30 | continue 31 | plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) 32 | # Plot an ellipse to show the Gaussian component 33 | angle = np.arctan(u[1] / u[0]) 34 | angle = 180 * angle / np.pi # convert to degrees 35 | ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color) 36 | ell.set_clip_box(splot.bbox) 37 | ell.set_alpha(0.5) 38 | splot.add_artist(ell) 39 | plt.xlim(-10, 10) 40 | plt.ylim(-3, 6) 41 | plt.xticks(()) 42 | plt.yticks(()) 43 | plt.title(title) 44 | 45 | plt.show() 46 | -------------------------------------------------------------------------------- /code/extract_trace.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | from operator import add 5 | from pyspark import SparkConf 6 | from pyspark import SparkContext 7 | 8 | def extract(line): 9 | import time 10 | try: 11 | part = line.strip().replace('\"','').split(",") 12 | TTIME, LAC, CI, IMSI = part[1].split(" "), part[3], part[4], part[5] 13 | pt1, pt2, pt3 = TTIME[0].split("-"), TTIME[1].split("."), TTIME[2] 14 | year, month, day, hour, minute, second = int("20"+pt1[2]), {"AUG":8}[pt1[1]], int(pt1[0]), int(pt2[0]), int(pt2[1]), int(pt2[2]) 15 | hour = hour if hour != 12 else 0 16 | hour = hour if pt3 == "AM" else hour+12 17 | secs = hour*3600+minute*60+second 18 | key = LAC+" "+CI 19 | sl = secs/(10*60) 20 | if bss.has_key(key): 21 | bs = bss[key] 22 | lng, lat = bs["lng"], bs["lat"] 23 | if 120.02<=lng<120.48 and 30.15<=lat<=30.42: 24 | gx, gy = int((lng-120.02)/(120.48-120.02)*225), int((lat-30.15)/(30.42-30.15)*150) 25 | return ((IMSI, sl), str(gx)+","+str(gy)) 26 | else: 27 | return (("", -1), "") 28 | else: 29 | return (("", -1), "") 30 | except: 31 | return (("", -1), "") 32 | 33 | global bss 34 | 35 | if __name__ == "__main__": 36 | import fileinput 37 | bss = {} 38 | for line in fileinput.input("hz_base.txt"): 39 | part = line.strip().split(" ") 40 | num, lng, lat = part[1]+" "+part[2], float(part[3]), float(part[4]) 41 | bss[num] = {"lng":lng, "lat":lat} 42 | fileinput.close() 43 | conf = SparkConf().setMaster('yarn-client') \ 44 | .setAppName('qiangsiwei') \ 45 | .set('spark.driver.maxResultSize', "8g") 46 | sc = SparkContext(conf = conf) 47 | filename = "0826" 48 | lines = sc.textFile("hdfs://namenode.omnilab.sjtu.edu.cn/user/qiangsiwei/hangzhou/original/{0}.csv".format(filename), 1) 49 | counts = lines.map(lambda x : extract(x)) \ 50 | .filter(lambda x : x[0][0]!="" and x[0][1]!=-1 and x[1]!="") \ 51 | .distinct() \ 52 | .groupByKey() \ 53 | .map(lambda x : (x[0][0],str(x[0][1])+":"+"-".join(sorted(x[1])))) \ 54 | .groupByKey() \ 55 | .map(lambda x : x[0]+"\t"+"|".join([str(it["sl"])+":"+it["gs"] for it in sorted([{"sl":int(line.split(":")[0]),"gs":line.split(":")[1]} for line in x[1]], key=lambda x:x["sl"])])) 56 | output = counts.saveAsTextFile("./hangzhou/SSTD/3G/{0}.csv".format(filename)) 57 | -------------------------------------------------------------------------------- /code/examples_sklearn/plot_gmm_sin.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | from scipy import linalg 4 | import matplotlib.pyplot as plt 5 | import matplotlib as mpl 6 | from sklearn import mixture 7 | from sklearn.externals.six.moves import xrange 8 | 9 | # Number of samples per component 10 | n_samples = 100 11 | # Generate random sample following a sine curve 12 | np.random.seed(0) 13 | X = 
np.zeros((n_samples, 2)) 14 | step = 4 * np.pi / n_samples 15 | 16 | for i in xrange(X.shape[0]): 17 | x = i * step - 6 18 | X[i, 0] = x + np.random.normal(0, 0.1) 19 | X[i, 1] = 3 * (np.sin(x) + np.random.normal(0, .2)) 20 | color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm']) 21 | 22 | # print X 23 | # # est = mixture.GMM(n_components=10, covariance_type='full', n_iter=100) 24 | # est = mixture.DPGMM(n_components=10, covariance_type='spherical', alpha=100., n_iter=100) 25 | # est.fit(X) 26 | # print est.means_ 27 | 28 | for i, (clf, title) in enumerate([ 29 | (mixture.GMM(n_components=10, covariance_type='spherical', n_iter=100), "Expectation-maximization"), 30 | (mixture.GMM(n_components=10, covariance_type='diag', n_iter=100), "Expectation-maximization"), 31 | (mixture.GMM(n_components=10, covariance_type='tied', n_iter=100), "Expectation-maximization"), 32 | (mixture.GMM(n_components=10, covariance_type='full', n_iter=100), "Expectation-maximization")]): 33 | clf.fit(X) 34 | splot = plt.subplot(4, 1, 1 + i) 35 | Y_ = clf.predict(X) 36 | for i, (mean, covar, color) in enumerate(zip(clf.means_, clf._get_covars(), color_iter)): 37 | print mean 38 | v, w = linalg.eigh(covar) 39 | u = w[0] / linalg.norm(w[0]) 40 | # as the DP will not use every component it has access to unless it needs it, we shouldn't plot the redundant components. 41 | if not np.any(Y_ == i): 42 | continue 43 | plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) 44 | # Plot an ellipse to show the Gaussian component 45 | angle = np.arctan(u[1] / u[0]) 46 | angle = 180 * angle / np.pi # convert to degrees 47 | ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color) 48 | ell.set_clip_box(splot.bbox) 49 | ell.set_alpha(0.5) 50 | splot.add_artist(ell) 51 | plt.xlim(-6, 4 * np.pi - 6) 52 | plt.ylim(-5, 5) 53 | plt.title(title) 54 | plt.xticks(()) 55 | plt.yticks(()) 56 | print "----- ----- -----" 57 | 58 | plt.show() 59 | -------------------------------------------------------------------------------- /code/examples_sklearn/plot_gmm_selection.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | from scipy import linalg 4 | import matplotlib.pyplot as plt 5 | import matplotlib as mpl 6 | from sklearn import mixture 7 | 8 | # Number of samples per component 9 | n_samples = 500 10 | # Generate random sample, two components 11 | np.random.seed(0) 12 | C = np.array([[0., -0.1], [1.7, .4]]) 13 | X = np.r_[np.dot(np.random.randn(n_samples, 2), C), .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])] 14 | 15 | lowest_bic, bic = np.infty, [] 16 | n_components_range = range(1, 7) 17 | cv_types = ['spherical', 'tied', 'diag', 'full'] 18 | for cv_type in cv_types: 19 | for n_components in n_components_range: 20 | # Fit a mixture of Gaussians with EM 21 | gmm = mixture.GMM(n_components=n_components, covariance_type=cv_type) 22 | gmm.fit(X) 23 | bic.append(gmm.bic(X)) 24 | if bic[-1] < lowest_bic: 25 | lowest_bic = bic[-1] 26 | best_gmm = gmm 27 | 28 | bic = np.array(bic) 29 | color_iter = itertools.cycle(['k', 'r', 'g', 'b', 'c', 'm', 'y']) 30 | clf, bars = best_gmm, [] 31 | 32 | # Plot the BIC scores 33 | spl = plt.subplot(2, 1, 1) 34 | for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)): 35 | xpos = np.array(n_components_range) + .2 * (i - 2) 36 | bars.append(plt.bar(xpos, bic[i * len(n_components_range):(i + 1) * len(n_components_range)], width=.2, color=color)) 37 | plt.xticks(n_components_range) 38 | 
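
The BIC selection loop in plot_gmm_selection.py above can also be written compactly against the newer API; again a sketch assuming scikit-learn >= 0.20, not the repository's code:

```python
# Sketch: BIC selection over covariance types and component counts,
# mirroring plot_gmm_selection.py with the modern GaussianMixture class.
import numpy as np
from sklearn.mixture import GaussianMixture

np.random.seed(0)
C = np.array([[0., -0.1], [1.7, .4]])
X = np.r_[np.dot(np.random.randn(500, 2), C),
          .7 * np.random.randn(500, 2) + np.array([-6, 3])]

candidates = [GaussianMixture(n_components=k, covariance_type=cv).fit(X)
              for cv in ('spherical', 'tied', 'diag', 'full')
              for k in range(1, 7)]
best = min(candidates, key=lambda m: m.bic(X))
print(best.covariance_type, best.n_components, best.bic(X))
```
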
plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()]) 39 | plt.title('BIC score per model') 40 | xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 + .2 * np.floor(bic.argmin() / len(n_components_range)) 41 | plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14) 42 | spl.set_xlabel('Number of components') 43 | spl.legend([b[0] for b in bars], cv_types) 44 | 45 | # Plot the winner 46 | splot = plt.subplot(2, 1, 2) 47 | Y_ = clf.predict(X) 48 | for i, (mean, covar, color) in enumerate(zip(clf.means_, clf.covars_, color_iter)): 49 | v, w = linalg.eigh(covar) 50 | if not np.any(Y_ == i): 51 | continue 52 | plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) 53 | # Plot an ellipse to show the Gaussian component 54 | angle = np.arctan2(w[0][1], w[0][0]) 55 | angle = 180 * angle / np.pi # convert to degrees 56 | v *= 4 57 | ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color) 58 | ell.set_clip_box(splot.bbox) 59 | ell.set_alpha(.5) 60 | splot.add_artist(ell) 61 | 62 | plt.xlim(-10, 10) 63 | plt.ylim(-3, 6) 64 | plt.xticks(()) 65 | plt.yticks(()) 66 | plt.title('Selected GMM: full model, 2 components') 67 | plt.subplots_adjust(hspace=.35, bottom=.02) 68 | plt.show() 69 | -------------------------------------------------------------------------------- /code/examples_sklearn/plot_gmm_classifier.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import matplotlib as mpl 3 | import numpy as np 4 | 5 | from sklearn import datasets 6 | from sklearn.cross_validation import StratifiedKFold 7 | from sklearn.externals.six.moves import xrange 8 | from sklearn.mixture import GMM 9 | 10 | def make_ellipses(gmm, ax): 11 | for n, color in enumerate('rgb'): 12 | v, w = np.linalg.eigh(gmm._get_covars()[n][:2, :2]) 13 | u = w[0] / np.linalg.norm(w[0]) 14 | angle = np.arctan2(u[1], u[0]) 15 | angle = 180 * angle / np.pi 16 | v *= 9 17 | ell = mpl.patches.Ellipse(gmm.means_[n, :2], v[0], v[1], 180 + angle, color=color) 18 | ell.set_clip_box(ax.bbox) 19 | ell.set_alpha(0.5) 20 | ax.add_artist(ell) 21 | 22 | iris = datasets.load_iris() 23 | skf = StratifiedKFold(iris.target, n_folds=4) 24 | train_index, test_index = next(iter(skf)) 25 | 26 | X_train = iris.data[train_index] 27 | y_train = iris.target[train_index] 28 | X_test = iris.data[test_index] 29 | y_test = iris.target[test_index] 30 | n_classes = len(np.unique(y_train)) 31 | 32 | # Try GMMs using different types of covariances. 
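
plot_gmm_classifier.py likewise relies on removed interfaces (`sklearn.cross_validation.StratifiedKFold(y, n_folds=4)` and `init_params='wc'`). A possible modern equivalent uses `means_init` to seed each component at a class mean; this is a hedged sketch assuming scikit-learn >= 0.20, not the repository's code:

```python
# Sketch: per-class initialisation of a GaussianMixture classifier,
# following the structure of plot_gmm_classifier.py.
import numpy as np
from sklearn import datasets
from sklearn.model_selection import StratifiedKFold
from sklearn.mixture import GaussianMixture

iris = datasets.load_iris()
train_idx, test_idx = next(StratifiedKFold(n_splits=4).split(iris.data, iris.target))
X_train, y_train = iris.data[train_idx], iris.target[train_idx]
X_test, y_test = iris.data[test_idx], iris.target[test_idx]
n_classes = len(np.unique(y_train))

for cov in ('spherical', 'diag', 'tied', 'full'):
    means = np.array([X_train[y_train == c].mean(axis=0) for c in range(n_classes)])
    clf = GaussianMixture(n_components=n_classes, covariance_type=cov,
                          means_init=means, max_iter=20).fit(X_train)
    acc = 100 * np.mean(clf.predict(X_test) == y_test)
    print('%-9s test accuracy: %.1f%%' % (cov, acc))
```
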
33 | classifiers = dict((covar_type, GMM(n_components=n_classes, covariance_type=covar_type, init_params='wc', n_iter=20)) for covar_type in ['spherical', 'diag', 'tied', 'full']) 34 | n_classifiers = len(classifiers) 35 | plt.figure(figsize=(3 * n_classifiers / 2, 6)) 36 | plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05, left=.01, right=.99) 37 | 38 | for index, (name, classifier) in enumerate(classifiers.items()): 39 | classifier.means_ = np.array([X_train[y_train == i].mean(axis=0) for i in xrange(n_classes)]) 40 | classifier.fit(X_train) 41 | h = plt.subplot(2, n_classifiers / 2, index + 1) 42 | make_ellipses(classifier, h) 43 | for n, color in enumerate('rgb'): 44 | data = iris.data[iris.target == n] 45 | plt.scatter(data[:, 0], data[:, 1], 0.8, color=color, label=iris.target_names[n]) 46 | for n, color in enumerate('rgb'): 47 | data = X_test[y_test == n] 48 | plt.plot(data[:, 0], data[:, 1], 'x', color=color) 49 | y_train_pred = classifier.predict(X_train) 50 | train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100 51 | plt.text(0.05, 0.9, 'Train accuracy: %.1f' % train_accuracy, transform=h.transAxes) 52 | y_test_pred = classifier.predict(X_test) 53 | test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100 54 | plt.text(0.05, 0.8, 'Test accuracy: %.1f' % test_accuracy, transform=h.transAxes) 55 | plt.xticks(()) 56 | plt.yticks(()) 57 | plt.title(name) 58 | 59 | plt.legend(loc='lower right', prop=dict(size=12)) 60 | plt.show() 61 | -------------------------------------------------------------------------------- /code/model_baseline1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import glob 4 | import math 5 | import json 6 | import random 7 | import fileinput 8 | 9 | # 时间粒度为10分钟 10 | # 空间粒度为200米 11 | # 20-24为工作日 12 | 13 | def euclidean(p1, p2): 14 | return ((p1[0]-p2[0])**2+(p1[1]-p2[1])**2)**0.5 15 | 16 | def gauss(y, mu, sigma): 17 | # print "gauss computation:", y, mu, sigma 18 | return 1./math.sqrt((sigma[0][0]*sigma[1][1]-sigma[0][1]*sigma[1][0]))*math.exp(-0.5/(sigma[0][0]*sigma[1][1]-sigma[0][1]*sigma[1][0])*(sigma[0][0]*(y[0]-mu[0])**2-(y[0]-mu[0])*(y[1]-mu[1])*(sigma[0][1]+sigma[1][0])+sigma[1][1]*(y[1]-mu[1])**2)) 19 | # print gauss([1,1],[0,0],[[1,0],[0,1]]) 20 | 21 | K, R, data = 5, 20, [] 22 | 23 | # GMM 24 | def run_baseline1(): 25 | import numpy as np 26 | from sklearn import mixture 27 | 28 | for line in fileinput.input("../data/stationary.txt"): 29 | st, ft, gx, gy = int(line.strip().split(" ")[3]), int(line.strip().split(" ")[4]), int(line.strip().split(" ")[5]), int(line.strip().split(" ")[6]) 30 | data.append([st,ft,gx,gy]) 31 | fileinput.close() 32 | 33 | likelihood, param = 0, {} 34 | gmm_temporal = mixture.GMM(covariance_type="full",n_components=K) 35 | gmm_temporal.fit(np.array([sample[:2] for sample in data])) 36 | data_spatial = [[] for k in xrange(K)] 37 | param['gmm_temporal_weights'] = gmm_temporal.weights_.tolist() 38 | param['gmm_temporal_means'] = gmm_temporal.means_.tolist() 39 | param['gmm_temporal_covars'] = gmm_temporal.covars_.tolist() 40 | param['gmm_spatial'] = [] 41 | for i, k in enumerate(gmm_temporal.predict([sample[:2] for sample in data])): 42 | data_spatial[k].append(data[i]) 43 | for k in xrange(K): 44 | print '-'*10, k, '-'*10 45 | gmm_spatial = mixture.GMM(covariance_type="full",n_components=R) 46 | gmm_spatial.fit(np.array([sample[2:] for sample in data_spatial[k]])) 47 | param['gmm_spatial'].append({ 48 | 
'weights': gmm_spatial.weights_.tolist(), 49 | 'means': gmm_spatial.means_.tolist(), 50 | 'covars': gmm_spatial.covars_.tolist() 51 | }) 52 | for j, r in enumerate(gmm_spatial.predict([sample[2:] for sample in data_spatial[k]])): 53 | prob = 1.*gmm_temporal.weights_[k]*gmm_spatial.weights_[r]*\ 54 | gauss(data_spatial[k][j][:2],gmm_temporal.means_[k],gmm_temporal.covars_[k])*\ 55 | gauss(data_spatial[k][j][2:],gmm_spatial.means_[r],gmm_spatial.covars_[r]) 56 | likelihood += -math.log10(prob) 57 | print likelihood 58 | 59 | with open('model_save/baseline1.txt','w') as f: 60 | f.write(json.dumps(param)) 61 | 62 | 63 | def compute_error(): 64 | import numpy as np 65 | 66 | param = json.loads(open('model_save/baseline1.txt','r').read()) 67 | gmm_temporal_weights = param['gmm_temporal_weights'] 68 | gmm_temporal_means = param['gmm_temporal_means'] 69 | gmm_temporal_covars = param['gmm_temporal_covars'] 70 | gmm_spatial = param['gmm_spatial'] 71 | 72 | # 时间分布 73 | matrix1 = [[0 for j in xrange(24*6)] for i in xrange(24*6)] 74 | for line in fileinput.input("../data/stationary.txt"): 75 | st, ft = int(line.strip().split(" ")[3]), int(line.strip().split(" ")[4]) 76 | matrix1[st][ft] += 1 77 | fileinput.close() 78 | matrix2 = [[0 for j in xrange(24*6)] for i in xrange(24*6)] 79 | for k in xrange(K): 80 | for st in xrange(24*6): 81 | for ft in xrange(24*6): 82 | matrix2[st][ft] += 1.*gmm_temporal_weights[k]*gauss([st,ft],gmm_temporal_means[k],gmm_temporal_covars[k]) 83 | matrix1 = 1.*np.array(matrix1)/np.array(matrix1).sum() 84 | matrix2 = 1.*np.array(matrix2)/np.array(matrix2).sum() 85 | print "Temporal reconstruction accuracy:", 1-(abs(matrix1-matrix2)[40:60,100:140].sum()+abs(matrix1-matrix2)[100:140,40:60].sum()) 86 | 87 | # 空间分布 88 | matrix1 = [[0 for j in xrange(150)] for i in xrange(225)] 89 | for line in fileinput.input("../data/stationary.txt"): 90 | gx, gy = int(line.strip().split(" ")[5]), int(line.strip().split(" ")[6]) 91 | matrix1[gx][gy] += 1 92 | fileinput.close() 93 | matrix2 = [[0 for j in xrange(150)] for i in xrange(225)] 94 | for k in xrange(K): 95 | for r in xrange(R): 96 | for gx in xrange(225): 97 | for gy in xrange(150): 98 | matrix2[gx][gy] += 1.*gmm_temporal_weights[k]*gmm_spatial[k]['weights'][r]*gauss([gx,gy],gmm_spatial[k]['means'][r],gmm_spatial[k]['covars'][r]) 99 | matrix1 = 1.*np.array(matrix1)/np.array(matrix1).sum() 100 | matrix2 = 1.*np.array(matrix2)/np.array(matrix2).sum() 101 | print "Spatial reconstruction accuracy:", 1-abs(matrix1-matrix2)[50:90,50:90].sum() 102 | 103 | 104 | if __name__ == "__main__": 105 | # run_baseline1() 106 | compute_error() 107 | -------------------------------------------------------------------------------- /code/stationary_detection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import gzip 4 | import fileinput 5 | import numpy as np 6 | from pylab import * 7 | 8 | # 时间粒度为10分钟 9 | # 空间粒度为200米 10 | # 20-24为工作日 11 | 12 | def euclidean(p1, p2): 13 | return 200*((p1[0]-p2[0])**2+(p1[1]-p2[1])**2)**0.5 14 | 15 | # 轨迹连接 16 | def trajectory_concat(): 17 | users = {} 18 | for day, filename in enumerate(['0820','0821','0822','0823','0824']): 19 | print filename 20 | for line in gzip.open("../data/3G/{0}.txt.gz".format(filename)): 21 | uid, slices = line.strip().split("\t") 22 | slices = ["{0}:{1}".format(int(sl.split(":")[0])+day*6*24,sl) 23 | for sl in slices.split("|")] 24 | users[uid] = users.get(uid,[]) 25 | users[uid].extend(slices) 26 | with 
open("../data/trace.txt","w") as f: 27 | for uid, slices in users.iteritems(): 28 | f.write("{0}\t{1}\n".format(uid,"|".join(slices))) 29 | 30 | # 阈值距离为1000米,时间为1小时,仅筛选出工作日 31 | def stationary_detection(): 32 | min_distance, min_duration, min_session = 1000, 1*60/10, 10 33 | with open("../data/stationary.txt", "w") as f: 34 | # line_num = 0 35 | for line in gzip.open("../data/trace.txt.gz"): 36 | # line_num += 1 37 | # print line_num 38 | uid = line.strip().split("\t")[0] 39 | session_list, session_current, slices = [], [], [(int(sl.split(":")[0]), \ 40 | int(sl.split(":")[1]), \ 41 | sum([int(p.split(",")[0]) for p in sl.split(":")[2].split("-")])/len(sl.split(":")[2].split("-")), \ 42 | sum([int(p.split(",")[1]) for p in sl.split(":")[2].split("-")])/len(sl.split(":")[2].split("-"))) \ 43 | for sl in line.strip().split("\t")[1].split("|")] 44 | for sl in slices: 45 | if len(session_current) == 0: 46 | session_current = [sl] 47 | else: 48 | if euclidean(sl[2:],session_current[-1][2:]) >= min_distance: 49 | if session_current[-1][0]-session_current[0][0] >= min_duration: 50 | session_list.append(session_current) 51 | session_current = [sl] 52 | else: 53 | session_current.append(sl) 54 | if session_current[-1][0]-session_current[0][0] >= min_duration: 55 | session_list.append(session_current) 56 | if len(session_list) >= min_session: 57 | for i in range(1,len(session_list)-1): 58 | if len(session_list[i]) >= 2 and 1*24*60/10 < session_list[i][-1][0] and session_list[i][0][0] < 6*24*60/10: 59 | f.write(uid+" "+str(round(float(session_list[i][0][0]%(24*60/10))/(60/10),2))+" "+\ 60 | str(round(float(session_list[i][-1][0]%(24*60/10))/(60/10),2))+" "+\ 61 | str(session_list[i][0][0]%(24*60/10))+" "+\ 62 | str(session_list[i][-1][0]%(24*60/10))+" "+\ 63 | str(sum([session[2] for session in session_list[i]])/len(session_list[i]))+" "+\ 64 | str(sum([session[3] for session in session_list[i]])/len(session_list[i]))+"\n") 65 | 66 | def stationary_statistic(): 67 | matrix = [[0 for j in xrange(24*6)] for i in xrange(24*6)] 68 | for line in fileinput.input("../data/stationary.txt"): 69 | matrix[int(line.strip().split(" ")[3])][int(line.strip().split(" ")[4])] += 1 70 | fileinput.close() 71 | (X, Y), C = meshgrid(np.arange(24*6), np.arange(24*6)), np.array(matrix) 72 | # 时间分布 73 | subplot(1,1,1) 74 | cset = pcolormesh(X, Y, C.T, cmap=cm.get_cmap("OrRd")) 75 | plt.axis([0, 24*6-1, 0, 24*6-1]) 76 | colorbar(cset) 77 | plt.xlabel('Session entering time slot /10min') 78 | plt.ylabel('Session leaving time slot /10min') 79 | # show() 80 | for postfix in ('eps','png'): 81 | savefig('../figure/{0}/01.{0}'.format(postfix)) 82 | 83 | matrix1, matrix2 = [[0 for j in xrange(150)] for i in xrange(225)], [[0 for j in xrange(150)] for i in xrange(225)] 84 | for line in fileinput.input("../data/stationary.txt"): 85 | ts, tf, gx, gy = int(line.strip().split(" ")[3]), int(line.strip().split(" ")[4]), int(line.strip().split(" ")[5]), int(line.strip().split(" ")[6]) 86 | d1, d2 = ((ts-50)**2+(tf-110)**2)**(1.0/2), ((ts-110)**2+(tf-50)**2)**(1.0/2) 87 | if d1 <= d2: 88 | matrix1[gx][gy] += 1 89 | else: 90 | matrix2[gx][gy] += 1 91 | fileinput.close() 92 | (X, Y), C1, C2 = meshgrid(np.arange(100), np.arange(100)), np.array(matrix1)[20:120,20:120], np.array(matrix2)[20:120,20:120] 93 | # 空间分布 94 | plt.figure(figsize=(12,5)) 95 | plt.subplots_adjust(left=0.05,right=1.00) 96 | subplot(1,2,1) 97 | cset1 = pcolormesh(X, Y, C1.T, cmap=cm.get_cmap("OrRd")) 98 | plt.axis([0, 100-1, 0, 100-1]) 99 | colorbar(cset1) 100 | 
plt.xlabel('Longitude grid index /200m') 101 | plt.ylabel('Latitude grid index /200m') 102 | plt.title('Diurnal') 103 | subplot(1,2,2) 104 | cset2 = pcolormesh(X, Y, C2.T, cmap=cm.get_cmap("OrRd")) 105 | plt.axis([0, 100-1, 0, 100-1]) 106 | colorbar(cset2) 107 | plt.title('Nocturnal') 108 | # show() 109 | for postfix in ('eps','png'): 110 | savefig('../figure/{0}/02.{0}'.format(postfix)) 111 | 112 | 113 | if __name__ == "__main__": 114 | # trajectory_concat() 115 | # stationary_detection() 116 | stationary_statistic() 117 | 118 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 群体驻留时空模式挖掘 2 | ============= 3 | 4 | 本项目所采用的数据采集于杭州移动3G网络。针对城市群体人口轨迹的挖掘,已有工作常聚焦于研究移动行为,例如通勤行为等,固然,城市通勤行为能够直接反应出城市动态特性,与城市区域功能、人类行为模式等密切相关,但另一方面,人口驻留行为,例如不同区域在单日或周内不同时间段内驻留的人口密度、停留时长、驻留时间段的起止时刻等都是研究城市区域功能以及人类行为模式的良好特征。城市大尺度下驻留模式研究一方面受限于数据源的限制,为了获取用户的驻留行为,一般需要获取用户完整的移动轨迹,常用的调查问卷、手机通话、车载GPS以及社交网络签到数据不能很好的满足需求,同时,庞大的城市居民数量对数据的存储和分析也提出了更好的要求。本项目针对群体驻留时空模式进行挖掘,首先从群体轨迹中提取出驻留片段,之后基于层级贝叶斯模型使用无监督聚类的方法自动发现城市人口的驻留模式,层级贝叶斯模型相对于已有的时空聚类方法,包括主成分分析、隐含主题模型等均具有一定优势。 5 | 6 | 数据采集与数据集 7 | ---- 8 | 9 | 本项目所采用的数据集包含一周内连续五天工作日的移动用户上网基站定位数据,采用用户识别码(IMSI)来区分不同用户,并将基站的位置区编码(LAC)联合小区标识(CI)同基站位置数据进行关联转换成为经纬度坐标,结合HTTP请求对应的时间戳即得到用户轨迹。经统计,数据集基本情况如表所示。 10 | 11 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/t1.png) 12 | 13 | 如图是群体驻留时空模式挖掘系统框架。主要包含数据准备、数据挖掘与分析等过程,数据准备包括移动网络日志清洗、基站经纬度映射、轨迹点提取、移动轨迹提取,数据挖掘与分析包括从移动轨迹中抽取驻留片段,并对驻留片段起止时刻进行估计,以及对驻留片段进行时空聚类,基于对时空模式的分析,能够进一步对功能区域进行推测与识别,或对用户轨迹进行语义标注,理解用户的出行目的。 14 | 15 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/system.png) 16 | 17 | 驻留轨迹段提取 18 | ---- 19 | 20 | 驻留轨迹段的提取分为两个步骤,首先抽取出用户停留的时间和地点(简称为驻留轨迹段提取),之后对用户的到达和离开时刻进行估计。驻留轨迹段的提取方法如算法所示,输入用户轨迹点,输出驻留片段,基于预设的时间间隔以及空间间隔阈值timeThres和distThres,按照时间序列依次计算出每个轨迹点与初始轨迹点之间的时间间隔和距离,直到首次距离超过空间间隔阈值distThres,此时如果时间间隔超过时间间隔阈值timeThres,则检测到一次驻留行为。 21 | 22 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/alg1.png) 23 | 24 | 在检测出驻留行为后,需要对驻留的起止时刻te和tl进行估计,减小时空模式挖掘结果的误差。本项目基于用户固有模式、区域固有模式、全局模式对起止时刻进行估计,一般而言,用户根据自身生活规律,例如固定的上、下班时间,位置迁移具有一定模式,单天观测时,用户轨迹点具有稀疏性,但针对不同天,随着用户上网需求变化,通常可以检测到较为连续的时间段内的用户位置的变化情况,因此,基于多天数据有助于对用户位置变化的时刻进行估计。同样,不同区域也具有其固有模式,例如同一公司员工上、下班时刻趋于一致。在更大的尺度上,即城市整体也具有较为固定的通勤模式,例如早、晚高峰。因此,本项目基于三种模式对用户驻留轨迹段的起止时刻采用加权最小二乘法进行估计。 25 | 26 | 如图是用户原始驻留轨迹段的示例与统计。如左图所示,是以10分钟为时间粒度的部分用户的轨迹段抽取结果,橙色和蓝色部分分别对应白天和夜间,由此可见,大多数用户都存在其固有模式,不同用户的状态转移也存在一定的全局相似性。如右图所示,是抽取出的驻留轨迹段之间时间间隔对应的累计概率分布函数,由此可见,大多数驻留轨迹段之间时间间隔较长,因此需要进行起止时刻估计。 27 | 28 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/01.png) 29 | 30 | 为了验证方法的有效性,共对比了两种基本方法,第一种是边值估计法,在提取出驻留轨迹后不做任何处理,第二种是中值估计,即假设用户离开地点A和到达地点B的时刻均为(t1+t2)/2,其中,t1、t2分别是最后一次观测到用户在地点A以及首次观测到用户在地点B的时刻。用于实验验证的数据来源于状态转移过程观测时间间隔较短,能够进行准确推测的部分数据。实验结果显示,本项目所提出的基于联合概率的估计方法具有最小的估计误差。 31 | 32 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/t2.png) 33 | 34 | 驻留行为时空聚类 35 | ---- 36 | 37 | 基于所提取出的群体驻留轨迹段,可以采用基于无监督的时空聚类的方法,自动发现城市人口的驻留行为模式。首先介绍本项目所提出的基于层级贝叶斯时空联合聚类模型的生成过程。基于之前提取出的群体驻留轨迹段,假设数据集对应的群体数量为N,每个用户包含的驻留轨迹片段数量为P,每个驻留轨迹段对应的隐含状态为s,不同的隐含状态可能携带了不同的语义信息,例如“工作”、“居家”、“娱乐”、“购物”等,每个驻留轨迹段的观测变量包括,驻留轨迹段的起止时刻te、tl以及空间位置l,由于空间位置与城市功能区域属性息息相关,因此临近的空间位置可能表达了同样的区域功能语义,反映了相似的用户出行的目的,因此可以假设实际观测到的空间位置l由该功能区域r的分布采样而来。 38 | 39 | 如图是本节模型的图表示(Graphical 
representation)。如图所示,深色节点表示观测变量或先验变量,浅色节点表示隐含变量。观测变量包括驻留轨迹段的起止时刻te、tl以及空间位置l,隐含变量包括轨迹段所对应的隐含状态s,空间位置l对应的功能区域r,并假设驻留轨迹段的起止时刻te、tl服从联合高斯分布,同时假设空间位置l服从混合高斯分布,混合高斯分布由多个高斯分布叠加而成,分量的选择服从多项分布,功能区域r可以视作分量的选择。 40 | 41 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/m1.png) 42 | 43 | 模型的生成过程如算法所示。 44 | 45 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/alg2.png) 46 | 47 | 在模型推理过程中采用折叠吉布斯采样(Collapsed Gibbs sampling)基于迭代过程对模型参数进行优化。吉布斯采样是一种马尔可夫链蒙特卡罗(Markov chain Monte Carlo/MCMC)算法,在基于指定多元概率分布直接进行采样较为困难时,可以通过采用吉布斯采样获得观测序列进行近似,经常用于贝叶斯推断(Bayesian inference)。折叠吉布斯采样在吉布斯采样的基础上,通过积分避开了实际待估计的参数,转而对隐含变量和观测变量进行采样,并通过积分在统计观测变量的取值频次后对实际待估计的参数进行估计。针对本项目模型中所含有的联合高斯分布和混合高斯分布,采用了折叠吉布斯采样和最大期望算法(Expectation Maximization Algorithm/EM)相互结合的方法对模型进行推理。最大期望算法是一种概率模型,能够通过最大似然估计或者最大后验概率估计对模型的参数进行优化,其优势在于能够对无法观测的隐藏变量(Latent variable)进行建模。最大期望算法基于迭代过程进行计算,每轮迭代主要包含两个步骤,即E步和M步,交替进行计算。在E步中,基于对隐藏变量上一轮迭代得到的估计值,计算出最大似然估计值。在M步中,通过求导数或偏导数的方法,求得最大化似然估计值时模型的参数。 48 | 49 | 模型的推断过程如算法所示。 50 | 51 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/alg3.png) 52 | 53 | 如图是针对提取出的驻留轨迹段(已经过起止时刻估计)的起止时刻的联合分布的统计结果,概率分布主要位于三个区域,即图的对角线以及关于对角线对称的两个区域,靠近对角线的左上角区域对应了较短的状态转移时间,囊括了各种情况下的短暂停留行为,对角线对称的两个区域所对应的起止时间趋近于上午9点至下午6点以及晚上7点至次日8点,分别对应了普遍的上班状态和居家状态,并且居家时间的开始时间更为分散,体现了不同群体行为的差异性。 54 | 55 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/02.png) 56 | 57 | 如图针对上图对角线对称的两个区域,统计出了对应的空间分布。如图所示,上班状态(Diurnal)和居家状态(Nocturnal)对应的驻留轨迹段的空间分布差异明显,上班状态对应的空间区域更倾向于城市的中心区域,而居家状态对应的空间区域在城市范围内较为分散。 58 | 59 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/03.png) 60 | 61 | 为了说明层级贝叶斯时空联合聚类模型的优越性,共对比了另外两种模型,其中一种是层级贝叶斯时空次序聚类模型,另一种是高斯混合时空次序聚类模型。高斯混合时空次序聚类模型首先基于驻留轨迹段的起止时刻联合分布进行聚类,之后针对时间维度聚类结果得到的每个类,再基于其空间分布进行聚类。实验结果显示,层级贝叶斯时空联合聚类模型优于用于对比的另外两种模型,层级贝叶斯时空联合聚类在使用更少参数的同时达到了更小的负对数似然,同时时间和空间分布的重构准确率也均较高,这是因为该模型对驻留轨迹段的时间和空间分布同时进行优化,而另外两种模型均按照时间、空间的次序进行优化,在时间维度上虽然达到了最优,但时间聚类的结果限制了在空间维度上所能进行优化的极限。 62 | 63 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/t3.png) 64 | 65 | 如图是基于层级贝叶斯时空联合聚类模型的聚类结果。其中,第一行对应的是模式在时间上的概率分布,第二行对应的是模式在空间上的概率分布,一共有五列,每一列都对应了一种不同的行为模式,时间模式子图的横、纵坐标分别为驻留轨迹段的起止时刻,空间模式子图的横、纵坐标分别为根据经纬度坐标转换得到的地理空间网格坐标。由图可见,城市人口行为模式主要可以分为五种模式,分别对应了不同的语义。由图从左至右的五种模式中,模式1的起止时刻分布在时间模式子图的对角线附近,空间位置分布在城市区域中心,可能对应了各种情况下的短暂停留;模式2及模式3的起止时刻分布在时间模式子图对角线的对称区域,空间分布呈现出互补的特性,一者分布在城市中心区域,一者在城市区域中分布较为均匀,分别可推测带有“白天工作”以及“夜间在家”的语义;模式3的起止时刻分布在时间模式子图对角线的右上端处,对应时间为晚上7点至11点,空间分布偏向于城市中心特定区域,可推测带有“夜间休闲娱乐”的语义;模式5对应的行为发生概率最少,起止时刻分布在时间模式子图的对角线的右上端处,但与模式4不同,模式5的起止时刻近似相同,空间分布较为弥散,可以推测为用户位置在天内固定,这可能是由于用户工作地点与居住地点非常临近。如图是在预设挖掘出五种模式的基础上计算得到的结果,如果进一步增加待挖掘出的模式的数量,还可以进一步获得更加细粒度的行为模式。 66 | 67 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/04.png) 68 | -------------------------------------------------------------------------------- /code/examples_bayes/bayes_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import fileinput 4 | import pymc as pm 5 | import numpy as np 6 | 7 | data_tp, data_sp = [], [] 8 | for line in fileinput.input("../../data/stationary.txt"): 9 | part = line.strip().split("\t") 10 | uid, items = part[0], part[1:] 11 | if uid == "460029901722027": 12 | for item in items: 13 | tm, poi = [int(i) for i in item.split(" ")[0:2]], [int(i) for i in item.split(" ")[4].split(",")] 14 | data_tp.append(tm) 15 | data_sp.append(poi) 16 | fileinput.close() 17 | data_tp, data_sp = 
np.array(data_tp), np.array(data_sp) 18 | print data_tp 19 | print data_sp 20 | 21 | prior = pm.Dirichlet('prior', np.array([50.0,50.0])) 22 | state = pm.Container([pm.Categorical('state_%i' % i, p=prior) for i in range(len(data_tp))]) 23 | stime = pm.Container([pm.DiscreteUniform('stime_%i' % i, lower=0, upper=23) for i in range(2)]) 24 | ftime = pm.Container([pm.DiscreteUniform('ftime_%i' % i, lower=0, upper=23) for i in range(2)]) 25 | @pm.deterministic(plot=False) 26 | def mu_s(state=state, stime=stime): 27 | return np.array([stime[0] if state[i] == 0 else stime[1] for i in xrange(len(data_tp))]) 28 | @pm.deterministic(plot=False) 29 | def mu_f(state=state, stime=ftime): 30 | return np.array([ftime[0] if state[i] == 0 else ftime[1] for i in xrange(len(data_tp))]) 31 | obs_s = pm.Normal('obs_s', mu=mu_s, tau=0.1, value=data_tp[:,0], observed=True) 32 | obs_f = pm.Normal('obs_f', mu=mu_f, tau=0.1, value=data_tp[:,1], observed=True) 33 | model = pm.Model([prior, state, stime, ftime, obs_s, obs_f]) 34 | mcmc = pm.MCMC(model) 35 | mcmc.sample(100) 36 | print state.value 37 | print stime[0].value, ftime[0].value 38 | print stime[1].value, ftime[1].value 39 | 40 | # prior = pm.Dirichlet('prior', np.array([50.0,50.0])) 41 | # state = pm.Container([pm.Categorical('state_%i' % i, p=prior) for i in range(len(data_sp))]) 42 | # poi_1 = pm.Container([pm.DiscreteUniform('poi_1_%i' % i, lower=0, upper=100) for i in range(2)]) 43 | # poi_2 = pm.Container([pm.DiscreteUniform('poi_2_%i' % i, lower=0, upper=100) for i in range(2)]) 44 | # poi_3 = pm.Container([pm.DiscreteUniform('poi_3_%i' % i, lower=0, upper=100) for i in range(2)]) 45 | # poi_4 = pm.Container([pm.DiscreteUniform('poi_4_%i' % i, lower=0, upper=100) for i in range(2)]) 46 | # poi_5 = pm.Container([pm.DiscreteUniform('poi_5_%i' % i, lower=0, upper=100) for i in range(2)]) 47 | # @pm.deterministic(plot=False) 48 | # def mu_1(state=state, poi_1=poi_1): 49 | # return np.array([poi_1[0] if state[i] == 0 else poi_1[1] for i in xrange(len(data_sp))]) 50 | # @pm.deterministic(plot=False) 51 | # def mu_2(state=state, poi_2=poi_2): 52 | # return np.array([poi_2[0] if state[i] == 0 else poi_2[1] for i in xrange(len(data_sp))]) 53 | # @pm.deterministic(plot=False) 54 | # def mu_3(state=state, poi_3=poi_3): 55 | # return np.array([poi_3[0] if state[i] == 0 else poi_3[1] for i in xrange(len(data_sp))]) 56 | # @pm.deterministic(plot=False) 57 | # def mu_4(state=state, poi_4=poi_4): 58 | # return np.array([poi_4[0] if state[i] == 0 else poi_4[1] for i in xrange(len(data_sp))]) 59 | # @pm.deterministic(plot=False) 60 | # def mu_5(state=state, poi_5=poi_5): 61 | # return np.array([poi_5[0] if state[i] == 0 else poi_5[1] for i in xrange(len(data_sp))]) 62 | # obs_1 = pm.Normal('obs_1', mu=mu_1, tau=0.1, value=data_sp[:,0], observed=True) 63 | # obs_2 = pm.Normal('obs_2', mu=mu_2, tau=0.1, value=data_sp[:,1], observed=True) 64 | # obs_3 = pm.Normal('obs_3', mu=mu_3, tau=0.1, value=data_sp[:,2], observed=True) 65 | # obs_4 = pm.Normal('obs_4', mu=mu_4, tau=0.1, value=data_sp[:,3], observed=True) 66 | # obs_5 = pm.Normal('obs_5', mu=mu_5, tau=0.1, value=data_sp[:,4], observed=True) 67 | # model = pm.Model([prior, state, poi_1, poi_2, poi_3, poi_4, poi_5, obs_1, obs_2, obs_3, obs_4, obs_5]) 68 | # mcmc = pm.MCMC(model) 69 | # mcmc.sample(100) 70 | # print state.value 71 | # print poi_1[0].value, poi_2[0].value, poi_3[0].value, poi_4[0].value, poi_5[0].value 72 | # print poi_1[1].value, poi_2[1].value, poi_3[1].value, poi_4[1].value, poi_5[1].value 73 | 74 | 
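
bayes_model.py is written against the legacy PyMC2 API (pm.Container, @pm.deterministic, pm.MCMC). As a hedged sketch only — it assumes PyMC3 rather than the PyMC2 used here, with synthetic start/end hours standing in for ../../data/stationary.txt — the same idea of a shared two-valued latent state explaining both observed times of a stay segment could be expressed as:

```python
# Hedged sketch (not in the repository, assumes pymc3): a shared latent state
# per stay segment explains both its start hour and its end hour, mirroring
# the combined PyMC2 model defined below.
import numpy as np
import pymc3 as pm

rng = np.random.RandomState(0)
data_s = np.r_[rng.normal(9, 1, 40), rng.normal(19, 1, 40)]   # start hours
data_f = np.r_[rng.normal(18, 1, 40), rng.normal(8, 1, 40)]   # end hours

with pm.Model():
    w = pm.Dirichlet('w', a=np.ones(2))                   # state prior
    mu_s = pm.Uniform('mu_s', lower=0, upper=23, shape=2)
    mu_f = pm.Uniform('mu_f', lower=0, upper=23, shape=2)
    z = pm.Categorical('z', p=w, shape=len(data_s))       # latent state per segment
    pm.Normal('obs_s', mu=mu_s[z], sigma=1.5, observed=data_s)
    pm.Normal('obs_f', mu=mu_f[z], sigma=1.5, observed=data_f)
    trace = pm.sample(1000, tune=1000, chains=2, progressbar=False)

# Component labels may come out swapped (label switching); this is a sketch.
print(trace['mu_s'].mean(axis=0), trace['mu_f'].mean(axis=0))
```

In practice the discrete state is usually marginalised out (for example with pm.NormalMixture) for more efficient sampling; the explicit z is kept here only to mirror the structure of the PyMC2 model that follows.
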
prior = pm.Dirichlet('prior', np.array([50.0,50.0])) 75 | state = pm.Container([pm.Categorical('state_%i' % i, p=prior) for i in range(len(data_tp))]) 76 | stime = pm.Container([pm.DiscreteUniform('stime_%i' % i, lower=0, upper=23) for i in range(2)]) 77 | ftime = pm.Container([pm.DiscreteUniform('ftime_%i' % i, lower=0, upper=23) for i in range(2)]) 78 | poi_1 = pm.Container([pm.DiscreteUniform('poi_1_%i' % i, lower=0, upper=100) for i in range(2)]) 79 | poi_2 = pm.Container([pm.DiscreteUniform('poi_2_%i' % i, lower=0, upper=100) for i in range(2)]) 80 | poi_3 = pm.Container([pm.DiscreteUniform('poi_3_%i' % i, lower=0, upper=100) for i in range(2)]) 81 | poi_4 = pm.Container([pm.DiscreteUniform('poi_4_%i' % i, lower=0, upper=100) for i in range(2)]) 82 | poi_5 = pm.Container([pm.DiscreteUniform('poi_5_%i' % i, lower=0, upper=100) for i in range(2)]) 83 | @pm.deterministic(plot=False) 84 | def mu_s(state=state, stime=stime): 85 | return np.array([stime[0] if state[i] == 0 else stime[1] for i in xrange(len(data_tp))]) 86 | @pm.deterministic(plot=False) 87 | def mu_f(state=state, stime=ftime): 88 | return np.array([ftime[0] if state[i] == 0 else ftime[1] for i in xrange(len(data_tp))]) 89 | @pm.deterministic(plot=False) 90 | def mu_1(state=state, poi_1=poi_1): 91 | return np.array([poi_1[0] if state[i] == 0 else poi_1[1] for i in xrange(len(data_sp))]) 92 | @pm.deterministic(plot=False) 93 | def mu_2(state=state, poi_2=poi_2): 94 | return np.array([poi_2[0] if state[i] == 0 else poi_2[1] for i in xrange(len(data_sp))]) 95 | @pm.deterministic(plot=False) 96 | def mu_3(state=state, poi_3=poi_3): 97 | return np.array([poi_3[0] if state[i] == 0 else poi_3[1] for i in xrange(len(data_sp))]) 98 | @pm.deterministic(plot=False) 99 | def mu_4(state=state, poi_4=poi_4): 100 | return np.array([poi_4[0] if state[i] == 0 else poi_4[1] for i in xrange(len(data_sp))]) 101 | @pm.deterministic(plot=False) 102 | def mu_5(state=state, poi_5=poi_5): 103 | return np.array([poi_5[0] if state[i] == 0 else poi_5[1] for i in xrange(len(data_sp))]) 104 | obs_s = pm.Normal('obs_s', mu=mu_s, tau=0.1, value=data_tp[:,0], observed=True) 105 | obs_f = pm.Normal('obs_f', mu=mu_f, tau=0.1, value=data_tp[:,1], observed=True) 106 | obs_1 = pm.Normal('obs_1', mu=mu_1, tau=2, value=data_sp[:,0], observed=True) 107 | obs_2 = pm.Normal('obs_2', mu=mu_2, tau=2, value=data_sp[:,1], observed=True) 108 | obs_3 = pm.Normal('obs_3', mu=mu_3, tau=2, value=data_sp[:,2], observed=True) 109 | obs_4 = pm.Normal('obs_4', mu=mu_4, tau=2, value=data_sp[:,3], observed=True) 110 | obs_5 = pm.Normal('obs_5', mu=mu_5, tau=1, value=data_sp[:,4], observed=True) 111 | model = pm.Model([prior, state, stime, ftime, poi_1, poi_2, poi_3, poi_4, poi_5, obs_s, obs_f, obs_1, obs_2, obs_3, obs_4, obs_5]) 112 | mcmc = pm.MCMC(model) 113 | mcmc.sample(100) 114 | print "state:", state.value 115 | print "stime_0:", stime[0].value, ftime[0].value 116 | print "stime_1:", stime[1].value, ftime[1].value 117 | print "poi_0:", poi_1[0].value, poi_2[0].value, poi_3[0].value, poi_4[0].value, poi_5[0].value 118 | print "poi_1:", poi_1[1].value, poi_2[1].value, poi_3[1].value, poi_4[1].value, poi_5[1].value 119 | -------------------------------------------------------------------------------- /code/model_baseline2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import glob 4 | import math 5 | import json 6 | import random 7 | import fileinput 8 | 9 | # 时间粒度为10分钟 10 | # 空间粒度为200米 11 | # 20-24为工作日 12 | 
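
Both model_baseline1.py above and model_baseline2.py below score points with the hand-rolled bivariate density gauss(y, mu, sigma). Compared with the exact bivariate normal density, that helper drops the 1/(2*pi) normalising constant and appears to pair sigma[0][0] with (y[0]-mu[0])**2 and sigma[1][1] with (y[1]-mu[1])**2, whereas the inverse-covariance quadratic form pairs each squared deviation with the opposite diagonal entry. A small check against SciPy (not part of the repository) makes the difference visible:

```python
# Hedged check: compare the repository's gauss() helper with the exact
# bivariate normal density. For Sigma = [[a, b], [c, d]] the correct
# quadratic form is (d*dx**2 - (b+c)*dx*dy + a*dy**2) / det(Sigma), and the
# density carries an extra 1/(2*pi) factor.
import math
from scipy.stats import multivariate_normal

def gauss_repo(y, mu, sigma):   # as defined in model_baseline*.py
    det = sigma[0][0]*sigma[1][1] - sigma[0][1]*sigma[1][0]
    q = (sigma[0][0]*(y[0]-mu[0])**2
         - (y[0]-mu[0])*(y[1]-mu[1])*(sigma[0][1]+sigma[1][0])
         + sigma[1][1]*(y[1]-mu[1])**2)
    return 1./math.sqrt(det) * math.exp(-0.5*q/det)

y, mu, sigma = [1.0, 2.0], [0.0, 0.0], [[4.0, 0.5], [0.5, 1.0]]
exact = multivariate_normal(mean=mu, cov=sigma).pdf(y)
print(gauss_repo(y, mu, sigma))   # repository helper (unnormalised)
print(2*math.pi*exact)            # exact density rescaled by 2*pi
# The two agree in general only when sigma[0][0] == sigma[1][1]; for
# anisotropic covariances the reported negative log-likelihoods are relative
# scores rather than true densities.
```
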
13 | def euclidean(p1, p2): 14 | return ((p1[0]-p2[0])**2+(p1[1]-p2[1])**2)**0.5 15 | 16 | def gauss(y, mu, sigma): 17 | # print "gauss computation:", y, mu, sigma 18 | return 1./math.sqrt((sigma[0][0]*sigma[1][1]-sigma[0][1]*sigma[1][0]))*math.exp(-0.5/(sigma[0][0]*sigma[1][1]-sigma[0][1]*sigma[1][0])*(sigma[0][0]*(y[0]-mu[0])**2-(y[0]-mu[0])*(y[1]-mu[1])*(sigma[0][1]+sigma[1][0])+sigma[1][1]*(y[1]-mu[1])**2)) 19 | # print gauss([1,1],[0,0],[[1,0],[0,1]]) 20 | 21 | K, R, alpha, beta, iter_num, data = 5, 20, 0.2, 0.2, 20, [] 22 | 23 | def run_baseline2(): 24 | # 数据准备 25 | mu_t = [[48,108], [108,48], [60,128], [128,60], [72,84]] 26 | sigma_t = [[[36,0],[36,0]], [[36,0],[36,0]], [[36,0],[36,0]], [[36,0],[36,0]], [[36,0],[36,0]]] 27 | assert K == len(mu_t) and K == len(sigma_t) 28 | print "Total Cluster: {0}".format(K) 29 | 30 | for line in fileinput.input("../data/stationary.txt"): 31 | st, ft, gx, gy = int(line.strip().split(" ")[3]), int(line.strip().split(" ")[4]), int(line.strip().split(" ")[5]), int(line.strip().split(" ")[6]) 32 | ds = [((st-mu_t[k][0])**2+(ft-mu_t[k][1])**2)**0.5 for k in xrange(K)] 33 | data.append([st,ft,gx,gy,ds.index(min(ds)),-1]) 34 | fileinput.close() 35 | 36 | # from sklearn import mixture 37 | # est = mixture.GMM(n_components=R, covariacne_type="full") 38 | # est.fit([session[2:4] for session in data]) 39 | # print [[int(i) for i in list(means)] for means in est.means_] 40 | rs = [[165, 87], [78, 68], [77, 59], [82, 98], [46, 94], [86, 68], [37, 65], [69, 57], [77, 78], [92, 25], [44, 14], [71, 84], [66, 79], [61, 70], [71, 28], [14, 128], [100, 75], [75, 63], [85, 34], [54, 76]] 41 | assert R == len(rs) 42 | print "Total Region: {0}".format(R) 43 | 44 | # 初值选取 45 | for sl in data: 46 | rd = [euclidean(sl[2:4], rs[r]) for r in xrange(R)] 47 | sl[-1] = rd.index(min(rd)) 48 | 49 | # 初始化 50 | L = len(data) 51 | len_k = [float(len(filter(lambda x:x[4]==k, data))) for k in xrange(K)] 52 | len_r = [float(len(filter(lambda x:x[5]==r, data))) for r in xrange(R)] 53 | len_k_r = [[float(len(filter(lambda x:x[4]==k and x[5]==r, data))) for r in xrange(R)] for k in xrange(K)] 54 | 55 | mu1_t = [[float(sum(map(lambda x:x[0],filter(lambda x:x[4]==k, data)))), float(sum(map(lambda x:x[1],filter(lambda x:x[4]==k, data))))] for k in xrange(K)] 56 | mu2_t = [[float(sum(map(lambda x:x[0]**2,filter(lambda x:x[4]==k, data)))), float(sum(map(lambda x:x[1]**2,filter(lambda x:x[4]==k, data)))), float(sum(map(lambda x:x[0]*x[1],filter(lambda x:x[4]==k, data))))] for k in xrange(K)] 57 | mu_t = [[mu1_t[k][0]/len_k[k], mu1_t[k][1]/len_k[k]] for k in xrange(K)] 58 | sigma_t = [[[mu2_t[k][0]/len_k[k]-mu_t[k][0]**2,(mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1]],[(mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1],mu2_t[k][1]/len_k[k]-mu_t[k][1]**2]] for k in xrange(K)] 59 | 60 | mu1_r = [[float(sum(map(lambda x:x[2],filter(lambda x:x[5]==r, data)))), float(sum(map(lambda x:x[3],filter(lambda x:x[5]==r, data))))] for r in xrange(R)] 61 | mu2_r = [[float(sum(map(lambda x:x[2]**2,filter(lambda x:x[5]==r, data)))), float(sum(map(lambda x:x[3]**2,filter(lambda x:x[5]==r, data)))), float(sum(map(lambda x:x[2]*x[3],filter(lambda x:x[5]==r, data))))] for r in xrange(R)] 62 | mu_r = [[mu1_r[r][0]/len_r[r], mu1_r[r][1]/len_r[r]] for r in xrange(R)] 63 | sigma_r = 
[[[mu2_r[r][0]/len_r[r]-mu_r[r][0]**2,(mu2_r[r][2]-mu_r[r][0]*mu1_r[r][1]-mu_r[r][1]*mu1_r[r][0])/len_r[r]+mu_r[r][0]*mu_r[r][1]],[(mu2_r[r][2]-mu_r[r][0]*mu1_r[r][1]-mu_r[r][1]*mu1_r[r][0])/len_r[r]+mu_r[r][0]*mu_r[r][1],mu2_r[r][1]/len_r[r]-mu_r[r][1]**2]] for r in xrange(R)] 64 | 65 | # 迭代计算 66 | for iter_curr in xrange(iter_num): 67 | likelihood = 0 68 | for i in xrange(L): 69 | item, co = data[i], data[i][4] 70 | # sample R 71 | prob = [1.*len_k_r[co][r]/len_k[co]*gauss(item[2:4],mu_r[r],sigma_r[r]) for r in xrange(R)] 72 | ro, rn = item[5], prob.index(max(prob)) 73 | if rn != ro: 74 | data[i][5] = rn 75 | len_r[ro] -= 1; len_r[rn] += 1 76 | len_k_r[co][ro] -= 1; len_k_r[co][rn] += 1 77 | mu1_r[ro][0] -= item[2]; mu1_r[ro][1] -= item[3] 78 | mu1_r[rn][0] += item[2]; mu1_r[rn][1] += item[3] 79 | mu2_r[ro][0] -= item[2]**2; mu2_r[ro][1] -= item[3]**2; mu2_r[ro][2] -= item[2]*item[3] 80 | mu2_r[rn][0] += item[2]**2; mu2_r[rn][1] += item[3]**2; mu2_r[rn][2] += item[2]*item[3] 81 | mu_r = [[mu1_r[r][0]/len_r[r], mu1_r[r][1]/len_r[r]] for r in xrange(R)] 82 | sigma_r = [[[mu2_r[r][0]/len_r[r]-mu_r[r][0]**2,(mu2_r[r][2]-mu_r[r][0]*mu1_r[r][1]-mu_r[r][1]*mu1_r[r][0])/len_r[r]+mu_r[r][0]*mu_r[r][1]],[(mu2_r[r][2]-mu_r[r][0]*mu1_r[r][1]-mu_r[r][1]*mu1_r[r][0])/len_r[r]+mu_r[r][0]*mu_r[r][1],mu2_r[r][1]/len_r[r]-mu_r[r][1]**2]] for r in xrange(R)] 83 | # sample K 84 | prob = [1.*len_k_r[k][rn]/L*gauss(item[:2],mu_t[k],sigma_t[k])*gauss(item[2:4],mu_r[rn],sigma_r[rn]) for k in xrange(K)] 85 | cn = prob.index(max(prob)) 86 | if cn != co: 87 | data[i][4] = cn 88 | len_k[co] -= 1; len_k[cn] += 1 89 | len_k_r[co][rn] -= 1; len_k_r[cn][rn] += 1 90 | mu1_t[co][0] -= item[0]; mu1_t[co][1] -= item[1] 91 | mu1_t[cn][0] += item[0]; mu1_t[cn][1] += item[1] 92 | mu2_t[co][0] -= item[0]**2; mu2_t[co][1] -= item[1]**2; mu2_t[co][2] -= item[0]*item[1] 93 | mu2_t[cn][0] += item[0]**2; mu2_t[cn][1] += item[1]**2; mu2_t[cn][2] += item[0]*item[1] 94 | mu_t = [[mu1_t[k][0]/len_k[k], mu1_t[k][1]/len_k[k]] for k in xrange(K)] 95 | sigma_t = [[[mu2_t[k][0]/len_k[k]-mu_t[k][0]**2,(mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1]],[(mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1],mu2_t[k][1]/len_k[k]-mu_t[k][1]**2]] for k in xrange(K)] 96 | 97 | prob_max = 1.*len_k[cn]/sum(len_k)*len_k_r[cn][rn]/sum(len_k_r[cn])*gauss(item[:2],mu_t[cn],sigma_t[cn])*gauss(item[2:4],mu_r[rn],sigma_r[rn]) 98 | likelihood += -math.log10(prob_max) 99 | 100 | print iter_curr, likelihood 101 | 102 | with open('model_save/baseline2.txt','w') as f: 103 | f.write(json.dumps({"len_k":len_k, 104 | "len_k_r":len_k_r, 105 | "mu_t":mu_t, 106 | "sigma_t":sigma_t, 107 | "mu_r":mu_r, 108 | "sigma_r":sigma_r})) 109 | 110 | def compute_error(): 111 | import numpy as np 112 | 113 | param = json.loads(open('model_save/baseline2.txt','r').read()) 114 | len_k, len_k_r = param['len_k'], param['len_k_r'] 115 | mu_t, sigma_t = param['mu_t'], param['sigma_t'] 116 | mu_r, sigma_r = param['mu_r'], param['sigma_r'] 117 | 118 | # 时间分布 119 | matrix1 = [[0 for j in xrange(24*6)] for i in xrange(24*6)] 120 | for line in fileinput.input("../data/stationary.txt"): 121 | st, ft = int(line.strip().split(" ")[3]), int(line.strip().split(" ")[4]) 122 | matrix1[st][ft] += 1 123 | fileinput.close() 124 | matrix2 = [[0 for j in xrange(24*6)] for i in xrange(24*6)] 125 | for k in xrange(K): 126 | for st in xrange(24*6): 127 | for ft in xrange(24*6): 128 | matrix2[st][ft] += 
1.*(len_k[k]/sum(len_k))*gauss([st,ft],mu_t[k],sigma_t[k]) 129 | matrix1 = 1.*np.array(matrix1)/np.array(matrix1).sum() 130 | matrix2 = 1.*np.array(matrix2)/np.array(matrix2).sum() 131 | print "Temporal reconstruction accuracy:", 1-(abs(matrix1-matrix2)[40:60,100:140].sum()+abs(matrix1-matrix2)[100:140,40:60].sum()) 132 | 133 | # 空间分布 134 | matrix1 = [[0 for j in xrange(150)] for i in xrange(225)] 135 | for line in fileinput.input("../data/stationary.txt"): 136 | gx, gy = int(line.strip().split(" ")[5]), int(line.strip().split(" ")[6]) 137 | matrix1[gx][gy] += 1 138 | fileinput.close() 139 | matrix2 = [[0 for j in xrange(150)] for i in xrange(225)] 140 | for k in xrange(K): 141 | for r in xrange(R): 142 | for gx in xrange(225): 143 | for gy in xrange(150): 144 | matrix2[gx][gy] += 1.*(len_k[k]/sum(len_k))*(len_k_r[k][r]/sum(len_k_r[k]))*gauss([gx,gy],mu_r[r],sigma_r[r]) 145 | matrix1 = 1.*np.array(matrix1)/np.array(matrix1).sum() 146 | matrix2 = 1.*np.array(matrix2)/np.array(matrix2).sum() 147 | print "Spatial reconstruction accuracy:", 1-abs(matrix1-matrix2)[50:90,50:90].sum() 148 | 149 | 150 | if __name__ == "__main__": 151 | # run_baseline2() 152 | compute_error() 153 | -------------------------------------------------------------------------------- /code/stationary_segmentation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import gzip 4 | import fileinput 5 | 6 | # 时间粒度为10分钟 7 | # 空间粒度为200米 8 | # 20-24为工作日 9 | 10 | def euclidean(p1, p2): 11 | return 200*((p1[0]-p2[0])**2+(p1[1]-p2[1])**2)**0.5 12 | 13 | # 轨迹段分布 14 | def plot_segmentation_distribution(): 15 | from pylab import * 16 | from scipy import interpolate 17 | from matplotlib.ticker import MultipleLocator, FormatStrFormatter 18 | 19 | fig = plt.figure(figsize=(10,5)) 20 | fig.subplots_adjust(left=0.05,right=0.98) 21 | 22 | users, delta = {}, 2 23 | for line in fileinput.input("../data/stationary.txt"): 24 | uid, _, _, st, ft, _, _ = line.strip().split(" ") # 时间粒度为十分钟 25 | st, ft = int(st), int(ft) 26 | users[uid] = users.get(uid,[]) 27 | day = users[uid][-1][3]/(24*6)+(users[uid][-1][1]>=st) if users[uid] else 0 28 | if len(users[uid]) == 0 or (24*6)*day+st - users[uid][-1][3] >= delta*6: 29 | users[uid].append([st, ft, (24*6)*day+st, (24*6)*(day+(st>ft))+ft, (ft>st)]) 30 | fileinput.close() 31 | 32 | day_total, row_total = 6, 60 33 | matrixs, row = [[0 for h in xrange(day_total*24*6)] for u in xrange(row_total)], -1 34 | for uid, slices in users.iteritems(): 35 | if slices[-1][-2] <= (day_total-1)*24*6 or slices[-1][-2] >= day_total*24*6-1: 36 | continue 37 | row += 1 38 | if row == row_total: 39 | break 40 | for sl in slices: 41 | color = -1 if sl[-1] else 1 42 | for h in xrange(sl[2],sl[3]+1): 43 | matrixs[row][h] = color 44 | 45 | ax = fig.add_subplot(121) 46 | (X, Y) = meshgrid(np.arange(day_total*24*6), np.arange(row_total)) 47 | C = np.array(matrixs) 48 | plt.pcolormesh(X, Y, C, cmap='RdBu', vmin=-2, vmax=2) 49 | plt.xlim(0,day_total*24*6-1) 50 | plt.ylim(0,row_total-1) 51 | xmajorLocator = MultipleLocator(24*6) 52 | xmajorFormatter = FormatStrFormatter('%d') 53 | ax.xaxis.set_major_locator(xmajorLocator) 54 | ax.xaxis.set_major_formatter(xmajorFormatter) 55 | plt.xlabel('Time slot /10min') 56 | plt.ylabel('User') 57 | 58 | users, delta = {}, 2 59 | for line in fileinput.input("../data/stationary.txt"): 60 | uid, _, _, st, ft, _, _ = line.strip().split(" ") # 时间粒度为十分钟 61 | st, ft = int(st), int(ft) 62 | users[uid] = users.get(uid,[]) 63 | day = 
users[uid][-1][3]/(24*6)+(users[uid][-1][1]>=st) if users[uid] else 0 64 | users[uid].append([st, ft, (24*6)*day+st, (24*6)*(day+(st>ft))+ft, (ft>st)]) 65 | fileinput.close() 66 | 67 | distribution = {} 68 | for uid, slices in users.iteritems(): 69 | for i in xrange(1,len(slices)): 70 | interval = (slices[i][2]-slices[i-1][3])/3 71 | distribution[interval] = distribution.get(interval,0)+1 72 | distribution = [distribution.get(t,0) for t in xrange(2*12)] 73 | distribution = [1-1.*sum(distribution[t:])/sum(distribution) for t in xrange(2*12)] 74 | ax1 = fig.add_subplot(122) 75 | tck = interpolate.splrep(range(len(distribution)),distribution,s=0) 76 | xnew = np.arange(0,2*12,0.1) 77 | ynew = interpolate.splev(xnew,tck,der=0) 78 | plt.plot(xnew,ynew,'k-',label="Interval",linewidth=2) 79 | plt.xlim(1,12) 80 | plt.ylim(0,1.) 81 | plt.xlabel('Time slot /30min') 82 | plt.ylabel('CDF') 83 | # handles, labels = ax1.get_legend_handles_labels() 84 | # ax1.legend(handles, labels) 85 | xmajorLocator = MultipleLocator(1) 86 | xmajorFormatter = FormatStrFormatter('%d') 87 | ax1.xaxis.set_major_locator(xmajorLocator) 88 | ax1.xaxis.set_major_formatter(xmajorFormatter) 89 | # show() 90 | for postfix in ('eps','png'): 91 | savefig('../figure/{0}/04.{0}'.format(postfix)) 92 | 93 | # 阈值距离为1000米,时间为1小时,仅筛选出工作日 94 | def stationary_accurate_detection(): 95 | min_distance, min_duration, max_duration, min_session = 1000, 1*60/10, 1*60/10, 10 96 | with open("../data/stationary_accurate.txt", "w") as f: 97 | line_num = 0 98 | for line in gzip.open("../data/trace.txt.gz"): 99 | line_num += 1 100 | print line_num 101 | uid = line.strip().split("\t")[0] 102 | session_list, session_current, slices = [], [], [(int(sl.split(":")[0]), \ 103 | int(sl.split(":")[1]), \ 104 | sum([int(p.split(",")[0]) for p in sl.split(":")[2].split("-")])/len(sl.split(":")[2].split("-")), \ 105 | sum([int(p.split(",")[1]) for p in sl.split(":")[2].split("-")])/len(sl.split(":")[2].split("-"))) \ 106 | for sl in line.strip().split("\t")[1].split("|")] 107 | for sl in slices: 108 | if len(session_current) == 0: 109 | session_current = [sl] 110 | else: 111 | if euclidean(sl[2:],session_current[-1][2:]) >= min_distance: 112 | if session_current[-1][0]-session_current[0][0] >= min_duration and sl[0]-session_current[-1][0] <= max_duration: 113 | session_list.append(session_current) 114 | session_current = [sl] 115 | else: 116 | session_current.append(sl) 117 | if len(session_list) >= min_session: 118 | for i in range(1,len(session_list)-1): 119 | if len(session_list[i]) >= 2 and 1*24*60/10 < session_list[i][-1][0] and session_list[i][0][0] < 6*24*60/10: 120 | f.write(uid+" "+str(round(float(session_list[i][0][0]%(24*60/10))/(60/10),2))+" "+\ 121 | str(round(float(session_list[i][-1][0]%(24*60/10))/(60/10),2))+" "+\ 122 | str(session_list[i][0][0]%(24*60/10))+" "+\ 123 | str(session_list[i][-1][0]%(24*60/10))+" "+\ 124 | str(sum([session[2] for session in session_list[i]])/len(session_list[i]))+" "+\ 125 | str(sum([session[3] for session in session_list[i]])/len(session_list[i]))+"\n") 126 | 127 | def segmentation_detection(function="", method="probability"): 128 | if function == "segment" and not method in ["probability", "median","cut"]: 129 | exit() 130 | 131 | def uniform_prob(N): 132 | prob = [0]*(N) 133 | for i in xrange(N): 134 | for j in xrange(i,N): 135 | if (i+j)%2 == 0: 136 | prob[int(1.0*(i+j)/2)] += 1 137 | else: 138 | prob[int(1.0*(i+j)/2-0.5)] += 0.5 139 | prob[int(1.0*(i+j)/2+0.5)] += 0.5 140 | return [prob[i]/sum(prob) for i in 
xrange(N)] 141 | 142 | import random 143 | global_prob, local_prob, user_prob, valid_set, delta = {}, {}, {}, [], 5*3 144 | for line in fileinput.input("../data/stationary_accurate.txt"): 145 | if function == "plot": 146 | uid, st, ft, _, _, gx, gy = line.strip().split(" ") # 时间粒度为每小时 147 | elif function == "segment": 148 | uid, _, _, st, ft, gx, gy = line.strip().split(" ") # 时间粒度为十分钟 149 | else: 150 | exit() 151 | st, ft, gx, gy = int(float(st)), int(float(ft)), int(gx), int(gy) 152 | global_prob[st] = global_prob.get(st,0)+1 153 | global_prob[ft] = global_prob.get(ft,0)+1 154 | local_prob[(gx, gy)] = local_prob.get((gx, gy),{}) 155 | local_prob[(gx, gy)][st] = local_prob[(gx, gy)].get(st,0)+1 156 | local_prob[(gx, gy)][ft] = local_prob[(gx, gy)].get(ft,0)+1 157 | user_prob[uid] = user_prob.get(uid,{}) 158 | user_prob[uid][st] = user_prob[uid].get(st,0)+1 159 | user_prob[uid][ft] = user_prob[uid].get(ft,0)+1 160 | valid_set.append([uid,st,ft,gx,gy,\ 161 | max(st-random.randint(0,delta),0),\ 162 | min(st+random.randint(0,delta),24*6-1),\ 163 | max(ft-random.randint(0,delta),0),\ 164 | min(ft+random.randint(0,delta),24*6-1)]) 165 | fileinput.close() 166 | 167 | # 概率时间分布(时间粒度为每小时) 168 | if function == "plot": 169 | import matplotlib.pyplot as plt 170 | line, = plt.plot(range(24), [global_prob[h] for h in range(24)], '-', linewidth=2) 171 | show() 172 | for gx in xrange(255): 173 | for gy in xrange(150): 174 | if (gx, gy) in local_prob and len(local_prob[(gx, gy)]) == 24: 175 | line, = plt.plot(range(24), [local_prob[(gx, gy)].get(h,0) for h in xrange(24)], '-', linewidth=2) 176 | show() 177 | 178 | # 切分点预测(时间粒度为十分钟) 179 | if function == "segment": 180 | alpha_global, alpha_local, alpha_user, error = 0.4, 0.4, 0.4, 0.0 181 | for uid,st,ft,gx,gy,stb,ste,ftb,fte in valid_set: 182 | if method == "probability": 183 | probs_global = [global_prob.get(h,0)+(global_prob.get(h-1,0)+global_prob.get(h+1,0))*0.5 184 | for h in range(stb,ste+1)] 185 | probs_local = [local_prob.get((gx,gy),{}).get(h,0)+(local_prob.get((gx,gy),{}).get(h-1,0)+local_prob.get((gx,gy),{}).get(h+1,0))*0.5 186 | for h in range(stb,ste+1)] 187 | probs_user = [user_prob.get(uid,{}).get(h,0)+(user_prob.get(uid,{}).get(h-1,0)+user_prob.get(uid,{}).get(h+1,0))*0.5 188 | for h in range(stb,ste+1)] 189 | probs_global = [(1-alpha_global)*prob/sum(probs_global)+alpha_global/len(probs_global) for prob in probs_global] 190 | probs_local = [(1-alpha_local)*prob/sum(probs_local)+alpha_local/len(probs_local) for prob in probs_local] 191 | probs_user = [(1-alpha_user)*prob/sum(probs_user)+alpha_user/len(probs_user) for prob in probs_user] 192 | probs_uniform = uniform_prob(ste-stb+1) 193 | probs = [probs_uniform[h]+probs_global[h]+probs_local[h]+probs_user[h] for h in range(ste-stb+1)] 194 | error += abs(st-(stb+probs.index(max(probs)))) 195 | elif method == "median": 196 | error += abs(st-round(1.0*(stb+ste)/2,0)) 197 | elif method == "cut": 198 | error += abs(st-round(1.0*ste,0)) 199 | 200 | print "method={0}, MAE={1}".format(method, 10*(error/len(valid_set))) 201 | 202 | 203 | if __name__ == "__main__": 204 | plot_segmentation_distribution() 205 | # stationary_accurate_detection() 206 | # segmentation_detection("plot") 207 | # segmentation_detection("segment","probability") 208 | # segmentation_detection("segment","median") 209 | # segmentation_detection("segment","cut") 210 | 211 | # 2*30min 212 | # method=probability, MAE=5.80136525334 213 | # method=median, MAE=11.3674646669 214 | # method=cut, MAE=29.5907124315 215 | # 3*30min 216 | # 
method=probability, MAE=8.06143640035 217 | # method=median, MAE=16.5051437362 218 | # method=cut, MAE=44.0982597827 219 | # 4*30min 220 | # method=probability, MAE=11.9503893856 221 | # method=median, MAE=21.4089029901 222 | # method=cut, MAE=58.3183347755 223 | # 5*30min 224 | # method=probability, MAE=15.1259494279 225 | # method=median, MAE=28.3938082877 226 | # method=cut, MAE=72.5562926642 227 | 228 | -------------------------------------------------------------------------------- /code/examples_sklearn/sk_gmm.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import matplotlib as mpl 5 | from scipy import linalg 6 | from sklearn import cluster 7 | 8 | EPS = np.finfo(float).eps 9 | 10 | def log_multivariate_normal_density(X, means, covars, covariance_type='diag'): 11 | log_multivariate_normal_density_dict = { 12 | 'spherical': _log_multivariate_normal_density_spherical, 13 | 'tied': _log_multivariate_normal_density_tied, 14 | 'diag': _log_multivariate_normal_density_diag, 15 | 'full': _log_multivariate_normal_density_full} 16 | return log_multivariate_normal_density_dict[covariance_type](X, means, covars) 17 | 18 | def distribute_covar_matrix_to_match_covariance_type(tied_cv, covariance_type, n_components): 19 | if covariance_type == 'spherical': 20 | cv = np.tile(tied_cv.mean() * np.ones(tied_cv.shape[1]), (n_components, 1)) 21 | elif covariance_type == 'tied': 22 | cv = tied_cv 23 | elif covariance_type == 'diag': 24 | cv = np.tile(np.diag(tied_cv), (n_components, 1)) 25 | elif covariance_type == 'full': 26 | cv = np.tile(tied_cv, (n_components, 1, 1)) 27 | else: 28 | raise ValueError("covariance_type must be one of 'spherical', 'tied', 'diag', 'full'") 29 | return cv 30 | 31 | def _covar_mstep_spherical(*args): 32 | cv = _covar_mstep_diag(*args) 33 | return np.tile(cv.mean(axis=1)[:, np.newaxis], (1, cv.shape[1])) 34 | 35 | def _covar_mstep_diag(gmm, X, responsibilities, weighted_X_sum, norm, min_covar): 36 | avg_X2 = np.dot(responsibilities.T, X * X) * norm 37 | avg_means2 = gmm.means_ ** 2 38 | avg_X_means = gmm.means_ * weighted_X_sum * norm 39 | return avg_X2 - 2 * avg_X_means + avg_means2 + min_covar 40 | 41 | def _covar_mstep_tied(gmm, X, responsibilities, weighted_X_sum, norm, min_covar): 42 | n_features = X.shape[1] 43 | avg_X2 = np.dot(X.T, X) 44 | avg_means2 = np.dot(gmm.means_.T, weighted_X_sum) 45 | return (avg_X2 - avg_means2 + min_covar * np.eye(n_features)) / X.shape[0] 46 | 47 | def _covar_mstep_full(gmm, X, responsibilities, weighted_X_sum, norm, min_covar): 48 | n_features = X.shape[1] 49 | cv = np.empty((gmm.n_components, n_features, n_features)) 50 | for c in range(gmm.n_components): 51 | post = responsibilities[:, c] 52 | np.seterr(under='ignore') 53 | avg_cv = np.dot(post * X.T, X) / (post.sum() + 10 * EPS) 54 | mu = gmm.means_[c][np.newaxis] 55 | cv[c] = (avg_cv - np.dot(mu.T, mu) + min_covar * np.eye(n_features)) 56 | return cv 57 | 58 | _covar_mstep_funcs = {'spherical': _covar_mstep_spherical, 59 | 'diag': _covar_mstep_diag, 60 | 'tied': _covar_mstep_tied, 61 | 'full': _covar_mstep_full} 62 | 63 | def _log_multivariate_normal_density_spherical(X, means, covars): 64 | cv = covars.copy() 65 | if covars.ndim == 1: 66 | cv = cv[:, np.newaxis] 67 | if covars.shape[1] == 1: 68 | cv = np.tile(cv, (1, X.shape[-1])) 69 | return _log_multivariate_normal_density_diag(X, means, cv) 70 | 71 | def _log_multivariate_normal_density_tied(X, means, covars): 
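    # Descriptive note on the body below: with a "tied" covariance every mixture
    # component shares one covariance matrix, so a single pseudo-inverse (pinvh)
    # serves all components. The Mahalanobis term is expanded as
    #   (x - mu)' S^-1 (x - mu) = x' S^-1 x - 2 x' S^-1 mu + mu' S^-1 mu
    # so the per-sample, per-component log-density comes out of a few matrix products.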
72 | n_samples, n_dim = X.shape 73 | icv = pinvh(covars) 74 | lpr = -0.5 * (n_dim * np.log(2 * np.pi) + np.log(linalg.det(covars) + 0.1) + np.sum(X * np.dot(X, icv), 1)[:, np.newaxis] - 2 * np.dot(np.dot(X, icv), means.T) + np.sum(means * np.dot(means, icv), 1)) 75 | return lpr 76 | 77 | def _log_multivariate_normal_density_diag(X, means, covars): 78 | n_samples, n_dim = X.shape 79 | lpr = -0.5 * (n_dim * np.log(2 * np.pi) + np.sum(np.log(covars), 1) + np.sum((means ** 2) / covars, 1) - 2 * np.dot(X, (means / covars).T) + np.dot(X ** 2, (1.0 / covars).T)) 80 | return lpr 81 | 82 | def _log_multivariate_normal_density_full(X, means, covars, min_covar=1.e-7): 83 | n_samples, n_dim = X.shape 84 | nmix = len(means) 85 | log_prob = np.empty((n_samples, nmix)) 86 | for c, (mu, cv) in enumerate(zip(means, covars)): 87 | try: 88 | cv_chol = linalg.cholesky(cv, lower=True) 89 | except linalg.LinAlgError: 90 | cv_chol = linalg.cholesky(cv + min_covar * np.eye(n_dim), lower=True) 91 | cv_log_det = 2 * np.sum(np.log(np.diagonal(cv_chol))) 92 | cv_sol = linalg.solve_triangular(cv_chol, (X - mu).T, lower=True).T 93 | log_prob[:, c] = - .5 * (np.sum(cv_sol ** 2, axis=1) + n_dim * np.log(2 * np.pi) + cv_log_det) 94 | return log_prob 95 | 96 | def logsumexp(arr, axis=0): 97 | arr = np.rollaxis(arr, axis) 98 | vmax = arr.max(axis=0) 99 | out = np.log(np.sum(np.exp(arr - vmax), axis=0)) 100 | out += vmax 101 | return out 102 | 103 | def pinvh(a, cond=None, rcond=None, lower=True): 104 | a = np.asarray_chkfinite(a) 105 | s, u = linalg.eigh(a, lower=lower) 106 | if rcond is not None: 107 | cond = rcond 108 | if cond in [None, -1]: 109 | t = u.dtype.char.lower() 110 | factor = {'f': 1E3, 'd': 1E6} 111 | cond = factor[t] * np.finfo(t).eps 112 | above_cutoff = (abs(s) > cond * np.max(abs(s))) 113 | psigma_diag = np.zeros_like(s) 114 | psigma_diag[above_cutoff] = 1.0 / s[above_cutoff] 115 | return np.dot(u * psigma_diag, np.conjugate(u).T) 116 | 117 | class GMM(): 118 | def __init__(self, n_components=1, covariance_type='diag', thresh=1e-2, min_covar=1e-3, n_iter=100, n_init=1, params='wmc', init_params='wmc'): 119 | self.n_components = n_components 120 | self.covariance_type = covariance_type 121 | self.thresh = thresh 122 | self.min_covar = min_covar 123 | self.n_iter = n_iter 124 | self.n_init = n_init 125 | self.params = params 126 | self.init_params = init_params 127 | if not covariance_type in ['spherical', 'tied', 'diag', 'full']: 128 | raise ValueError('Invalid value for covariance_type: %s' % covariance_type) 129 | if n_init < 1: 130 | raise ValueError('GMM estimation requires at least one run') 131 | self.weights_ = np.ones(self.n_components) / self.n_components 132 | self.converged_ = False 133 | 134 | def _get_covars(self): 135 | if self.covariance_type == 'full': 136 | return self.covars_ 137 | elif self.covariance_type == 'diag': 138 | return [np.diag(cov) for cov in self.covars_] 139 | elif self.covariance_type == 'tied': 140 | return [self.covars_] * self.n_components 141 | elif self.covariance_type == 'spherical': 142 | return [np.diag(cov) for cov in self.covars_] 143 | 144 | def score_samples(self, X): 145 | X = np.asarray(X) 146 | if X.ndim == 1: 147 | X = X[:, np.newaxis] 148 | if X.size == 0: 149 | return np.array([]), np.empty((0, self.n_components)) 150 | if X.shape[1] != self.means_.shape[1]: 151 | raise ValueError('The shape of X is not compatible with self') 152 | lpr = (log_multivariate_normal_density(X, self.means_, self.covars_, self.covariance_type) + np.log(self.weights_)) 153 | 
logprob = logsumexp(lpr, axis=1) 154 | responsibilities = np.exp(lpr - logprob[:, np.newaxis]) 155 | return logprob, responsibilities 156 | 157 | def score(self, X): 158 | logprob, _ = self.score_samples(X) 159 | return logprob 160 | 161 | def predict(self, X): 162 | logprob, responsibilities = self.score_samples(X) 163 | return responsibilities.argmax(axis=1) 164 | 165 | def predict_proba(self, X): 166 | logprob, responsibilities = self.score_samples(X) 167 | return responsibilities 168 | 169 | def fit(self, X): 170 | X = np.asarray(X, dtype=np.float) 171 | if X.ndim == 1: 172 | X = X[:, np.newaxis] 173 | if X.shape[0] < self.n_components: 174 | raise ValueError('GMM estimation with %s components, but got only %s samples' % (self.n_components, X.shape[0])) 175 | max_log_prob = -np.infty 176 | print self.init_params 177 | for _ in range(self.n_init): 178 | if 'm' in self.init_params or not hasattr(self, 'means_'): 179 | self.means_ = cluster.KMeans(n_clusters=self.n_components).fit(X).cluster_centers_ 180 | if 'w' in self.init_params or not hasattr(self, 'weights_'): 181 | self.weights_ = np.tile(1.0 / self.n_components, self.n_components) 182 | if 'c' in self.init_params or not hasattr(self, 'covars_'): 183 | cv = np.cov(X.T) + self.min_covar * np.eye(X.shape[1]) 184 | if not cv.shape: 185 | cv.shape = (1, 1) 186 | self.covars_ = distribute_covar_matrix_to_match_covariance_type(cv, self.covariance_type, self.n_components) 187 | # EM algorithms 188 | log_likelihood = [] 189 | self.converged_ = False 190 | for i in range(self.n_iter): 191 | # Expectation step 192 | curr_log_likelihood, responsibilities = self.score_samples(X) 193 | log_likelihood.append(curr_log_likelihood.sum()) 194 | # Check for convergence 195 | if i > 0 and abs(log_likelihood[-1] - log_likelihood[-2]) < self.thresh: 196 | self.converged_ = True 197 | break 198 | # Maximization step 199 | self._do_mstep(X, responsibilities, self.params, self.min_covar) 200 | # if the results are better, keep it 201 | if self.n_iter: 202 | if log_likelihood[-1] > max_log_prob: 203 | max_log_prob = log_likelihood[-1] 204 | best_params = {'weights': self.weights_, 'means': self.means_, 'covars': self.covars_} 205 | # check the existence of an init param that was not subject to likelihood computation issue 206 | if np.isneginf(max_log_prob) and self.n_iter: 207 | raise RuntimeError("EM algorithm was never able to compute a valid likelihood given initial parameters. Try different init parameters (or increasing n_init) or check for degenerate data.") 208 | if self.n_iter: 209 | self.covars_ = best_params['covars'] 210 | self.means_ = best_params['means'] 211 | self.weights_ = best_params['weights'] 212 | return self 213 | 214 | def _do_mstep(self, X, responsibilities, params, min_covar=0): 215 | weights = responsibilities.sum(axis=0) 216 | weighted_X_sum = np.dot(responsibilities.T, X) 217 | inverse_weights = 1.0 / (weights[:, np.newaxis] + 10 * EPS) 218 | if 'w' in params: 219 | self.weights_ = (weights / (weights.sum() + 10 * EPS) + EPS) 220 | if 'm' in params: 221 | self.means_ = weighted_X_sum * inverse_weights 222 | if 'c' in params: 223 | covar_mstep_func = _covar_mstep_funcs[self.covariance_type] 224 | self.covars_ = covar_mstep_func( 225 | self, X, responsibilities, weighted_X_sum, inverse_weights, 226 | min_covar) 227 | return weights 228 | 229 | def _n_parameters(self): 230 | ndim = self.means_.shape[1] 231 | if self.covariance_type == 'full': 232 | cov_params = self.n_components * ndim * (ndim + 1) / 2. 
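        # Free-parameter count used by bic()/aic(): a full covariance is a symmetric
        # d x d matrix, i.e. d*(d+1)/2 entries per component; the branches below count
        # d per component (diag), d*(d+1)/2 shared once (tied), or 1 per component
        # (spherical). Means add d per component; the mixture weights add
        # n_components - 1, since they are constrained to sum to one.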
233 | elif self.covariance_type == 'diag': 234 | cov_params = self.n_components * ndim 235 | elif self.covariance_type == 'tied': 236 | cov_params = ndim * (ndim + 1) / 2. 237 | elif self.covariance_type == 'spherical': 238 | cov_params = self.n_components 239 | mean_params = ndim * self.n_components 240 | return int(cov_params + mean_params + self.n_components - 1) 241 | 242 | def bic(self, X): 243 | return (-2 * self.score(X).sum() + self._n_parameters() * np.log(X.shape[0])) 244 | 245 | def aic(self, X): 246 | return - 2 * self.score(X).sum() + 2 * self._n_parameters() 247 | 248 | n_samples = 500 249 | np.random.seed(0) 250 | C = np.array([[0., -0.1], [1.7, .4]]) 251 | X = np.r_[np.dot(np.random.randn(n_samples, 2), C), .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])] 252 | gmm = GMM(n_components=2, covariance_type='spherical') 253 | gmm.fit(X) 254 | clf, title = gmm, 'GMM' 255 | splot = plt.subplot(1, 1, 1) 256 | Y_ = clf.predict(X) 257 | for i, (mean, covar, color) in enumerate(zip(clf.means_, clf._get_covars(), itertools.cycle(['r', 'g']))): 258 | v, w = linalg.eigh(covar) 259 | u = w[0] / linalg.norm(w[0]) 260 | plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color) 261 | angle = np.arctan(u[1] / u[0]) 262 | angle = 180 * angle / np.pi 263 | ell = mpl.patches.Ellipse(mean, v[0]*3, v[1]*3, 180 + angle, color=color) 264 | ell.set_clip_box(splot.bbox) 265 | ell.set_alpha(0.5) 266 | splot.add_artist(ell) 267 | plt.xlim(-10, 10) 268 | plt.ylim(-3, 6) 269 | plt.xticks(()) 270 | plt.yticks(()) 271 | plt.title(title) 272 | plt.show() 273 | -------------------------------------------------------------------------------- /code/model_combine.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import glob 4 | import math 5 | import json 6 | import random 7 | import fileinput 8 | 9 | # 时间粒度为10分钟 10 | # 空间粒度为200米 11 | # 20-24为工作日 12 | 13 | def euclidean(p1, p2): 14 | return ((p1[0]-p2[0])**2+(p1[1]-p2[1])**2)**0.5 15 | 16 | def gauss(y, mu, sigma): 17 | # print "gauss computation:", y, mu, sigma 18 | return 1./math.sqrt((sigma[0][0]*sigma[1][1]-sigma[0][1]*sigma[1][0]))*math.exp(-0.5/(sigma[0][0]*sigma[1][1]-sigma[0][1]*sigma[1][0])*(sigma[0][0]*(y[0]-mu[0])**2-(y[0]-mu[0])*(y[1]-mu[1])*(sigma[0][1]+sigma[1][0])+sigma[1][1]*(y[1]-mu[1])**2)) 19 | # print gauss([1,1],[0,0],[[1,0],[0,1]]) 20 | 21 | K, R, alpha, beta, iter_num, data = 5, 20, 0.2, 0.2, 20, [] 22 | 23 | def run_model(save=True): 24 | # 数据准备 25 | mu_t = [[48,108], [108,48], [60,128], [128,60], [72,84]] 26 | sigma_t = [[[36,0],[36,0]], [[36,0],[36,0]], [[36,0],[36,0]], [[36,0],[36,0]], [[36,0],[36,0]]] 27 | assert K == len(mu_t) and K == len(sigma_t) 28 | print "Total Cluster: {0}".format(K) 29 | 30 | for line in fileinput.input("../data/stationary.txt"): 31 | st, ft, gx, gy = int(line.strip().split(" ")[3]), int(line.strip().split(" ")[4]), int(line.strip().split(" ")[5]), int(line.strip().split(" ")[6]) 32 | ds = [((st-mu_t[k][0])**2+(ft-mu_t[k][1])**2)**0.5 for k in xrange(K)] 33 | data.append([st,ft,gx,gy,ds.index(min(ds)),-1]) 34 | fileinput.close() 35 | 36 | # from sklearn import mixture 37 | # est = mixture.GMM(n_components=R, covariance_type="full") 38 | # est.fit([session[2:4] for session in data]) 39 | # print [[int(i) for i in list(means)] for means in est.means_] 40 | rs = [[165, 87], [78, 68], [77, 59], [82, 98], [46, 94], [86, 68], [37, 65], [69, 57], [77, 78], [92, 25], [44, 14], [71, 84], [66, 79], [61, 70], [71, 28], [14, 128], [100, 75], 
[75, 63], [85, 34], [54, 76]] 41 | assert R == len(rs) 42 | print "Total Region: {0}".format(R) 43 | 44 | # 初值选取 45 | for sl in data: 46 | rd = [euclidean(sl[2:4], rs[r]) for r in xrange(R)] 47 | sl[-1] = rd.index(min(rd)) 48 | 49 | # 初始化 50 | L = len(data) 51 | len_k = [float(len(filter(lambda x:x[4]==k, data))) for k in xrange(K)] 52 | len_k_r = [[float(len(filter(lambda x:x[4]==k and x[5]==r, data))) \ 53 | for r in xrange(R)] for k in xrange(K)] 54 | 55 | mu1_t = [[float(sum(map(lambda x:x[0],filter(lambda x:x[4]==k, data)))), \ 56 | float(sum(map(lambda x:x[1],filter(lambda x:x[4]==k, data))))] \ 57 | for k in xrange(K)] 58 | mu2_t = [[float(sum(map(lambda x:x[0]**2,filter(lambda x:x[4]==k, data)))), \ 59 | float(sum(map(lambda x:x[1]**2,filter(lambda x:x[4]==k, data)))), \ 60 | float(sum(map(lambda x:x[0]*x[1],filter(lambda x:x[4]==k, data))))] \ 61 | for k in xrange(K)] 62 | mu_t = [[mu1_t[k][0]/len_k[k], mu1_t[k][1]/len_k[k]] for k in xrange(K)] 63 | sigma_t = [[[mu2_t[k][0]/len_k[k]-mu_t[k][0]**2,\ 64 | (mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1]],\ 65 | [(mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1],\ 66 | mu2_t[k][1]/len_k[k]-mu_t[k][1]**2]] \ 67 | for k in xrange(K)] 68 | 69 | mu1_k_r = [[[float(sum(map(lambda x:x[2],filter(lambda x:x[4]==k and x[5]==r, data)))), \ 70 | float(sum(map(lambda x:x[3],filter(lambda x:x[4]==k and x[5]==r, data))))] \ 71 | for r in xrange(R)] for k in xrange(K)] 72 | mu2_k_r = [[[float(sum(map(lambda x:x[2]**2,filter(lambda x:x[4]==k and x[5]==r, data)))), \ 73 | float(sum(map(lambda x:x[3]**2,filter(lambda x:x[4]==k and x[5]==r, data)))), \ 74 | float(sum(map(lambda x:x[2]*x[3],filter(lambda x:x[4]==k and x[5]==r, data))))] \ 75 | for r in xrange(R)] for k in xrange(K)] 76 | mu_k_r = [[[mu1_k_r[k][r][0]/len_k_r[k][r], mu1_k_r[k][r][1]/len_k_r[k][r]] for r in xrange(R)] for k in xrange(K)] 77 | sigma_k_r = [[[[mu2_k_r[k][r][0]/len_k_r[k][r]-mu_k_r[k][r][0]**2,\ 78 | (mu2_k_r[k][r][2]-mu_k_r[k][r][0]*mu1_k_r[k][r][1]-mu_k_r[k][r][1]*mu1_k_r[k][r][0])/len_k_r[k][r]+mu_k_r[k][r][0]*mu_k_r[k][r][1]],\ 79 | [(mu2_k_r[k][r][2]-mu_k_r[k][r][0]*mu1_k_r[k][r][1]-mu_k_r[k][r][1]*mu1_k_r[k][r][0])/len_k_r[k][r]+mu_k_r[k][r][0]*mu_k_r[k][r][1],\ 80 | mu2_k_r[k][r][1]/len_k_r[k][r]-mu_k_r[k][r][1]**2]] \ 81 | for r in xrange(R)] for k in xrange(K)] 82 | 83 | # 迭代计算 84 | def find_index(X, a): 85 | for i in xrange(len(X)): 86 | for j in xrange(len(X[0])): 87 | if X[i][j] == a: 88 | return (i, j) 89 | 90 | for iter_curr in xrange(iter_num): 91 | likelihood = 0 92 | for i in xrange(L): 93 | item, old_k, old_r = data[i], data[i][4], data[i][5] 94 | probs = [[1.*len_k[k]/sum(len_k)*len_k_r[k][r]/sum(len_k_r[k])*\ 95 | gauss(item[0:2],mu_t[k],sigma_t[k])*\ 96 | gauss(item[2:4],mu_k_r[k][r],sigma_k_r[k][r]) \ 97 | for r in xrange(R)] for k in xrange(K)] 98 | prob_max = max([max(prob) for prob in probs]) 99 | likelihood += -math.log10(sum([sum(prob) for prob in probs])) 100 | new_k, new_r = find_index(probs, prob_max) 101 | data[i][4], data[i][5] = new_k, new_r 102 | len_k[old_k] -= 1; len_k[new_k] += 1 103 | len_k_r[old_k][old_r] -= 1; len_k_r[new_k][new_r] += 1 104 | mu1_t[old_k][0] -= item[0]; mu1_t[old_k][1] -= item[1] 105 | mu1_t[new_k][0] += item[0]; mu1_t[new_k][1] += item[1] 106 | mu2_t[old_k][0] -= item[0]**2; mu2_t[old_k][1] -= item[1]**2; mu2_t[old_k][2] -= item[0]*item[1] 107 | mu2_t[new_k][0] += item[0]**2; mu2_t[new_k][1] += item[1]**2; mu2_t[new_k][2] += 
item[0]*item[1] 108 | mu_t = [[mu1_t[k][0]/len_k[k], mu1_t[k][1]/len_k[k]] for k in xrange(K)] 109 | sigma_t = [[[mu2_t[k][0]/len_k[k]-mu_t[k][0]**2,\ 110 | (mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1]],\ 111 | [(mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1],\ 112 | mu2_t[k][1]/len_k[k]-mu_t[k][1]**2]] \ 113 | for k in xrange(K)] 114 | mu1_k_r[old_k][old_r][0] -= item[2]; mu1_k_r[old_k][old_r][1] -= item[3] 115 | mu1_k_r[new_k][new_r][0] += item[2]; mu1_k_r[new_k][new_r][1] += item[3] 116 | mu2_k_r[old_k][old_r][0] -= item[2]**2; mu2_k_r[old_k][old_r][1] -= item[3]**2; mu2_k_r[old_k][old_r][2] -= item[2]*item[3]; 117 | mu2_k_r[new_k][new_r][0] += item[2]**2; mu2_k_r[new_k][new_r][1] += item[3]**2; mu2_k_r[new_k][new_r][2] += item[2]*item[3]; 118 | mu_k_r = [[[mu1_k_r[k][r][0]/len_k_r[k][r], mu1_k_r[k][r][1]/len_k_r[k][r]] for r in xrange(R)] for k in xrange(K)] 119 | sigma_k_r = [[[[mu2_k_r[k][r][0]/len_k_r[k][r]-mu_k_r[k][r][0]**2,\ 120 | (mu2_k_r[k][r][2]-mu_k_r[k][r][0]*mu1_k_r[k][r][1]-mu_k_r[k][r][1]*mu1_k_r[k][r][0])/len_k_r[k][r]+mu_k_r[k][r][0]*mu_k_r[k][r][1]],\ 121 | [(mu2_k_r[k][r][2]-mu_k_r[k][r][0]*mu1_k_r[k][r][1]-mu_k_r[k][r][1]*mu1_k_r[k][r][0])/len_k_r[k][r]+mu_k_r[k][r][0]*mu_k_r[k][r][1],\ 122 | mu2_k_r[k][r][1]/len_k_r[k][r]-mu_k_r[k][r][1]**2]] \ 123 | for r in xrange(R)] for k in xrange(K)] 124 | print iter_curr, likelihood, len_k 125 | 126 | if save: 127 | with open('model_save/iter_{0}.txt'.format(str(iter_curr).zfill(2)),'w') as f: 128 | f.write(json.dumps({"likelihood":likelihood, 129 | "len_k":len_k, 130 | "len_k_r":len_k_r, 131 | "mu_t":mu_t, 132 | "sigma_t":sigma_t, 133 | "mu_k_r":mu_k_r, 134 | "sigma_k_r":sigma_k_r})) 135 | 136 | def plot_distribution(iter_curr): 137 | from pylab import * 138 | 139 | param = json.loads(open('model_save/iter_{0}.txt'.format(iter_curr),'r').read()) 140 | len_k, len_k_r = param['len_k'], param['len_k_r'] 141 | mu_t, sigma_t = param['mu_t'], param['sigma_t'] 142 | mu_k_r, sigma_k_r = param['mu_k_r'], param['sigma_k_r'] 143 | 144 | # 时间分布 145 | plt.figure(figsize=(12,5)) 146 | norm1 = cm.colors.Normalize(vmax=0.0020, vmin=0) 147 | for c, k in enumerate([4,0,1,2,3]): 148 | matrix = [[0 for j in xrange(24*6)] for i in xrange(24*6)] 149 | for ts in xrange(24*6): 150 | for tf in xrange(24*6): 151 | matrix[ts][tf] = 1.*(len_k[k]/sum(len_k))*gauss([ts,tf],mu_t[k],sigma_t[k]) 152 | (X, Y), C = meshgrid(np.arange(24*6), np.arange(24*6)), np.array(matrix) 153 | subplot(2,5,1+c) 154 | cset = pcolormesh(X, Y, C.T, cmap=cm.get_cmap("OrRd"), norm=norm1) 155 | plt.axis([0, 24*6, 0, 24*6]) 156 | plt.xticks(np.linspace(0,24*6,7)) 157 | plt.yticks(np.linspace(0,24*6,7)) 158 | if c == 0: 159 | plt.xlabel('Session start time slot /10min') 160 | plt.ylabel('Session end time slot /10min') 161 | cax1 = axes([0.92, 0.54, 0.01, 0.35]) 162 | colorbar(cax=cax1) 163 | # plt.axis('off') 164 | 165 | # 空间分布 166 | subplots_adjust(hspace=0.4) 167 | norm2 = cm.colors.Normalize(vmax=0.0040, vmin=0) 168 | for c, k in enumerate([4,0,1,2,3]): 169 | matrix = [[0 for j in xrange(150)] for i in xrange(225)] 170 | for gx in xrange(225): 171 | for gy in xrange(150): 172 | matrix[gx][gy] = 1.*(len_k[k]/sum(len_k))*sum([1.*(len_k_r[k][r]/sum(len_k_r[k]))*gauss([gx,gy],mu_k_r[k][r],sigma_k_r[k][r]) for r in xrange(R)]) 173 | (X, Y), C = meshgrid(np.arange(100), np.arange(100)), np.array(matrix)[20:120,20:120] 174 | subplot(2,5,6+c) 175 | cset = pcolormesh(X, Y, C.T, 
cmap=cm.get_cmap("OrRd"), norm=norm2) 176 | plt.axis([0, 100-1, 0, 100-1]) 177 | plt.xticks(np.linspace(0,100,6)) 178 | plt.yticks(np.linspace(0,100,6)) 179 | if c == 0: 180 | plt.xlabel('Longitude grid index /200m') 181 | plt.ylabel('Latitude grid index /200m') 182 | subplots_adjust(bottom=0.1, left=0.06, right=0.9, top=0.9) 183 | cax2 = axes([0.92, 0.09, 0.01, 0.35]) 184 | colorbar(cax=cax2) 185 | # plt.axis('off') 186 | # show() 187 | for postfix in ('eps','png'): 188 | savefig('../figure/{0}/05.{0}'.format(postfix)) 189 | 190 | def plot_iteration_likelihood(): 191 | from pylab import * 192 | 193 | iterations, likelihoods = [], [] 194 | for iteration, filename in enumerate(sorted(glob.glob(r"model_save/iter_*.txt"))): 195 | likelihood = json.loads(open(filename,'r').read()).get("likelihood",0) 196 | iterations.append(iteration) 197 | likelihoods.append(likelihood/10**4) 198 | 199 | fig = plt.figure() 200 | ax1 = fig.add_subplot(111) 201 | plot(iterations, likelihoods, 'k-', label="Likelihood", linewidth=2) 202 | plt.xlabel('Number for iteration') 203 | plt.ylabel('$-10^{-4} \\times$ log likelihood') 204 | handles, labels = ax1.get_legend_handles_labels() 205 | ax1.legend(handles, labels) 206 | # show() 207 | for postfix in ('eps','png'): 208 | savefig('../figure/{0}/06.{0}'.format(postfix)) 209 | 210 | def compute_error(iter_curr): 211 | import numpy as np 212 | 213 | param = json.loads(open('model_save/iter_{0}.txt'.format(iter_curr),'r').read()) 214 | len_k, len_k_r = param['len_k'], param['len_k_r'] 215 | mu_t, sigma_t = param['mu_t'], param['sigma_t'] 216 | mu_k_r, sigma_k_r = param['mu_k_r'], param['sigma_k_r'] 217 | 218 | # 时间分布 219 | matrix1 = [[0 for j in xrange(24*6)] for i in xrange(24*6)] 220 | for line in fileinput.input("../data/stationary.txt"): 221 | st, ft = int(line.strip().split(" ")[3]), int(line.strip().split(" ")[4]) 222 | matrix1[st][ft] += 1 223 | fileinput.close() 224 | matrix2 = [[0 for j in xrange(24*6)] for i in xrange(24*6)] 225 | for k in xrange(K): 226 | for st in xrange(24*6): 227 | for ft in xrange(24*6): 228 | matrix2[st][ft] += 1.*(len_k[k]/sum(len_k))*gauss([st,ft],mu_t[k],sigma_t[k]) 229 | matrix1 = 1.*np.array(matrix1)/np.array(matrix1).sum() 230 | matrix2 = 1.*np.array(matrix2)/np.array(matrix2).sum() 231 | print "Temporal reconstruction accuracy:", 1-(abs(matrix1-matrix2)[40:60,100:140].sum()+abs(matrix1-matrix2)[100:140,40:60].sum()) 232 | 233 | # 空间分布 234 | matrix1 = [[0 for j in xrange(150)] for i in xrange(225)] 235 | for line in fileinput.input("../data/stationary.txt"): 236 | gx, gy = int(line.strip().split(" ")[5]), int(line.strip().split(" ")[6]) 237 | matrix1[gx][gy] += 1 238 | fileinput.close() 239 | matrix2 = [[0 for j in xrange(150)] for i in xrange(225)] 240 | for k in xrange(K): 241 | for r in xrange(R): 242 | for gx in xrange(225): 243 | for gy in xrange(150): 244 | matrix2[gx][gy] += 1.*(len_k[k]/sum(len_k))*(len_k_r[k][r]/sum(len_k_r[k]))*gauss([gx,gy],mu_k_r[k][r],sigma_k_r[k][r]) 245 | matrix1 = 1.*np.array(matrix1)/np.array(matrix1).sum() 246 | matrix2 = 1.*np.array(matrix2)/np.array(matrix2).sum() 247 | print "Spatial reconstruction accuracy:", 1-abs(matrix1-matrix2)[50:90,50:90].sum() 248 | 249 | 250 | if __name__ == "__main__": 251 | # run_model(save=True) 252 | plot_distribution(19) 253 | # plot_iteration_likelihood() 254 | # compute_error(19) 255 | 256 | # Likelihood 257 | # model: 2636999 258 | # baseline1: 3336792 259 | # baseline2: 2644190 260 | 261 | # model: 262 | # Temporal reconstruction accuracy: 0.867786928604 263 
| # Spatial reconstruction accuracy: 0.746619059294 264 | # baseline1: 265 | # Temporal reconstruction accuracy: 0.861796706542 266 | # Spatial reconstruction accuracy: 0.659659953592 267 | # baseline2: 268 | # Temporal reconstruction accuracy: 0.868461451694 269 | # Spatial reconstruction accuracy: 0.719261119801 270 | 271 | --------------------------------------------------------------------------------
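A small cross-check for the 2-D Gaussian helper gauss(y, mu, sigma) that model_combine.py defines and that model_baseline2.py calls in the same form: the sketch below is the textbook bivariate normal density written in NumPy, not code from the repository, and the name ref_bivariate_pdf is introduced here only for illustration. gauss() drops the 1/(2*pi) normalising constant, which cancels in the relative comparisons the samplers make; when comparing the two, also note which diagonal entry of sigma multiplies which squared deviation in the expanded quadratic form.

# Minimal sketch (assumption: plain NumPy available), not part of the repository.
import numpy as np

def ref_bivariate_pdf(y, mu, sigma):
    # Textbook bivariate normal density. In the expanded quadratic form,
    # sigma[1][1] multiplies the squared deviation in the first coordinate
    # and sigma[0][0] the one in the second.
    y, mu = np.asarray(y, dtype=float), np.asarray(mu, dtype=float)
    S = np.asarray(sigma, dtype=float)
    det = S[0, 0] * S[1, 1] - S[0, 1] * S[1, 0]
    d = y - mu
    quad = (S[1, 1] * d[0] ** 2
            - (S[0, 1] + S[1, 0]) * d[0] * d[1]
            + S[0, 0] * d[1] ** 2) / det
    return np.exp(-0.5 * quad) / (2 * np.pi * np.sqrt(det))

# Example: for the identity covariance this agrees with
# gauss([1, 1], [0, 0], [[1, 0], [0, 1]]) up to the constant factor 1/(2*pi).
print(ref_bivariate_pdf([1, 1], [0, 0], [[1.0, 0.0], [0.0, 1.0]]))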