├── figure
    ├── 01.png
    ├── 02.png
    ├── 03.png
    ├── 04.png
    ├── m1.png
    ├── t1.png
    ├── t2.png
    ├── t3.png
    ├── alg1.png
    ├── alg2.png
    ├── alg3.png
    └── system.png
├── code
    ├── examples_bayes
    │   ├── example_other
    │   │   ├── stan-reference-2.6.0.pdf
    │   │   ├── test_emcee.py
    │   │   └── test_pystan.py
    │   ├── example_pymc
    │   │   ├── ldamc.py
    │   │   └── disaster_model.py
    │   └── bayes_model.py
    ├── examples_sklearn
    │   ├── plot_gmm_pdf.py
    │   ├── plot_gmm.py
    │   ├── plot_gmm_sin.py
    │   ├── plot_gmm_selection.py
    │   ├── plot_gmm_classifier.py
    │   └── sk_gmm.py
    ├── extract_trace.py
    ├── model_baseline1.py
    ├── stationary_detection.py
    ├── model_baseline2.py
    ├── stationary_segmentation.py
    └── model_combine.py
└── README.md


/figure/01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/01.png


--------------------------------------------------------------------------------
/figure/02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/02.png


--------------------------------------------------------------------------------
/figure/03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/03.png


--------------------------------------------------------------------------------
/figure/04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/04.png


--------------------------------------------------------------------------------
/figure/m1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/m1.png


--------------------------------------------------------------------------------
/figure/t1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/t1.png


--------------------------------------------------------------------------------
/figure/t2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/t2.png


--------------------------------------------------------------------------------
/figure/t3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/t3.png


--------------------------------------------------------------------------------
/figure/alg1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/alg1.png


--------------------------------------------------------------------------------
/figure/alg2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/alg2.png


--------------------------------------------------------------------------------
/figure/alg3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/alg3.png


--------------------------------------------------------------------------------
/figure/system.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/figure/system.png


--------------------------------------------------------------------------------
/code/examples_bayes/example_other/stan-reference-2.6.0.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/HEAD/code/examples_bayes/example_other/stan-reference-2.6.0.pdf


--------------------------------------------------------------------------------
/code/examples_bayes/example_other/test_emcee.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import emcee
 3 | 
 4 | def lnprob(x, ivar):
 5 | 	return -0.5 * np.sum(ivar * x ** 2)
 6 | 
 7 | ndim, nwalkers = 10, 100
 8 | ivar = 1. / np.random.rand(ndim)
 9 | p0 = [np.random.rand(ndim) for i in range(nwalkers)]
10 | sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=[ivar])
11 | sampler.run_mcmc(p0, 1000)
12 | 


--------------------------------------------------------------------------------
/code/examples_bayes/example_other/test_pystan.py:
--------------------------------------------------------------------------------
 1 | # import numpy as np
 2 | # import emcee
 3 | 
 4 | # def lnprob(x, ivar):
 5 | # 	return -0.5 * np.sum(ivar * x ** 2)
 6 | 
 7 | # ndim, nwalkers = 10, 100
 8 | # ivar = 1. / np.random.rand(ndim)
 9 | # p0 = [np.random.rand(ndim) for i in range(nwalkers)]
10 | # sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, args=[ivar])
11 | # sampler.run_mcmc(p0, 1000)
12 | 
13 | import pystan
14 | 
15 | schools_code = """
16 | data {
17 |     int<lower=0> J; // number of schools
18 |     real y[J]; // estimated treatment effects
19 |     real<lower=0> sigma[J]; // s.e. of effect estimates
20 | }
21 | parameters {
22 |     real mu;
23 |     real<lower=0> tau;
24 |     real eta[J];
25 | }
26 | transformed parameters {
27 |     real theta[J];
28 |     for (j in 1:J)
29 |     theta[j] <- mu + tau * eta[j];
30 | }
31 | model {
32 |     eta ~ normal(0, 1);
33 |     y ~ normal(theta, sigma);
34 | }
35 | """
36 | 
37 | schools_dat = {'J': 8,
38 |                'y': [28,  8, -3,  7, -1,  1, 18, 12],
39 |                'sigma': [15, 10, 16, 11,  9, 11, 10, 18]}
40 | 
41 | fit = pystan.stan(model_code=schools_code, data=schools_dat, iter=1000, chains=4)
42 | 


--------------------------------------------------------------------------------
/code/examples_bayes/example_pymc/ldamc.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import fileinput
 3 | import pymc as pm
 4 | import numpy as np
 5 | 
 6 | st = time.time()
 7 | K = 2 # number of topics
 8 | V = 3 # number of words
 9 | D = 3 # number of documents
10 | data = np.array([[1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2], [2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
11 | alpha = np.ones(K)
12 | beta = np.ones(V)
13 | theta = pm.Container([pm.CompletedDirichlet("theta_%s" % i, pm.Dirichlet("ptheta_%s" % i, theta=alpha)) for i in range(D)])
14 | phi = pm.Container([pm.CompletedDirichlet("phi_%s" % k, pm.Dirichlet("pphi_%s" % k, theta=beta)) for k in range(K)])
15 | Wd = [len(doc) for doc in data]
16 | z = pm.Container([pm.Categorical('z_%i' % d, p=theta[d], size=Wd[d], value=np.random.randint(K,size=Wd[d])) for d in range(D)])
17 | w = pm.Container([pm.Categorical("w_%i_%i" % (d,i), p=pm.Lambda('phi_z_%i_%i' % (d,i), lambda z=z[d][i], phi=phi:phi[z]), value=data[d][i], observed=True) for d in range(D) for i in range(Wd[d])])
18 | model = pm.Model([theta, phi, z, w])
19 | mcmc = pm.MCMC(model)
20 | mcmc.sample(1000)
21 | ft = time.time()
22 | print ft-st
23 | print theta.value
24 | print phi.value


--------------------------------------------------------------------------------
/code/examples_sklearn/plot_gmm_pdf.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | from matplotlib.colors import LogNorm
 4 | from sklearn import mixture
 5 | 
 6 | n_samples = 300
 7 | # generate random sample, two components
 8 | np.random.seed(0)
 9 | # generate spherical data centered on (20, 20)
10 | shifted_gaussian = np.random.randn(n_samples, 2) + np.array([20, 20])
11 | # generate zero centered stretched Gaussian data
12 | C = np.array([[0., -0.7], [3.5, .7]])
13 | stretched_gaussian = np.dot(np.random.randn(n_samples, 2), C)
14 | # concatenate the two datasets into the final training set
15 | X_train = np.vstack([shifted_gaussian, stretched_gaussian])
16 | # fit a Gaussian Mixture Model with two components
17 | clf = mixture.GMM(n_components=2, covariance_type='full')
18 | clf.fit(X_train)
19 | 
20 | # display predicted scores by the model as a contour plot
21 | x = np.linspace(-20.0, 30.0)
22 | y = np.linspace(-20.0, 40.0)
23 | X, Y = np.meshgrid(x, y)
24 | XX = np.array([X.ravel(), Y.ravel()]).T
25 | Z = -clf.score_samples(XX)[0]
26 | Z = Z.reshape(X.shape)
27 | 
28 | CS = plt.contour(X, Y, Z, norm=LogNorm(vmin=1.0, vmax=1000.0), levels=np.logspace(0, 3, 10))
29 | CB = plt.colorbar(CS, shrink=0.8, extend='both')
30 | plt.scatter(X_train[:, 0], X_train[:, 1], .8)
31 | 
32 | plt.title('Negative log-likelihood predicted by a GMM')
33 | plt.axis('tight')
34 | plt.show()
35 | 


--------------------------------------------------------------------------------
/code/examples_bayes/example_pymc/disaster_model.py:
--------------------------------------------------------------------------------
 1 | """
 2 | A model for the disasters data with a changepoint
 3 | 
 4 | changepoint ~ U(0, 110)
 5 | early_mean ~ Exp(1.)
 6 | late_mean ~ Exp(1.)
 7 | disasters[t] ~ Po(early_mean if t <= switchpoint, late_mean otherwise)
 8 | 
 9 | """
10 | 
11 | from pymc import *
12 | from numpy import array, empty
13 | from numpy.random import randint
14 | 
15 | disasters_array = array([4, 5, 4, 0, 1, 4, 3, 4, 0, 6, 3, 3, 4, 0, 2, 6,
16 |                          3, 3, 5, 4, 5, 3, 1, 4, 4, 1, 5, 5, 3, 4, 2, 5,
17 |                          2, 2, 3, 4, 2, 1, 3, 2, 2, 1, 1, 1, 1, 3, 0, 0,
18 |                          1, 0, 1, 1, 0, 0, 3, 1, 0, 3, 2, 2, 0, 1, 1, 1,
19 |                          0, 1, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 0, 2,
20 |                          3, 3, 1, 1, 2, 1, 1, 1, 1, 2, 4, 2, 0, 0, 1, 4,
21 |                          0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1])
22 | 
23 | # Define data and stochastics
24 | switchpoint = DiscreteUniform(
25 |     'switchpoint',
26 |     lower=0,
27 |     upper=110,
28 |     doc='Switchpoint[year]')
29 | 
30 | early_mean = Exponential('early_mean', beta=1.)
31 | late_mean = Exponential('late_mean', beta=1.)
32 | 
33 | @deterministic(plot=False)
34 | def rate(s=switchpoint, e=early_mean, l=late_mean):
35 |     ''' Concatenate Poisson means '''
36 |     out = empty(len(disasters_array))
37 |     out[:s] = e
38 |     out[s:] = l
39 |     return out
40 | 
41 | disasters = Poisson('disasters', mu=rate, value=disasters_array, observed=True)
42 | 
43 | # import disaster_model
44 | from pymc import MCMC
45 | # M = MCMC(disaster_model)
46 | M = MCMC([switchpoint,early_mean,late_mean,rate,disasters])
47 | M.sample(iter=10000, burn=1000, thin=10)
48 | print switchpoint.value
49 | print rate.value
50 | print M.trace('switchpoint')[:]
51 | # from pymc.Matplot import plot
52 | # plot(M)
53 | 


--------------------------------------------------------------------------------
/code/examples_sklearn/plot_gmm.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | import numpy as np
 3 | from scipy import linalg
 4 | import matplotlib.pyplot as plt
 5 | import matplotlib as mpl
 6 | from sklearn import mixture
 7 | 
 8 | # Number of samples per component
 9 | n_samples = 500
10 | # Generate random sample, two components
11 | np.random.seed(0)
12 | C = np.array([[0., -0.1], [1.7, .4]])
13 | X = np.r_[np.dot(np.random.randn(n_samples, 2), C), .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])]
14 | # Fit a mixture of Gaussians with EM using five components
15 | gmm = mixture.GMM(n_components=5, covariance_type='full')
16 | gmm.fit(X)
17 | # Fit a Dirichlet process mixture of Gaussians using five components
18 | dpgmm = mixture.DPGMM(n_components=5, covariance_type='full')
19 | dpgmm.fit(X)
20 | 
21 | color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])
22 | for i, (clf, title) in enumerate([(gmm, 'GMM'), (dpgmm, 'Dirichlet Process GMM')]):
23 |     splot = plt.subplot(2, 1, 1 + i)
24 |     Y_ = clf.predict(X)
25 |     for i, (mean, covar, color) in enumerate(zip(clf.means_, clf._get_covars(), color_iter)):
26 |         v, w = linalg.eigh(covar)
27 |         u = w[0] / linalg.norm(w[0])
28 |         # as the DP will not use every component it has access to unless it needs it, we shouldn't plot the redundant components.
29 |         if not np.any(Y_ == i):
30 |             continue
31 |         plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
32 |         # Plot an ellipse to show the Gaussian component
33 |         angle = np.arctan(u[1] / u[0])
34 |         angle = 180 * angle / np.pi  # convert to degrees
35 |         ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color)
36 |         ell.set_clip_box(splot.bbox)
37 |         ell.set_alpha(0.5)
38 |         splot.add_artist(ell)
39 |     plt.xlim(-10, 10)
40 |     plt.ylim(-3, 6)
41 |     plt.xticks(())
42 |     plt.yticks(())
43 |     plt.title(title)
44 | 
45 | plt.show()
46 | 


--------------------------------------------------------------------------------
/code/extract_trace.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | from operator import add
 5 | from pyspark import SparkConf
 6 | from pyspark import SparkContext
 7 | 
 8 | def extract(line):
 9 | 	import time
10 | 	try:
11 | 		part = line.strip().replace('\"','').split(",")
12 | 		TTIME, LAC, CI, IMSI = part[1].split(" "), part[3], part[4], part[5]
13 | 		pt1, pt2, pt3 = TTIME[0].split("-"), TTIME[1].split("."), TTIME[2]
14 | 		year, month, day, hour, minute, second = int("20"+pt1[2]), {"AUG":8}[pt1[1]], int(pt1[0]), int(pt2[0]), int(pt2[1]), int(pt2[2])
15 | 		hour = hour if hour != 12 else 0
16 | 		hour = hour if pt3 == "AM" else hour+12
17 | 		secs = hour*3600+minute*60+second
18 | 		key = LAC+" "+CI
19 | 		sl = secs/(10*60)
20 | 		if bss.has_key(key):
21 | 			bs = bss[key]
22 | 			lng, lat = bs["lng"], bs["lat"]
23 | 			if 120.02<=lng<120.48 and 30.15<=lat<=30.42:
24 | 				gx, gy = int((lng-120.02)/(120.48-120.02)*225), int((lat-30.15)/(30.42-30.15)*150)
25 | 				return ((IMSI, sl), str(gx)+","+str(gy))
26 | 			else:
27 | 				return (("", -1), "")
28 | 		else:
29 | 			return (("", -1), "")
30 | 	except:
31 | 		return (("", -1), "")
32 | 
33 | global bss
34 | 
35 | if __name__ == "__main__":
36 | 	import fileinput
37 | 	bss = {}
38 | 	for line in fileinput.input("hz_base.txt"):
39 | 		part = line.strip().split(" ")
40 | 		num, lng, lat = part[1]+" "+part[2], float(part[3]), float(part[4])
41 | 		bss[num] = {"lng":lng, "lat":lat}
42 | 	fileinput.close()
43 | 	conf = SparkConf().setMaster('yarn-client') \
44 | 					  .setAppName('qiangsiwei') \
45 | 					  .set('spark.driver.maxResultSize', "8g")
46 | 	sc = SparkContext(conf = conf)
47 | 	filename = "0826"
48 | 	lines = sc.textFile("hdfs://namenode.omnilab.sjtu.edu.cn/user/qiangsiwei/hangzhou/original/{0}.csv".format(filename), 1)
49 | 	counts = lines.map(lambda x : extract(x)) \
50 | 				  .filter(lambda x : x[0][0]!="" and x[0][1]!=-1 and x[1]!="") \
51 | 				  .distinct() \
52 | 				  .groupByKey() \
53 | 				  .map(lambda x : (x[0][0],str(x[0][1])+":"+"-".join(sorted(x[1])))) \
54 | 				  .groupByKey() \
55 | 				  .map(lambda x : x[0]+"\t"+"|".join([str(it["sl"])+":"+it["gs"] for it in sorted([{"sl":int(line.split(":")[0]),"gs":line.split(":")[1]} for line in x[1]], key=lambda x:x["sl"])]))
56 | 	output = counts.saveAsTextFile("./hangzhou/SSTD/3G/{0}.csv".format(filename))
57 | 


--------------------------------------------------------------------------------
/code/examples_sklearn/plot_gmm_sin.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | import numpy as np
 3 | from scipy import linalg
 4 | import matplotlib.pyplot as plt
 5 | import matplotlib as mpl
 6 | from sklearn import mixture
 7 | from sklearn.externals.six.moves import xrange
 8 | 
 9 | # Number of samples per component
10 | n_samples = 100
11 | # Generate random sample following a sine curve
12 | np.random.seed(0)
13 | X = np.zeros((n_samples, 2))
14 | step = 4 * np.pi / n_samples
15 | 
16 | for i in xrange(X.shape[0]):
17 |     x = i * step - 6
18 |     X[i, 0] = x + np.random.normal(0, 0.1)
19 |     X[i, 1] = 3 * (np.sin(x) + np.random.normal(0, .2))
20 | color_iter = itertools.cycle(['r', 'g', 'b', 'c', 'm'])
21 | 
22 | # print X
23 | # # est = mixture.GMM(n_components=10, covariance_type='full', n_iter=100)
24 | # est = mixture.DPGMM(n_components=10, covariance_type='spherical', alpha=100., n_iter=100)
25 | # est.fit(X)
26 | # print est.means_
27 | 
28 | for i, (clf, title) in enumerate([
29 | 		(mixture.GMM(n_components=10, covariance_type='spherical', n_iter=100), "Expectation-maximization"),
30 |         (mixture.GMM(n_components=10, covariance_type='diag', n_iter=100), "Expectation-maximization"),
31 |         (mixture.GMM(n_components=10, covariance_type='tied', n_iter=100), "Expectation-maximization"),
32 |         (mixture.GMM(n_components=10, covariance_type='full', n_iter=100), "Expectation-maximization")]):
33 |     clf.fit(X)
34 |     splot = plt.subplot(4, 1, 1 + i)
35 |     Y_ = clf.predict(X)
36 |     for i, (mean, covar, color) in enumerate(zip(clf.means_, clf._get_covars(), color_iter)):
37 |         print mean
38 |         v, w = linalg.eigh(covar)
39 |         u = w[0] / linalg.norm(w[0])
40 |         # as the DP will not use every component it has access to unless it needs it, we shouldn't plot the redundant components.
41 |         if not np.any(Y_ == i):
42 |             continue
43 |         plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
44 |         # Plot an ellipse to show the Gaussian component
45 |         angle = np.arctan(u[1] / u[0])
46 |         angle = 180 * angle / np.pi  # convert to degrees
47 |         ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color)
48 |         ell.set_clip_box(splot.bbox)
49 |         ell.set_alpha(0.5)
50 |         splot.add_artist(ell)
51 |     plt.xlim(-6, 4 * np.pi - 6)
52 |     plt.ylim(-5, 5)
53 |     plt.title(title)
54 |     plt.xticks(())
55 |     plt.yticks(())
56 |     print "----- ----- -----"
57 | 
58 | plt.show()
59 | 


--------------------------------------------------------------------------------
/code/examples_sklearn/plot_gmm_selection.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | import numpy as np
 3 | from scipy import linalg
 4 | import matplotlib.pyplot as plt
 5 | import matplotlib as mpl
 6 | from sklearn import mixture
 7 | 
 8 | # Number of samples per component
 9 | n_samples = 500
10 | # Generate random sample, two components
11 | np.random.seed(0)
12 | C = np.array([[0., -0.1], [1.7, .4]])
13 | X = np.r_[np.dot(np.random.randn(n_samples, 2), C), .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])]
14 | 
15 | lowest_bic, bic = np.infty, []
16 | n_components_range = range(1, 7)
17 | cv_types = ['spherical', 'tied', 'diag', 'full']
18 | for cv_type in cv_types:
19 |     for n_components in n_components_range:
20 |         # Fit a mixture of Gaussians with EM
21 |         gmm = mixture.GMM(n_components=n_components, covariance_type=cv_type)
22 |         gmm.fit(X)
23 |         bic.append(gmm.bic(X))
24 |         if bic[-1] < lowest_bic:
25 |             lowest_bic = bic[-1]
26 |             best_gmm = gmm
27 | 
28 | bic = np.array(bic)
29 | color_iter = itertools.cycle(['k', 'r', 'g', 'b', 'c', 'm', 'y'])
30 | clf, bars = best_gmm, []
31 | 
32 | # Plot the BIC scores
33 | spl = plt.subplot(2, 1, 1)
34 | for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
35 |     xpos = np.array(n_components_range) + .2 * (i - 2)
36 |     bars.append(plt.bar(xpos, bic[i * len(n_components_range):(i + 1) * len(n_components_range)], width=.2, color=color))
37 | plt.xticks(n_components_range)
38 | plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
39 | plt.title('BIC score per model')
40 | xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 + .2 * np.floor(bic.argmin() / len(n_components_range))
41 | plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
42 | spl.set_xlabel('Number of components')
43 | spl.legend([b[0] for b in bars], cv_types)
44 | 
45 | # Plot the winner
46 | splot = plt.subplot(2, 1, 2)
47 | Y_ = clf.predict(X)
48 | for i, (mean, covar, color) in enumerate(zip(clf.means_, clf.covars_, color_iter)):
49 |     v, w = linalg.eigh(covar)
50 |     if not np.any(Y_ == i):
51 |         continue
52 |     plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
53 |     # Plot an ellipse to show the Gaussian component
54 |     angle = np.arctan2(w[0][1], w[0][0])
55 |     angle = 180 * angle / np.pi  # convert to degrees
56 |     v *= 4
57 |     ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color)
58 |     ell.set_clip_box(splot.bbox)
59 |     ell.set_alpha(.5)
60 |     splot.add_artist(ell)
61 | 
62 | plt.xlim(-10, 10)
63 | plt.ylim(-3, 6)
64 | plt.xticks(())
65 | plt.yticks(())
66 | plt.title('Selected GMM: full model, 2 components')
67 | plt.subplots_adjust(hspace=.35, bottom=.02)
68 | plt.show()
69 | 


--------------------------------------------------------------------------------
/code/examples_sklearn/plot_gmm_classifier.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import matplotlib as mpl
 3 | import numpy as np
 4 | 
 5 | from sklearn import datasets
 6 | from sklearn.cross_validation import StratifiedKFold
 7 | from sklearn.externals.six.moves import xrange
 8 | from sklearn.mixture import GMM
 9 | 
10 | def make_ellipses(gmm, ax):
11 |     for n, color in enumerate('rgb'):
12 |         v, w = np.linalg.eigh(gmm._get_covars()[n][:2, :2])
13 |         u = w[0] / np.linalg.norm(w[0])
14 |         angle = np.arctan2(u[1], u[0])
15 |         angle = 180 * angle / np.pi
16 |         v *= 9
17 |         ell = mpl.patches.Ellipse(gmm.means_[n, :2], v[0], v[1], 180 + angle, color=color)
18 |         ell.set_clip_box(ax.bbox)
19 |         ell.set_alpha(0.5)
20 |         ax.add_artist(ell)
21 | 
22 | iris = datasets.load_iris()
23 | skf = StratifiedKFold(iris.target, n_folds=4)
24 | train_index, test_index = next(iter(skf))
25 | 
26 | X_train = iris.data[train_index]
27 | y_train = iris.target[train_index]
28 | X_test = iris.data[test_index]
29 | y_test = iris.target[test_index]
30 | n_classes = len(np.unique(y_train))
31 | 
32 | # Try GMMs using different types of covariances.
33 | classifiers = dict((covar_type, GMM(n_components=n_classes, covariance_type=covar_type, init_params='wc', n_iter=20)) for covar_type in ['spherical', 'diag', 'tied', 'full'])
34 | n_classifiers = len(classifiers)
35 | plt.figure(figsize=(3 * n_classifiers / 2, 6))
36 | plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05, left=.01, right=.99)
37 | 
38 | for index, (name, classifier) in enumerate(classifiers.items()):
39 |     classifier.means_ = np.array([X_train[y_train == i].mean(axis=0) for i in xrange(n_classes)])
40 |     classifier.fit(X_train)
41 |     h = plt.subplot(2, n_classifiers / 2, index + 1)
42 |     make_ellipses(classifier, h)
43 |     for n, color in enumerate('rgb'):
44 |         data = iris.data[iris.target == n]
45 |         plt.scatter(data[:, 0], data[:, 1], 0.8, color=color, label=iris.target_names[n])
46 |     for n, color in enumerate('rgb'):
47 |         data = X_test[y_test == n]
48 |         plt.plot(data[:, 0], data[:, 1], 'x', color=color)
49 |     y_train_pred = classifier.predict(X_train)
50 |     train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
51 |     plt.text(0.05, 0.9, 'Train accuracy: %.1f' % train_accuracy, transform=h.transAxes)
52 |     y_test_pred = classifier.predict(X_test)
53 |     test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100
54 |     plt.text(0.05, 0.8, 'Test accuracy: %.1f' % test_accuracy, transform=h.transAxes)
55 |     plt.xticks(())
56 |     plt.yticks(())
57 |     plt.title(name)
58 | 
59 | plt.legend(loc='lower right', prop=dict(size=12))
60 | plt.show()
61 | 


--------------------------------------------------------------------------------
/code/model_baseline1.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*- 
  2 | 
  3 | import glob
  4 | import math
  5 | import json
  6 | import random
  7 | import fileinput
  8 | 
  9 | # 时间粒度为10分钟
 10 | # 空间粒度为200米
 11 | # 20-24为工作日
 12 | 
 13 | def euclidean(p1, p2):
 14 |     return ((p1[0]-p2[0])**2+(p1[1]-p2[1])**2)**0.5
 15 | 
 16 | def gauss(y, mu, sigma):
 17 |     # print "gauss computation:", y, mu, sigma
 18 |     return 1./math.sqrt((sigma[0][0]*sigma[1][1]-sigma[0][1]*sigma[1][0]))*math.exp(-0.5/(sigma[0][0]*sigma[1][1]-sigma[0][1]*sigma[1][0])*(sigma[0][0]*(y[0]-mu[0])**2-(y[0]-mu[0])*(y[1]-mu[1])*(sigma[0][1]+sigma[1][0])+sigma[1][1]*(y[1]-mu[1])**2))
 19 | # print gauss([1,1],[0,0],[[1,0],[0,1]])
 20 | 
 21 | K, R, data = 5, 20, []
 22 | 
 23 | # GMM
 24 | def run_baseline1():
 25 |     import numpy as np
 26 |     from sklearn import mixture
 27 | 
 28 |     for line in fileinput.input("../data/stationary.txt"):
 29 |         st, ft, gx, gy = int(line.strip().split(" ")[3]), int(line.strip().split(" ")[4]), int(line.strip().split(" ")[5]), int(line.strip().split(" ")[6])
 30 |         data.append([st,ft,gx,gy])
 31 |     fileinput.close()
 32 | 
 33 |     likelihood, param = 0, {}
 34 |     gmm_temporal = mixture.GMM(covariance_type="full",n_components=K)
 35 |     gmm_temporal.fit(np.array([sample[:2] for sample in data]))
 36 |     data_spatial = [[] for k in xrange(K)]
 37 |     param['gmm_temporal_weights'] = gmm_temporal.weights_.tolist()
 38 |     param['gmm_temporal_means'] = gmm_temporal.means_.tolist()
 39 |     param['gmm_temporal_covars'] = gmm_temporal.covars_.tolist()
 40 |     param['gmm_spatial'] = []
 41 |     for i, k in enumerate(gmm_temporal.predict([sample[:2] for sample in data])):
 42 |         data_spatial[k].append(data[i])
 43 |     for k in xrange(K):
 44 |         print '-'*10, k, '-'*10
 45 |         gmm_spatial = mixture.GMM(covariance_type="full",n_components=R)
 46 |         gmm_spatial.fit(np.array([sample[2:] for sample in data_spatial[k]]))
 47 |         param['gmm_spatial'].append({
 48 |             'weights': gmm_spatial.weights_.tolist(),
 49 |             'means': gmm_spatial.means_.tolist(),
 50 |             'covars': gmm_spatial.covars_.tolist()
 51 |         })
 52 |         for j, r in enumerate(gmm_spatial.predict([sample[2:] for sample in data_spatial[k]])):
 53 |             prob = 1.*gmm_temporal.weights_[k]*gmm_spatial.weights_[r]*\
 54 |                     gauss(data_spatial[k][j][:2],gmm_temporal.means_[k],gmm_temporal.covars_[k])*\
 55 |                     gauss(data_spatial[k][j][2:],gmm_spatial.means_[r],gmm_spatial.covars_[r])
 56 |             likelihood += -math.log10(prob)
 57 |     print likelihood
 58 | 
 59 |     with open('model_save/baseline1.txt','w') as f:
 60 |         f.write(json.dumps(param))
 61 | 
 62 | 
 63 | def compute_error():
 64 |     import numpy as np
 65 | 
 66 |     param = json.loads(open('model_save/baseline1.txt','r').read())
 67 |     gmm_temporal_weights = param['gmm_temporal_weights']
 68 |     gmm_temporal_means = param['gmm_temporal_means']
 69 |     gmm_temporal_covars = param['gmm_temporal_covars']
 70 |     gmm_spatial = param['gmm_spatial']
 71 |     
 72 |     # 时间分布
 73 |     matrix1 = [[0 for j in xrange(24*6)] for i in xrange(24*6)]
 74 |     for line in fileinput.input("../data/stationary.txt"):
 75 |         st, ft = int(line.strip().split(" ")[3]), int(line.strip().split(" ")[4])
 76 |         matrix1[st][ft] += 1
 77 |     fileinput.close()
 78 |     matrix2 = [[0 for j in xrange(24*6)] for i in xrange(24*6)]
 79 |     for k in xrange(K):
 80 |         for st in xrange(24*6):
 81 |             for ft in xrange(24*6):
 82 |                 matrix2[st][ft] += 1.*gmm_temporal_weights[k]*gauss([st,ft],gmm_temporal_means[k],gmm_temporal_covars[k])
 83 |     matrix1 = 1.*np.array(matrix1)/np.array(matrix1).sum()
 84 |     matrix2 = 1.*np.array(matrix2)/np.array(matrix2).sum()
 85 |     print "Temporal reconstruction accuracy:", 1-(abs(matrix1-matrix2)[40:60,100:140].sum()+abs(matrix1-matrix2)[100:140,40:60].sum())
 86 | 
 87 |     # 空间分布
 88 |     matrix1 = [[0 for j in xrange(150)] for i in xrange(225)]
 89 |     for line in fileinput.input("../data/stationary.txt"):
 90 |         gx, gy = int(line.strip().split(" ")[5]), int(line.strip().split(" ")[6])
 91 |         matrix1[gx][gy] += 1
 92 |     fileinput.close()
 93 |     matrix2 = [[0 for j in xrange(150)] for i in xrange(225)]
 94 |     for k in xrange(K):
 95 |         for r in xrange(R):
 96 |             for gx in xrange(225):
 97 |                 for gy in xrange(150):
 98 |                     matrix2[gx][gy] += 1.*gmm_temporal_weights[k]*gmm_spatial[k]['weights'][r]*gauss([gx,gy],gmm_spatial[k]['means'][r],gmm_spatial[k]['covars'][r])
 99 |     matrix1 = 1.*np.array(matrix1)/np.array(matrix1).sum()
100 |     matrix2 = 1.*np.array(matrix2)/np.array(matrix2).sum()
101 |     print "Spatial reconstruction accuracy:", 1-abs(matrix1-matrix2)[50:90,50:90].sum()
102 | 
103 | 
104 | if __name__ == "__main__":
105 |     # run_baseline1()
106 |     compute_error()
107 | 


--------------------------------------------------------------------------------
/code/stationary_detection.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*- 
  2 | 
  3 | import gzip
  4 | import fileinput
  5 | import numpy as np
  6 | from pylab import *
  7 | 
  8 | # 时间粒度为10分钟
  9 | # 空间粒度为200米
 10 | # 20-24为工作日
 11 | 
 12 | def euclidean(p1, p2):
 13 | 	return 200*((p1[0]-p2[0])**2+(p1[1]-p2[1])**2)**0.5
 14 | 
 15 | # 轨迹连接
 16 | def trajectory_concat():
 17 | 	users = {}
 18 | 	for day, filename in enumerate(['0820','0821','0822','0823','0824']):
 19 | 		print filename
 20 | 		for line in gzip.open("../data/3G/{0}.txt.gz".format(filename)):
 21 | 			uid, slices = line.strip().split("\t")
 22 | 			slices = ["{0}:{1}".format(int(sl.split(":")[0])+day*6*24,sl) 
 23 | 						for sl in slices.split("|")]
 24 | 			users[uid] = users.get(uid,[])
 25 | 			users[uid].extend(slices)
 26 | 	with open("../data/trace.txt","w") as f:
 27 | 		for uid, slices in users.iteritems():
 28 | 			f.write("{0}\t{1}\n".format(uid,"|".join(slices)))
 29 | 
 30 | # 阈值距离为1000米，时间为1小时，仅筛选出工作日
 31 | def stationary_detection():
 32 | 	min_distance, min_duration, min_session = 1000, 1*60/10, 10
 33 | 	with open("../data/stationary.txt", "w") as f:
 34 | 		# line_num = 0
 35 | 		for line in gzip.open("../data/trace.txt.gz"):
 36 | 			# line_num += 1
 37 | 			# print line_num
 38 | 			uid = line.strip().split("\t")[0]
 39 | 			session_list, session_current, slices = [], [], [(int(sl.split(":")[0]), \
 40 | 															  int(sl.split(":")[1]), \
 41 | 															  sum([int(p.split(",")[0]) for p in sl.split(":")[2].split("-")])/len(sl.split(":")[2].split("-")), \
 42 | 															  sum([int(p.split(",")[1]) for p in sl.split(":")[2].split("-")])/len(sl.split(":")[2].split("-"))) \
 43 | 																for sl in line.strip().split("\t")[1].split("|")]
 44 | 			for sl in slices:
 45 | 				if len(session_current) == 0:
 46 | 					session_current = [sl]
 47 | 				else:
 48 | 					if euclidean(sl[2:],session_current[-1][2:]) >= min_distance:
 49 | 						if session_current[-1][0]-session_current[0][0] >= min_duration:
 50 | 							session_list.append(session_current)
 51 | 						session_current = [sl]
 52 | 					else:
 53 | 						session_current.append(sl)
 54 | 			if session_current[-1][0]-session_current[0][0] >= min_duration:
 55 | 				session_list.append(session_current)
 56 | 			if len(session_list) >= min_session:
 57 | 				for i in range(1,len(session_list)-1):
 58 | 					if len(session_list[i]) >= 2 and 1*24*60/10 < session_list[i][-1][0] and session_list[i][0][0] < 6*24*60/10:
 59 | 						f.write(uid+" "+str(round(float(session_list[i][0][0]%(24*60/10))/(60/10),2))+" "+\
 60 | 										str(round(float(session_list[i][-1][0]%(24*60/10))/(60/10),2))+" "+\
 61 | 										str(session_list[i][0][0]%(24*60/10))+" "+\
 62 | 										str(session_list[i][-1][0]%(24*60/10))+" "+\
 63 | 										str(sum([session[2] for session in session_list[i]])/len(session_list[i]))+" "+\
 64 | 										str(sum([session[3] for session in session_list[i]])/len(session_list[i]))+"\n") 
 65 | 
 66 | def stationary_statistic():
 67 | 	matrix = [[0 for j in xrange(24*6)] for i in xrange(24*6)]
 68 | 	for line in fileinput.input("../data/stationary.txt"):
 69 | 		matrix[int(line.strip().split(" ")[3])][int(line.strip().split(" ")[4])] += 1
 70 | 	fileinput.close()
 71 | 	(X, Y), C = meshgrid(np.arange(24*6), np.arange(24*6)), np.array(matrix)
 72 | 	# 时间分布
 73 | 	subplot(1,1,1)
 74 | 	cset = pcolormesh(X, Y, C.T, cmap=cm.get_cmap("OrRd"))
 75 | 	plt.axis([0, 24*6-1, 0, 24*6-1])
 76 | 	colorbar(cset)
 77 | 	plt.xlabel('Session entering time slot /10min')
 78 | 	plt.ylabel('Session leaving time slot /10min')
 79 | 	# show()
 80 | 	for postfix in ('eps','png'):
 81 | 		savefig('../figure/{0}/01.{0}'.format(postfix))
 82 | 
 83 | 	matrix1, matrix2 = [[0 for j in xrange(150)] for i in xrange(225)], [[0 for j in xrange(150)] for i in xrange(225)]
 84 | 	for line in fileinput.input("../data/stationary.txt"):
 85 | 		ts, tf, gx, gy = int(line.strip().split(" ")[3]), int(line.strip().split(" ")[4]), int(line.strip().split(" ")[5]), int(line.strip().split(" ")[6])
 86 | 		d1, d2 = ((ts-50)**2+(tf-110)**2)**(1.0/2), ((ts-110)**2+(tf-50)**2)**(1.0/2)
 87 | 		if d1 <= d2:
 88 | 			matrix1[gx][gy] += 1
 89 | 		else:
 90 | 			matrix2[gx][gy] += 1
 91 | 	fileinput.close()
 92 | 	(X, Y), C1, C2 = meshgrid(np.arange(100), np.arange(100)), np.array(matrix1)[20:120,20:120], np.array(matrix2)[20:120,20:120]
 93 | 	# 空间分布
 94 | 	plt.figure(figsize=(12,5))
 95 | 	plt.subplots_adjust(left=0.05,right=1.00)
 96 | 	subplot(1,2,1)
 97 | 	cset1 = pcolormesh(X, Y, C1.T, cmap=cm.get_cmap("OrRd"))
 98 | 	plt.axis([0, 100-1, 0, 100-1])
 99 | 	colorbar(cset1)
100 | 	plt.xlabel('Longitude grid index /200m')
101 | 	plt.ylabel('Latitude grid index /200m')
102 | 	plt.title('Diurnal')
103 | 	subplot(1,2,2)
104 | 	cset2 = pcolormesh(X, Y, C2.T, cmap=cm.get_cmap("OrRd"))
105 | 	plt.axis([0, 100-1, 0, 100-1])
106 | 	colorbar(cset2)
107 | 	plt.title('Nocturnal')
108 | 	# show()
109 | 	for postfix in ('eps','png'):
110 | 		savefig('../figure/{0}/02.{0}'.format(postfix))
111 | 
112 | 
113 | if __name__ == "__main__":
114 | 	# trajectory_concat()
115 | 	# stationary_detection()
116 | 	stationary_statistic()
117 | 
118 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 群体驻留时空模式挖掘
 2 | =============
 3 | 
 4 | 本项目所采用的数据采集于杭州移动3G网络。针对城市群体人口轨迹的挖掘，已有工作常聚焦于研究移动行为，例如通勤行为等，固然，城市通勤行为能够直接反应出城市动态特性，与城市区域功能、人类行为模式等密切相关，但另一方面，人口驻留行为，例如不同区域在单日或周内不同时间段内驻留的人口密度、停留时长、驻留时间段的起止时刻等都是研究城市区域功能以及人类行为模式的良好特征。城市大尺度下驻留模式研究一方面受限于数据源的限制，为了获取用户的驻留行为，一般需要获取用户完整的移动轨迹，常用的调查问卷、手机通话、车载GPS以及社交网络签到数据不能很好的满足需求，同时，庞大的城市居民数量对数据的存储和分析也提出了更好的要求。本项目针对群体驻留时空模式进行挖掘，首先从群体轨迹中提取出驻留片段，之后基于层级贝叶斯模型使用无监督聚类的方法自动发现城市人口的驻留模式，层级贝叶斯模型相对于已有的时空聚类方法，包括主成分分析、隐含主题模型等均具有一定优势。
 5 | 
 6 | 数据采集与数据集
 7 | ----
 8 | 
 9 | 本项目所采用的数据集包含一周内连续五天工作日的移动用户上网基站定位数据，采用用户识别码（IMSI）来区分不同用户，并将基站的位置区编码（LAC）联合小区标识（CI）同基站位置数据进行关联转换成为经纬度坐标，结合HTTP请求对应的时间戳即得到用户轨迹。经统计，数据集基本情况如表所示。
10 | 
11 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/t1.png)
12 | 
13 | 如图是群体驻留时空模式挖掘系统框架。主要包含数据准备、数据挖掘与分析等过程，数据准备包括移动网络日志清洗、基站经纬度映射、轨迹点提取、移动轨迹提取，数据挖掘与分析包括从移动轨迹中抽取驻留片段，并对驻留片段起止时刻进行估计，以及对驻留片段进行时空聚类，基于对时空模式的分析，能够进一步对功能区域进行推测与识别，或对用户轨迹进行语义标注，理解用户的出行目的。
14 | 
15 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/system.png)
16 | 
17 | 驻留轨迹段提取
18 | ----
19 | 
20 | 驻留轨迹段的提取分为两个步骤，首先抽取出用户停留的时间和地点（简称为驻留轨迹段提取），之后对用户的到达和离开时刻进行估计。驻留轨迹段的提取方法如算法所示，输入用户轨迹点，输出驻留片段，基于预设的时间间隔以及空间间隔阈值timeThres和distThres，按照时间序列依次计算出每个轨迹点与初始轨迹点之间的时间间隔和距离，直到首次距离超过空间间隔阈值distThres，此时如果时间间隔超过时间间隔阈值timeThres，则检测到一次驻留行为。
21 | 
22 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/alg1.png)
23 | 
24 | 在检测出驻留行为后，需要对驻留的起止时刻te和tl进行估计，减小时空模式挖掘结果的误差。本项目基于用户固有模式、区域固有模式、全局模式对起止时刻进行估计，一般而言，用户根据自身生活规律，例如固定的上、下班时间，位置迁移具有一定模式，单天观测时，用户轨迹点具有稀疏性，但针对不同天，随着用户上网需求变化，通常可以检测到较为连续的时间段内的用户位置的变化情况，因此，基于多天数据有助于对用户位置变化的时刻进行估计。同样，不同区域也具有其固有模式，例如同一公司员工上、下班时刻趋于一致。在更大的尺度上，即城市整体也具有较为固定的通勤模式，例如早、晚高峰。因此，本项目基于三种模式对用户驻留轨迹段的起止时刻采用加权最小二乘法进行估计。
25 | 
26 | 如图是用户原始驻留轨迹段的示例与统计。如左图所示，是以10分钟为时间粒度的部分用户的轨迹段抽取结果，橙色和蓝色部分分别对应白天和夜间，由此可见，大多数用户都存在其固有模式，不同用户的状态转移也存在一定的全局相似性。如右图所示，是抽取出的驻留轨迹段之间时间间隔对应的累计概率分布函数，由此可见，大多数驻留轨迹段之间时间间隔较长，因此需要进行起止时刻估计。
27 | 
28 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/01.png)
29 | 
30 | 为了验证方法的有效性，共对比了两种基本方法，第一种是边值估计法，在提取出驻留轨迹后不做任何处理，第二种是中值估计，即假设用户离开地点A和到达地点B的时刻均为(t1+t2)/2，其中，t1、t2分别是最后一次观测到用户在地点A以及首次观测到用户在地点B的时刻。用于实验验证的数据来源于状态转移过程观测时间间隔较短，能够进行准确推测的部分数据。实验结果显示，本项目所提出的基于联合概率的估计方法具有最小的估计误差。
31 | 
32 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/t2.png)
33 | 
34 | 驻留行为时空聚类
35 | ----
36 | 
37 | 基于所提取出的群体驻留轨迹段，可以采用基于无监督的时空聚类的方法，自动发现城市人口的驻留行为模式。首先介绍本项目所提出的基于层级贝叶斯时空联合聚类模型的生成过程。基于之前提取出的群体驻留轨迹段，假设数据集对应的群体数量为N，每个用户包含的驻留轨迹片段数量为P，每个驻留轨迹段对应的隐含状态为s，不同的隐含状态可能携带了不同的语义信息，例如“工作”、“居家”、“娱乐”、“购物”等，每个驻留轨迹段的观测变量包括，驻留轨迹段的起止时刻te、tl以及空间位置l，由于空间位置与城市功能区域属性息息相关，因此临近的空间位置可能表达了同样的区域功能语义，反映了相似的用户出行的目的，因此可以假设实际观测到的空间位置l由该功能区域r的分布采样而来。
38 | 
39 | 如图是本节模型的图表示（Graphical representation）。如图所示，深色节点表示观测变量或先验变量，浅色节点表示隐含变量。观测变量包括驻留轨迹段的起止时刻te、tl以及空间位置l，隐含变量包括轨迹段所对应的隐含状态s，空间位置l对应的功能区域r，并假设驻留轨迹段的起止时刻te、tl服从联合高斯分布，同时假设空间位置l服从混合高斯分布，混合高斯分布由多个高斯分布叠加而成，分量的选择服从多项分布，功能区域r可以视作分量的选择。
40 | 
41 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/m1.png)
42 | 
43 | 模型的生成过程如算法所示。
44 | 
45 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/alg2.png)
46 | 
47 | 在模型推理过程中采用折叠吉布斯采样（Collapsed Gibbs sampling）基于迭代过程对模型参数进行优化。吉布斯采样是一种马尔可夫链蒙特卡罗（Markov chain Monte Carlo/MCMC）算法，在基于指定多元概率分布直接进行采样较为困难时，可以通过采用吉布斯采样获得观测序列进行近似，经常用于贝叶斯推断（Bayesian inference）。折叠吉布斯采样在吉布斯采样的基础上，通过积分避开了实际待估计的参数，转而对隐含变量和观测变量进行采样，并通过积分在统计观测变量的取值频次后对实际待估计的参数进行估计。针对本项目模型中所含有的联合高斯分布和混合高斯分布，采用了折叠吉布斯采样和最大期望算法（Expectation Maximization Algorithm/EM）相互结合的方法对模型进行推理。最大期望算法是一种概率模型，能够通过最大似然估计或者最大后验概率估计对模型的参数进行优化，其优势在于能够对无法观测的隐藏变量（Latent variable）进行建模。最大期望算法基于迭代过程进行计算，每轮迭代主要包含两个步骤，即E步和M步，交替进行计算。在E步中，基于对隐藏变量上一轮迭代得到的估计值，计算出最大似然估计值。在M步中，通过求导数或偏导数的方法，求得最大化似然估计值时模型的参数。
48 | 
49 | 模型的推断过程如算法所示。
50 | 
51 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/alg3.png)
52 | 
53 | 如图是针对提取出的驻留轨迹段（已经过起止时刻估计）的起止时刻的联合分布的统计结果，概率分布主要位于三个区域，即图的对角线以及关于对角线对称的两个区域，靠近对角线的左上角区域对应了较短的状态转移时间，囊括了各种情况下的短暂停留行为，对角线对称的两个区域所对应的起止时间趋近于上午9点至下午6点以及晚上7点至次日8点，分别对应了普遍的上班状态和居家状态，并且居家时间的开始时间更为分散，体现了不同群体行为的差异性。
54 | 
55 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/02.png)
56 | 
57 | 如图针对上图对角线对称的两个区域，统计出了对应的空间分布。如图所示，上班状态（Diurnal）和居家状态（Nocturnal）对应的驻留轨迹段的空间分布差异明显，上班状态对应的空间区域更倾向于城市的中心区域，而居家状态对应的空间区域在城市范围内较为分散。
58 | 
59 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/03.png)
60 | 
61 | 为了说明层级贝叶斯时空联合聚类模型的优越性，共对比了另外两种模型，其中一种是层级贝叶斯时空次序聚类模型，另一种是高斯混合时空次序聚类模型。高斯混合时空次序聚类模型首先基于驻留轨迹段的起止时刻联合分布进行聚类，之后针对时间维度聚类结果得到的每个类，再基于其空间分布进行聚类。实验结果显示，层级贝叶斯时空联合聚类模型优于用于对比的另外两种模型，层级贝叶斯时空联合聚类在使用更少参数的同时达到了更小的负对数似然，同时时间和空间分布的重构准确率也均较高，这是因为该模型对驻留轨迹段的时间和空间分布同时进行优化，而另外两种模型均按照时间、空间的次序进行优化，在时间维度上虽然达到了最优，但时间聚类的结果限制了在空间维度上所能进行优化的极限。
62 | 
63 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/t3.png)
64 | 
65 | 如图是基于层级贝叶斯时空联合聚类模型的聚类结果。其中，第一行对应的是模式在时间上的概率分布，第二行对应的是模式在空间上的概率分布，一共有五列，每一列都对应了一种不同的行为模式，时间模式子图的横、纵坐标分别为驻留轨迹段的起止时刻，空间模式子图的横、纵坐标分别为根据经纬度坐标转换得到的地理空间网格坐标。由图可见，城市人口行为模式主要可以分为五种模式，分别对应了不同的语义。由图从左至右的五种模式中，模式1的起止时刻分布在时间模式子图的对角线附近，空间位置分布在城市区域中心，可能对应了各种情况下的短暂停留；模式2及模式3的起止时刻分布在时间模式子图对角线的对称区域，空间分布呈现出互补的特性，一者分布在城市中心区域，一者在城市区域中分布较为均匀，分别可推测带有“白天工作”以及“夜间在家”的语义；模式3的起止时刻分布在时间模式子图对角线的右上端处，对应时间为晚上7点至11点，空间分布偏向于城市中心特定区域，可推测带有“夜间休闲娱乐”的语义；模式5对应的行为发生概率最少，起止时刻分布在时间模式子图的对角线的右上端处，但与模式4不同，模式5的起止时刻近似相同，空间分布较为弥散，可以推测为用户位置在天内固定，这可能是由于用户工作地点与居住地点非常临近。如图是在预设挖掘出五种模式的基础上计算得到的结果，如果进一步增加待挖掘出的模式的数量，还可以进一步获得更加细粒度的行为模式。
66 | 
67 | ![Alt Text](https://raw.githubusercontent.com/qiangsiwei/hangzhou_SSTD/master/figure/04.png)
68 | 


--------------------------------------------------------------------------------
/code/examples_bayes/bayes_model.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*- 
  2 | 
  3 | import fileinput
  4 | import pymc as pm
  5 | import numpy as np
  6 | 
  7 | data_tp, data_sp = [], []
  8 | for line in fileinput.input("../../data/stationary.txt"):
  9 | 	part = line.strip().split("\t")
 10 | 	uid, items = part[0], part[1:]
 11 | 	if uid == "460029901722027":
 12 | 		for item in items:
 13 | 			tm, poi = [int(i) for i in item.split(" ")[0:2]], [int(i) for i in item.split(" ")[4].split(",")]
 14 | 			data_tp.append(tm)
 15 | 			data_sp.append(poi)
 16 | fileinput.close()
 17 | data_tp, data_sp = np.array(data_tp), np.array(data_sp)
 18 | print data_tp
 19 | print data_sp
 20 | 
 21 | prior = pm.Dirichlet('prior', np.array([50.0,50.0]))
 22 | state = pm.Container([pm.Categorical('state_%i' % i, p=prior) for i in range(len(data_tp))])
 23 | stime = pm.Container([pm.DiscreteUniform('stime_%i' % i, lower=0, upper=23) for i in range(2)])
 24 | ftime = pm.Container([pm.DiscreteUniform('ftime_%i' % i, lower=0, upper=23) for i in range(2)])
 25 | @pm.deterministic(plot=False)
 26 | def mu_s(state=state, stime=stime):
 27 | 	return np.array([stime[0] if state[i] == 0 else stime[1] for i in xrange(len(data_tp))])
 28 | @pm.deterministic(plot=False)
 29 | def mu_f(state=state, stime=ftime):
 30 | 	return np.array([ftime[0] if state[i] == 0 else ftime[1] for i in xrange(len(data_tp))])
 31 | obs_s = pm.Normal('obs_s', mu=mu_s, tau=0.1, value=data_tp[:,0], observed=True)
 32 | obs_f = pm.Normal('obs_f', mu=mu_f, tau=0.1, value=data_tp[:,1], observed=True)
 33 | model = pm.Model([prior, state, stime, ftime, obs_s, obs_f])
 34 | mcmc = pm.MCMC(model)
 35 | mcmc.sample(100)
 36 | print state.value
 37 | print stime[0].value, ftime[0].value
 38 | print stime[1].value, ftime[1].value
 39 | 
 40 | # prior = pm.Dirichlet('prior', np.array([50.0,50.0]))
 41 | # state = pm.Container([pm.Categorical('state_%i' % i, p=prior) for i in range(len(data_sp))])
 42 | # poi_1 = pm.Container([pm.DiscreteUniform('poi_1_%i' % i, lower=0, upper=100) for i in range(2)])
 43 | # poi_2 = pm.Container([pm.DiscreteUniform('poi_2_%i' % i, lower=0, upper=100) for i in range(2)])
 44 | # poi_3 = pm.Container([pm.DiscreteUniform('poi_3_%i' % i, lower=0, upper=100) for i in range(2)])
 45 | # poi_4 = pm.Container([pm.DiscreteUniform('poi_4_%i' % i, lower=0, upper=100) for i in range(2)])
 46 | # poi_5 = pm.Container([pm.DiscreteUniform('poi_5_%i' % i, lower=0, upper=100) for i in range(2)])
 47 | # @pm.deterministic(plot=False)
 48 | # def mu_1(state=state, poi_1=poi_1):
 49 | # 	return np.array([poi_1[0] if state[i] == 0 else poi_1[1] for i in xrange(len(data_sp))])
 50 | # @pm.deterministic(plot=False)
 51 | # def mu_2(state=state, poi_2=poi_2):
 52 | # 	return np.array([poi_2[0] if state[i] == 0 else poi_2[1] for i in xrange(len(data_sp))])
 53 | # @pm.deterministic(plot=False)
 54 | # def mu_3(state=state, poi_3=poi_3):
 55 | # 	return np.array([poi_3[0] if state[i] == 0 else poi_3[1] for i in xrange(len(data_sp))])
 56 | # @pm.deterministic(plot=False)
 57 | # def mu_4(state=state, poi_4=poi_4):
 58 | # 	return np.array([poi_4[0] if state[i] == 0 else poi_4[1] for i in xrange(len(data_sp))])
 59 | # @pm.deterministic(plot=False)
 60 | # def mu_5(state=state, poi_5=poi_5):
 61 | # 	return np.array([poi_5[0] if state[i] == 0 else poi_5[1] for i in xrange(len(data_sp))])
 62 | # obs_1 = pm.Normal('obs_1', mu=mu_1, tau=0.1, value=data_sp[:,0], observed=True)
 63 | # obs_2 = pm.Normal('obs_2', mu=mu_2, tau=0.1, value=data_sp[:,1], observed=True)
 64 | # obs_3 = pm.Normal('obs_3', mu=mu_3, tau=0.1, value=data_sp[:,2], observed=True)
 65 | # obs_4 = pm.Normal('obs_4', mu=mu_4, tau=0.1, value=data_sp[:,3], observed=True)
 66 | # obs_5 = pm.Normal('obs_5', mu=mu_5, tau=0.1, value=data_sp[:,4], observed=True)
 67 | # model = pm.Model([prior, state, poi_1, poi_2, poi_3, poi_4, poi_5, obs_1, obs_2, obs_3, obs_4, obs_5])
 68 | # mcmc = pm.MCMC(model)
 69 | # mcmc.sample(100)
 70 | # print state.value
 71 | # print poi_1[0].value, poi_2[0].value, poi_3[0].value, poi_4[0].value, poi_5[0].value
 72 | # print poi_1[1].value, poi_2[1].value, poi_3[1].value, poi_4[1].value, poi_5[1].value
 73 | 
 74 | prior = pm.Dirichlet('prior', np.array([50.0,50.0]))
 75 | state = pm.Container([pm.Categorical('state_%i' % i, p=prior) for i in range(len(data_tp))])
 76 | stime = pm.Container([pm.DiscreteUniform('stime_%i' % i, lower=0, upper=23) for i in range(2)])
 77 | ftime = pm.Container([pm.DiscreteUniform('ftime_%i' % i, lower=0, upper=23) for i in range(2)])
 78 | poi_1 = pm.Container([pm.DiscreteUniform('poi_1_%i' % i, lower=0, upper=100) for i in range(2)])
 79 | poi_2 = pm.Container([pm.DiscreteUniform('poi_2_%i' % i, lower=0, upper=100) for i in range(2)])
 80 | poi_3 = pm.Container([pm.DiscreteUniform('poi_3_%i' % i, lower=0, upper=100) for i in range(2)])
 81 | poi_4 = pm.Container([pm.DiscreteUniform('poi_4_%i' % i, lower=0, upper=100) for i in range(2)])
 82 | poi_5 = pm.Container([pm.DiscreteUniform('poi_5_%i' % i, lower=0, upper=100) for i in range(2)])
 83 | @pm.deterministic(plot=False)
 84 | def mu_s(state=state, stime=stime):
 85 | 	return np.array([stime[0] if state[i] == 0 else stime[1] for i in xrange(len(data_tp))])
 86 | @pm.deterministic(plot=False)
 87 | def mu_f(state=state, stime=ftime):
 88 | 	return np.array([ftime[0] if state[i] == 0 else ftime[1] for i in xrange(len(data_tp))])
 89 | @pm.deterministic(plot=False)
 90 | def mu_1(state=state, poi_1=poi_1):
 91 | 	return np.array([poi_1[0] if state[i] == 0 else poi_1[1] for i in xrange(len(data_sp))])
 92 | @pm.deterministic(plot=False)
 93 | def mu_2(state=state, poi_2=poi_2):
 94 | 	return np.array([poi_2[0] if state[i] == 0 else poi_2[1] for i in xrange(len(data_sp))])
 95 | @pm.deterministic(plot=False)
 96 | def mu_3(state=state, poi_3=poi_3):
 97 | 	return np.array([poi_3[0] if state[i] == 0 else poi_3[1] for i in xrange(len(data_sp))])
 98 | @pm.deterministic(plot=False)
 99 | def mu_4(state=state, poi_4=poi_4):
100 | 	return np.array([poi_4[0] if state[i] == 0 else poi_4[1] for i in xrange(len(data_sp))])
101 | @pm.deterministic(plot=False)
102 | def mu_5(state=state, poi_5=poi_5):
103 | 	return np.array([poi_5[0] if state[i] == 0 else poi_5[1] for i in xrange(len(data_sp))])
104 | obs_s = pm.Normal('obs_s', mu=mu_s, tau=0.1, value=data_tp[:,0], observed=True)
105 | obs_f = pm.Normal('obs_f', mu=mu_f, tau=0.1, value=data_tp[:,1], observed=True)
106 | obs_1 = pm.Normal('obs_1', mu=mu_1, tau=2, value=data_sp[:,0], observed=True)
107 | obs_2 = pm.Normal('obs_2', mu=mu_2, tau=2, value=data_sp[:,1], observed=True)
108 | obs_3 = pm.Normal('obs_3', mu=mu_3, tau=2, value=data_sp[:,2], observed=True)
109 | obs_4 = pm.Normal('obs_4', mu=mu_4, tau=2, value=data_sp[:,3], observed=True)
110 | obs_5 = pm.Normal('obs_5', mu=mu_5, tau=1, value=data_sp[:,4], observed=True)
111 | model = pm.Model([prior, state, stime, ftime, poi_1, poi_2, poi_3, poi_4, poi_5, obs_s, obs_f, obs_1, obs_2, obs_3, obs_4, obs_5])
112 | mcmc = pm.MCMC(model)
113 | mcmc.sample(100)
114 | print "state:", state.value
115 | print "stime_0:", stime[0].value, ftime[0].value
116 | print "stime_1:", stime[1].value, ftime[1].value
117 | print "poi_0:", poi_1[0].value, poi_2[0].value, poi_3[0].value, poi_4[0].value, poi_5[0].value
118 | print "poi_1:", poi_1[1].value, poi_2[1].value, poi_3[1].value, poi_4[1].value, poi_5[1].value
119 | 


--------------------------------------------------------------------------------
/code/model_baseline2.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*- 
  2 | 
  3 | import glob
  4 | import math
  5 | import json
  6 | import random
  7 | import fileinput
  8 | 
  9 | # 时间粒度为10分钟
 10 | # 空间粒度为200米
 11 | # 20-24为工作日
 12 | 
 13 | def euclidean(p1, p2):
 14 |     return ((p1[0]-p2[0])**2+(p1[1]-p2[1])**2)**0.5
 15 | 
 16 | def gauss(y, mu, sigma):
 17 |     # print "gauss computation:", y, mu, sigma
 18 |     return 1./math.sqrt((sigma[0][0]*sigma[1][1]-sigma[0][1]*sigma[1][0]))*math.exp(-0.5/(sigma[0][0]*sigma[1][1]-sigma[0][1]*sigma[1][0])*(sigma[0][0]*(y[0]-mu[0])**2-(y[0]-mu[0])*(y[1]-mu[1])*(sigma[0][1]+sigma[1][0])+sigma[1][1]*(y[1]-mu[1])**2))
 19 | # print gauss([1,1],[0,0],[[1,0],[0,1]])
 20 | 
 21 | K, R, alpha, beta, iter_num, data = 5, 20, 0.2, 0.2, 20, []
 22 | 
 23 | def run_baseline2():
 24 |     # 数据准备  
 25 |     mu_t = [[48,108], [108,48], [60,128], [128,60], [72,84]]
 26 |     sigma_t = [[[36,0],[36,0]], [[36,0],[36,0]], [[36,0],[36,0]], [[36,0],[36,0]], [[36,0],[36,0]]]
 27 |     assert K == len(mu_t) and K == len(sigma_t)
 28 |     print "Total Cluster: {0}".format(K)
 29 | 
 30 |     for line in fileinput.input("../data/stationary.txt"):
 31 |         st, ft, gx, gy = int(line.strip().split(" ")[3]), int(line.strip().split(" ")[4]), int(line.strip().split(" ")[5]), int(line.strip().split(" ")[6])
 32 |         ds = [((st-mu_t[k][0])**2+(ft-mu_t[k][1])**2)**0.5 for k in xrange(K)]
 33 |         data.append([st,ft,gx,gy,ds.index(min(ds)),-1])
 34 |     fileinput.close()
 35 | 
 36 |     # from sklearn import mixture
 37 |     # est = mixture.GMM(n_components=R, covariacne_type="full")
 38 |     # est.fit([session[2:4] for session in data])
 39 |     # print [[int(i) for i in list(means)] for means in est.means_]
 40 |     rs = [[165, 87], [78, 68], [77, 59], [82, 98], [46, 94], [86, 68], [37, 65], [69, 57], [77, 78], [92, 25], [44, 14], [71, 84], [66, 79], [61, 70], [71, 28], [14, 128], [100, 75], [75, 63], [85, 34], [54, 76]]
 41 |     assert R == len(rs)
 42 |     print "Total Region: {0}".format(R)
 43 | 
 44 |     # 初值选取
 45 |     for sl in data:
 46 |         rd = [euclidean(sl[2:4], rs[r]) for r in xrange(R)]
 47 |         sl[-1] = rd.index(min(rd))
 48 | 
 49 |     # 初始化
 50 |     L = len(data)
 51 |     len_k = [float(len(filter(lambda x:x[4]==k, data))) for k in xrange(K)]
 52 |     len_r = [float(len(filter(lambda x:x[5]==r, data))) for r in xrange(R)]
 53 |     len_k_r = [[float(len(filter(lambda x:x[4]==k and x[5]==r, data))) for r in xrange(R)] for k in xrange(K)]
 54 |     
 55 |     mu1_t = [[float(sum(map(lambda x:x[0],filter(lambda x:x[4]==k, data)))), float(sum(map(lambda x:x[1],filter(lambda x:x[4]==k, data))))] for k in xrange(K)]
 56 |     mu2_t = [[float(sum(map(lambda x:x[0]**2,filter(lambda x:x[4]==k, data)))), float(sum(map(lambda x:x[1]**2,filter(lambda x:x[4]==k, data)))), float(sum(map(lambda x:x[0]*x[1],filter(lambda x:x[4]==k, data))))] for k in xrange(K)]
 57 |     mu_t = [[mu1_t[k][0]/len_k[k], mu1_t[k][1]/len_k[k]] for k in xrange(K)]
 58 |     sigma_t = [[[mu2_t[k][0]/len_k[k]-mu_t[k][0]**2,(mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1]],[(mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1],mu2_t[k][1]/len_k[k]-mu_t[k][1]**2]] for k in xrange(K)]
 59 |     
 60 |     mu1_r = [[float(sum(map(lambda x:x[2],filter(lambda x:x[5]==r, data)))), float(sum(map(lambda x:x[3],filter(lambda x:x[5]==r, data))))] for r in xrange(R)]
 61 |     mu2_r = [[float(sum(map(lambda x:x[2]**2,filter(lambda x:x[5]==r, data)))), float(sum(map(lambda x:x[3]**2,filter(lambda x:x[5]==r, data)))), float(sum(map(lambda x:x[2]*x[3],filter(lambda x:x[5]==r, data))))] for r in xrange(R)]
 62 |     mu_r = [[mu1_r[r][0]/len_r[r], mu1_r[r][1]/len_r[r]] for r in xrange(R)]
 63 |     sigma_r = [[[mu2_r[r][0]/len_r[r]-mu_r[r][0]**2,(mu2_r[r][2]-mu_r[r][0]*mu1_r[r][1]-mu_r[r][1]*mu1_r[r][0])/len_r[r]+mu_r[r][0]*mu_r[r][1]],[(mu2_r[r][2]-mu_r[r][0]*mu1_r[r][1]-mu_r[r][1]*mu1_r[r][0])/len_r[r]+mu_r[r][0]*mu_r[r][1],mu2_r[r][1]/len_r[r]-mu_r[r][1]**2]] for r in xrange(R)]
 64 | 
 65 |     # 迭代计算
 66 |     for iter_curr in xrange(iter_num):
 67 |         likelihood = 0
 68 |         for i in xrange(L):
 69 |             item, co = data[i], data[i][4]
 70 |             # sample R
 71 |             prob = [1.*len_k_r[co][r]/len_k[co]*gauss(item[2:4],mu_r[r],sigma_r[r]) for r in xrange(R)]
 72 |             ro, rn = item[5], prob.index(max(prob))
 73 |             if rn != ro:
 74 |                 data[i][5] = rn
 75 |                 len_r[ro] -= 1; len_r[rn] += 1
 76 |                 len_k_r[co][ro] -= 1; len_k_r[co][rn] += 1
 77 |                 mu1_r[ro][0] -= item[2]; mu1_r[ro][1] -= item[3]
 78 |                 mu1_r[rn][0] += item[2]; mu1_r[rn][1] += item[3]
 79 |                 mu2_r[ro][0] -= item[2]**2; mu2_r[ro][1] -= item[3]**2; mu2_r[ro][2] -= item[2]*item[3]
 80 |                 mu2_r[rn][0] += item[2]**2; mu2_r[rn][1] += item[3]**2; mu2_r[rn][2] += item[2]*item[3]
 81 |                 mu_r = [[mu1_r[r][0]/len_r[r], mu1_r[r][1]/len_r[r]] for r in xrange(R)]
 82 |                 sigma_r = [[[mu2_r[r][0]/len_r[r]-mu_r[r][0]**2,(mu2_r[r][2]-mu_r[r][0]*mu1_r[r][1]-mu_r[r][1]*mu1_r[r][0])/len_r[r]+mu_r[r][0]*mu_r[r][1]],[(mu2_r[r][2]-mu_r[r][0]*mu1_r[r][1]-mu_r[r][1]*mu1_r[r][0])/len_r[r]+mu_r[r][0]*mu_r[r][1],mu2_r[r][1]/len_r[r]-mu_r[r][1]**2]] for r in xrange(R)]
 83 |             # sample K
 84 |             prob = [1.*len_k_r[k][rn]/L*gauss(item[:2],mu_t[k],sigma_t[k])*gauss(item[2:4],mu_r[rn],sigma_r[rn]) for k in xrange(K)]
 85 |             cn = prob.index(max(prob))
 86 |             if cn != co:
 87 |                 data[i][4] = cn
 88 |                 len_k[co] -= 1; len_k[cn] += 1
 89 |                 len_k_r[co][rn] -= 1; len_k_r[cn][rn] += 1
 90 |                 mu1_t[co][0] -= item[0]; mu1_t[co][1] -= item[1]
 91 |                 mu1_t[cn][0] += item[0]; mu1_t[cn][1] += item[1]
 92 |                 mu2_t[co][0] -= item[0]**2; mu2_t[co][1] -= item[1]**2; mu2_t[co][2] -= item[0]*item[1]
 93 |                 mu2_t[cn][0] += item[0]**2; mu2_t[cn][1] += item[1]**2; mu2_t[cn][2] += item[0]*item[1]
 94 |                 mu_t = [[mu1_t[k][0]/len_k[k], mu1_t[k][1]/len_k[k]] for k in xrange(K)]
 95 |                 sigma_t = [[[mu2_t[k][0]/len_k[k]-mu_t[k][0]**2,(mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1]],[(mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1],mu2_t[k][1]/len_k[k]-mu_t[k][1]**2]] for k in xrange(K)]
 96 |             
 97 |             prob_max = 1.*len_k[cn]/sum(len_k)*len_k_r[cn][rn]/sum(len_k_r[cn])*gauss(item[:2],mu_t[cn],sigma_t[cn])*gauss(item[2:4],mu_r[rn],sigma_r[rn])
 98 |             likelihood += -math.log10(prob_max)
 99 | 
100 |         print iter_curr, likelihood
101 | 
102 |     with open('model_save/baseline2.txt','w') as f:
103 |         f.write(json.dumps({"len_k":len_k,
104 |                             "len_k_r":len_k_r,
105 |                             "mu_t":mu_t,
106 |                             "sigma_t":sigma_t,
107 |                             "mu_r":mu_r,
108 |                             "sigma_r":sigma_r}))
109 | 
110 | def compute_error():
111 |     import numpy as np
112 | 
113 |     param = json.loads(open('model_save/baseline2.txt','r').read())
114 |     len_k, len_k_r = param['len_k'], param['len_k_r']
115 |     mu_t, sigma_t = param['mu_t'], param['sigma_t']
116 |     mu_r, sigma_r = param['mu_r'], param['sigma_r']
117 | 
118 |     # 时间分布
119 |     matrix1 = [[0 for j in xrange(24*6)] for i in xrange(24*6)]
120 |     for line in fileinput.input("../data/stationary.txt"):
121 |         st, ft = int(line.strip().split(" ")[3]), int(line.strip().split(" ")[4])
122 |         matrix1[st][ft] += 1
123 |     fileinput.close()
124 |     matrix2 = [[0 for j in xrange(24*6)] for i in xrange(24*6)]
125 |     for k in xrange(K):
126 |         for st in xrange(24*6):
127 |             for ft in xrange(24*6):
128 |                 matrix2[st][ft] += 1.*(len_k[k]/sum(len_k))*gauss([st,ft],mu_t[k],sigma_t[k])
129 |     matrix1 = 1.*np.array(matrix1)/np.array(matrix1).sum()
130 |     matrix2 = 1.*np.array(matrix2)/np.array(matrix2).sum()
131 |     print "Temporal reconstruction accuracy:", 1-(abs(matrix1-matrix2)[40:60,100:140].sum()+abs(matrix1-matrix2)[100:140,40:60].sum())
132 | 
133 |     # 空间分布
134 |     matrix1 = [[0 for j in xrange(150)] for i in xrange(225)]
135 |     for line in fileinput.input("../data/stationary.txt"):
136 |         gx, gy = int(line.strip().split(" ")[5]), int(line.strip().split(" ")[6])
137 |         matrix1[gx][gy] += 1
138 |     fileinput.close()
139 |     matrix2 = [[0 for j in xrange(150)] for i in xrange(225)]
140 |     for k in xrange(K):
141 |         for r in xrange(R):
142 |             for gx in xrange(225):
143 |                 for gy in xrange(150):
144 |                     matrix2[gx][gy] += 1.*(len_k[k]/sum(len_k))*(len_k_r[k][r]/sum(len_k_r[k]))*gauss([gx,gy],mu_r[r],sigma_r[r])
145 |     matrix1 = 1.*np.array(matrix1)/np.array(matrix1).sum()
146 |     matrix2 = 1.*np.array(matrix2)/np.array(matrix2).sum()
147 |     print "Spatial reconstruction accuracy:", 1-abs(matrix1-matrix2)[50:90,50:90].sum()
148 | 
149 | 
150 | if __name__ == "__main__":
151 |     # run_baseline2()
152 |     compute_error()
153 | 


--------------------------------------------------------------------------------
/code/stationary_segmentation.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*- 
  2 | 
  3 | import gzip
  4 | import fileinput
  5 | 
  6 | # 时间粒度为10分钟
  7 | # 空间粒度为200米
  8 | # 20-24为工作日
  9 | 
 10 | def euclidean(p1, p2):
 11 | 	return 200*((p1[0]-p2[0])**2+(p1[1]-p2[1])**2)**0.5
 12 | 
 13 | # 轨迹段分布
 14 | def plot_segmentation_distribution():
 15 | 	from pylab import *
 16 | 	from scipy import interpolate
 17 | 	from matplotlib.ticker import MultipleLocator, FormatStrFormatter
 18 | 
 19 | 	fig = plt.figure(figsize=(10,5))
 20 | 	fig.subplots_adjust(left=0.05,right=0.98)
 21 | 
 22 | 	users, delta = {}, 2
 23 | 	for line in fileinput.input("../data/stationary.txt"):
 24 | 		uid, _, _, st, ft, _, _ = line.strip().split(" ") # 时间粒度为十分钟
 25 | 		st, ft = int(st), int(ft)
 26 | 		users[uid] = users.get(uid,[])
 27 | 		day = users[uid][-1][3]/(24*6)+(users[uid][-1][1]>=st) if users[uid] else 0
 28 | 		if len(users[uid]) == 0 or (24*6)*day+st - users[uid][-1][3] >= delta*6:
 29 | 			users[uid].append([st, ft, (24*6)*day+st, (24*6)*(day+(st>ft))+ft, (ft>st)])
 30 | 	fileinput.close()
 31 | 
 32 | 	day_total, row_total = 6, 60
 33 | 	matrixs, row = [[0 for h in xrange(day_total*24*6)] for u in xrange(row_total)], -1
 34 | 	for uid, slices in users.iteritems():
 35 | 		if slices[-1][-2] <= (day_total-1)*24*6 or slices[-1][-2] >= day_total*24*6-1:
 36 | 			continue
 37 | 		row += 1
 38 | 		if row == row_total:
 39 | 			break
 40 | 		for sl in slices:
 41 | 			color = -1 if sl[-1] else 1
 42 | 			for h in xrange(sl[2],sl[3]+1):
 43 | 				matrixs[row][h] = color
 44 | 	
 45 | 	ax = fig.add_subplot(121)
 46 | 	(X, Y) = meshgrid(np.arange(day_total*24*6), np.arange(row_total))
 47 | 	C = np.array(matrixs)
 48 | 	plt.pcolormesh(X, Y, C, cmap='RdBu', vmin=-2, vmax=2)
 49 | 	plt.xlim(0,day_total*24*6-1)
 50 | 	plt.ylim(0,row_total-1)
 51 | 	xmajorLocator = MultipleLocator(24*6)
 52 | 	xmajorFormatter = FormatStrFormatter('%d')
 53 | 	ax.xaxis.set_major_locator(xmajorLocator)
 54 | 	ax.xaxis.set_major_formatter(xmajorFormatter)
 55 | 	plt.xlabel('Time slot /10min')
 56 | 	plt.ylabel('User')
 57 | 
 58 | 	users, delta = {}, 2
 59 | 	for line in fileinput.input("../data/stationary.txt"):
 60 | 		uid, _, _, st, ft, _, _ = line.strip().split(" ") # 时间粒度为十分钟
 61 | 		st, ft = int(st), int(ft)
 62 | 		users[uid] = users.get(uid,[])
 63 | 		day = users[uid][-1][3]/(24*6)+(users[uid][-1][1]>=st) if users[uid] else 0
 64 | 		users[uid].append([st, ft, (24*6)*day+st, (24*6)*(day+(st>ft))+ft, (ft>st)])
 65 | 	fileinput.close()
 66 | 
 67 | 	distribution = {}
 68 | 	for uid, slices in users.iteritems():
 69 | 		for i in xrange(1,len(slices)):
 70 | 			interval = (slices[i][2]-slices[i-1][3])/3
 71 | 			distribution[interval] = distribution.get(interval,0)+1
 72 | 	distribution = [distribution.get(t,0) for t in xrange(2*12)]
 73 | 	distribution = [1-1.*sum(distribution[t:])/sum(distribution) for t in xrange(2*12)]
 74 | 	ax1 = fig.add_subplot(122)
 75 | 	tck = interpolate.splrep(range(len(distribution)),distribution,s=0)
 76 | 	xnew = np.arange(0,2*12,0.1)
 77 | 	ynew = interpolate.splev(xnew,tck,der=0)
 78 | 	plt.plot(xnew,ynew,'k-',label="Interval",linewidth=2)
 79 | 	plt.xlim(1,12)
 80 | 	plt.ylim(0,1.)
 81 | 	plt.xlabel('Time slot /30min')
 82 | 	plt.ylabel('CDF')
 83 | 	# handles, labels = ax1.get_legend_handles_labels()
 84 | 	# ax1.legend(handles, labels)
 85 | 	xmajorLocator = MultipleLocator(1)
 86 | 	xmajorFormatter = FormatStrFormatter('%d')
 87 | 	ax1.xaxis.set_major_locator(xmajorLocator)
 88 | 	ax1.xaxis.set_major_formatter(xmajorFormatter)
 89 | 	# show()
 90 | 	for postfix in ('eps','png'):
 91 | 		savefig('../figure/{0}/04.{0}'.format(postfix))
 92 | 
 93 | # 阈值距离为1000米，时间为1小时，仅筛选出工作日
 94 | def stationary_accurate_detection():
 95 | 	min_distance, min_duration, max_duration, min_session = 1000, 1*60/10, 1*60/10, 10
 96 | 	with open("../data/stationary_accurate.txt", "w") as f:
 97 | 		line_num = 0
 98 | 		for line in gzip.open("../data/trace.txt.gz"):
 99 | 			line_num += 1
100 | 			print line_num
101 | 			uid = line.strip().split("\t")[0]
102 | 			session_list, session_current, slices = [], [], [(int(sl.split(":")[0]), \
103 | 															  int(sl.split(":")[1]), \
104 | 															  sum([int(p.split(",")[0]) for p in sl.split(":")[2].split("-")])/len(sl.split(":")[2].split("-")), \
105 | 															  sum([int(p.split(",")[1]) for p in sl.split(":")[2].split("-")])/len(sl.split(":")[2].split("-"))) \
106 | 																for sl in line.strip().split("\t")[1].split("|")]
107 | 			for sl in slices:
108 | 				if len(session_current) == 0:
109 | 					session_current = [sl]
110 | 				else:
111 | 					if euclidean(sl[2:],session_current[-1][2:]) >= min_distance:
112 | 						if session_current[-1][0]-session_current[0][0] >= min_duration and sl[0]-session_current[-1][0] <= max_duration:
113 | 							session_list.append(session_current)
114 | 						session_current = [sl]
115 | 					else:
116 | 						session_current.append(sl)
117 | 			if len(session_list) >= min_session:
118 | 				for i in range(1,len(session_list)-1):
119 | 					if len(session_list[i]) >= 2 and 1*24*60/10 < session_list[i][-1][0] and session_list[i][0][0] < 6*24*60/10:
120 | 						f.write(uid+" "+str(round(float(session_list[i][0][0]%(24*60/10))/(60/10),2))+" "+\
121 | 										str(round(float(session_list[i][-1][0]%(24*60/10))/(60/10),2))+" "+\
122 | 										str(session_list[i][0][0]%(24*60/10))+" "+\
123 | 										str(session_list[i][-1][0]%(24*60/10))+" "+\
124 | 										str(sum([session[2] for session in session_list[i]])/len(session_list[i]))+" "+\
125 | 										str(sum([session[3] for session in session_list[i]])/len(session_list[i]))+"\n") 
126 | 
127 | def segmentation_detection(function="", method="probability"):
128 | 	if function == "segment" and not method in ["probability", "median","cut"]:
129 | 		exit()
130 | 
131 | 	def uniform_prob(N):
132 | 		prob = [0]*(N)
133 | 		for i in xrange(N):
134 | 			for j in xrange(i,N):
135 | 				if (i+j)%2 == 0:
136 | 					prob[int(1.0*(i+j)/2)] += 1
137 | 				else:
138 | 					prob[int(1.0*(i+j)/2-0.5)] += 0.5
139 | 					prob[int(1.0*(i+j)/2+0.5)] += 0.5
140 | 		return [prob[i]/sum(prob) for i in xrange(N)]
141 | 
142 | 	import random
143 | 	global_prob, local_prob, user_prob, valid_set, delta = {}, {}, {}, [], 5*3
144 | 	for line in fileinput.input("../data/stationary_accurate.txt"):
145 | 		if function == "plot":
146 | 			uid, st, ft, _, _, gx, gy = line.strip().split(" ") # 时间粒度为每小时
147 | 		elif function == "segment":
148 | 			uid, _, _, st, ft, gx, gy = line.strip().split(" ") # 时间粒度为十分钟
149 | 		else:
150 | 			exit()
151 | 		st, ft, gx, gy = int(float(st)), int(float(ft)), int(gx), int(gy)
152 | 		global_prob[st] = global_prob.get(st,0)+1
153 | 		global_prob[ft] = global_prob.get(ft,0)+1
154 | 		local_prob[(gx, gy)] = local_prob.get((gx, gy),{})
155 | 		local_prob[(gx, gy)][st] = local_prob[(gx, gy)].get(st,0)+1
156 | 		local_prob[(gx, gy)][ft] = local_prob[(gx, gy)].get(ft,0)+1
157 | 		user_prob[uid] = user_prob.get(uid,{})
158 | 		user_prob[uid][st] = user_prob[uid].get(st,0)+1
159 | 		user_prob[uid][ft] = user_prob[uid].get(ft,0)+1
160 | 		valid_set.append([uid,st,ft,gx,gy,\
161 | 							max(st-random.randint(0,delta),0),\
162 | 							min(st+random.randint(0,delta),24*6-1),\
163 | 							max(ft-random.randint(0,delta),0),\
164 | 							min(ft+random.randint(0,delta),24*6-1)])
165 | 	fileinput.close()
166 | 
167 | 	# 概率时间分布（时间粒度为每小时）
168 | 	if function == "plot":
169 | 		import matplotlib.pyplot as plt
170 | 		line, = plt.plot(range(24), [global_prob[h] for h in range(24)], '-', linewidth=2)
171 | 		show()
172 | 		for gx in xrange(255):
173 | 			for gy in xrange(150):
174 | 				if (gx, gy) in local_prob and len(local_prob[(gx, gy)]) == 24:
175 | 					line, = plt.plot(range(24), [local_prob[(gx, gy)].get(h,0) for h in xrange(24)], '-', linewidth=2)
176 | 					show()
177 | 
178 | 	# 切分点预测（时间粒度为十分钟）
179 | 	if function == "segment":
180 | 		alpha_global, alpha_local, alpha_user, error = 0.4, 0.4, 0.4, 0.0
181 | 		for uid,st,ft,gx,gy,stb,ste,ftb,fte in valid_set:
182 | 			if method == "probability":
183 | 				probs_global = [global_prob.get(h,0)+(global_prob.get(h-1,0)+global_prob.get(h+1,0))*0.5 
184 | 									for h in range(stb,ste+1)]
185 | 				probs_local = [local_prob.get((gx,gy),{}).get(h,0)+(local_prob.get((gx,gy),{}).get(h-1,0)+local_prob.get((gx,gy),{}).get(h+1,0))*0.5
186 | 									for h in range(stb,ste+1)]
187 | 				probs_user = [user_prob.get(uid,{}).get(h,0)+(user_prob.get(uid,{}).get(h-1,0)+user_prob.get(uid,{}).get(h+1,0))*0.5
188 | 									for h in range(stb,ste+1)]
189 | 				probs_global = [(1-alpha_global)*prob/sum(probs_global)+alpha_global/len(probs_global) for prob in probs_global]
190 | 				probs_local = [(1-alpha_local)*prob/sum(probs_local)+alpha_local/len(probs_local) for prob in probs_local]
191 | 				probs_user = [(1-alpha_user)*prob/sum(probs_user)+alpha_user/len(probs_user) for prob in probs_user]
192 | 				probs_uniform = uniform_prob(ste-stb+1)
193 | 				probs = [probs_uniform[h]+probs_global[h]+probs_local[h]+probs_user[h] for h in range(ste-stb+1)]
194 | 				error += abs(st-(stb+probs.index(max(probs))))
195 | 			elif method == "median":
196 | 				error += abs(st-round(1.0*(stb+ste)/2,0))
197 | 			elif method == "cut":
198 | 				error += abs(st-round(1.0*ste,0))
199 | 
200 | 		print "method={0}, MAE={1}".format(method, 10*(error/len(valid_set)))
201 | 
202 | 
203 | if __name__ == "__main__":
204 | 	plot_segmentation_distribution()
205 | 	# stationary_accurate_detection()
206 | 	# segmentation_detection("plot")
207 | 	# segmentation_detection("segment","probability")
208 | 	# segmentation_detection("segment","median")
209 | 	# segmentation_detection("segment","cut")
210 | 
211 | # 2*30min
212 | # method=probability, MAE=5.80136525334
213 | # method=median, MAE=11.3674646669
214 | # method=cut, MAE=29.5907124315
215 | # 3*30min
216 | # method=probability, MAE=8.06143640035
217 | # method=median, MAE=16.5051437362
218 | # method=cut, MAE=44.0982597827
219 | # 4*30min
220 | # method=probability, MAE=11.9503893856
221 | # method=median, MAE=21.4089029901
222 | # method=cut, MAE=58.3183347755
223 | # 5*30min
224 | # method=probability, MAE=15.1259494279
225 | # method=median, MAE=28.3938082877
226 | # method=cut, MAE=72.5562926642
227 | 
228 | 


--------------------------------------------------------------------------------
/code/examples_sklearn/sk_gmm.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | import matplotlib as mpl
  5 | from scipy import linalg
  6 | from sklearn import cluster
  7 | 
  8 | EPS = np.finfo(float).eps
  9 | 
 10 | def log_multivariate_normal_density(X, means, covars, covariance_type='diag'):
 11 |     log_multivariate_normal_density_dict = {
 12 |         'spherical': _log_multivariate_normal_density_spherical,
 13 |         'tied': _log_multivariate_normal_density_tied,
 14 |         'diag': _log_multivariate_normal_density_diag,
 15 |         'full': _log_multivariate_normal_density_full}
 16 |     return log_multivariate_normal_density_dict[covariance_type](X, means, covars)
 17 | 
 18 | def distribute_covar_matrix_to_match_covariance_type(tied_cv, covariance_type, n_components):
 19 |     if covariance_type == 'spherical':
 20 |         cv = np.tile(tied_cv.mean() * np.ones(tied_cv.shape[1]), (n_components, 1))
 21 |     elif covariance_type == 'tied':
 22 |         cv = tied_cv
 23 |     elif covariance_type == 'diag':
 24 |         cv = np.tile(np.diag(tied_cv), (n_components, 1))
 25 |     elif covariance_type == 'full':
 26 |         cv = np.tile(tied_cv, (n_components, 1, 1))
 27 |     else:
 28 |         raise ValueError("covariance_type must be one of 'spherical', 'tied', 'diag', 'full'")
 29 |     return cv
 30 | 
 31 | def _covar_mstep_spherical(*args):
 32 |     cv = _covar_mstep_diag(*args)
 33 |     return np.tile(cv.mean(axis=1)[:, np.newaxis], (1, cv.shape[1]))
 34 | 
 35 | def _covar_mstep_diag(gmm, X, responsibilities, weighted_X_sum, norm, min_covar):
 36 |     avg_X2 = np.dot(responsibilities.T, X * X) * norm
 37 |     avg_means2 = gmm.means_ ** 2
 38 |     avg_X_means = gmm.means_ * weighted_X_sum * norm
 39 |     return avg_X2 - 2 * avg_X_means + avg_means2 + min_covar
 40 | 
 41 | def _covar_mstep_tied(gmm, X, responsibilities, weighted_X_sum, norm, min_covar):
 42 |     n_features = X.shape[1]
 43 |     avg_X2 = np.dot(X.T, X)
 44 |     avg_means2 = np.dot(gmm.means_.T, weighted_X_sum)
 45 |     return (avg_X2 - avg_means2 + min_covar * np.eye(n_features)) / X.shape[0]
 46 | 
 47 | def _covar_mstep_full(gmm, X, responsibilities, weighted_X_sum, norm, min_covar):
 48 |     n_features = X.shape[1]
 49 |     cv = np.empty((gmm.n_components, n_features, n_features))
 50 |     for c in range(gmm.n_components):
 51 |         post = responsibilities[:, c]
 52 |         np.seterr(under='ignore')
 53 |         avg_cv = np.dot(post * X.T, X) / (post.sum() + 10 * EPS)
 54 |         mu = gmm.means_[c][np.newaxis]
 55 |         cv[c] = (avg_cv - np.dot(mu.T, mu) + min_covar * np.eye(n_features))
 56 |     return cv
 57 | 
 58 | _covar_mstep_funcs = {'spherical': _covar_mstep_spherical,
 59 |                       'diag': _covar_mstep_diag,
 60 |                       'tied': _covar_mstep_tied,
 61 |                       'full': _covar_mstep_full}
 62 | 
 63 | def _log_multivariate_normal_density_spherical(X, means, covars):
 64 |     cv = covars.copy()
 65 |     if covars.ndim == 1:
 66 |         cv = cv[:, np.newaxis]
 67 |     if covars.shape[1] == 1:
 68 |         cv = np.tile(cv, (1, X.shape[-1]))
 69 |     return _log_multivariate_normal_density_diag(X, means, cv)
 70 | 
 71 | def _log_multivariate_normal_density_tied(X, means, covars):
 72 |     n_samples, n_dim = X.shape
 73 |     icv = pinvh(covars)
 74 |     lpr = -0.5 * (n_dim * np.log(2 * np.pi) + np.log(linalg.det(covars) + 0.1) + np.sum(X * np.dot(X, icv), 1)[:, np.newaxis] - 2 * np.dot(np.dot(X, icv), means.T) + np.sum(means * np.dot(means, icv), 1))
 75 |     return lpr
 76 | 
 77 | def _log_multivariate_normal_density_diag(X, means, covars):
 78 |     n_samples, n_dim = X.shape
 79 |     lpr = -0.5 * (n_dim * np.log(2 * np.pi) + np.sum(np.log(covars), 1) + np.sum((means ** 2) / covars, 1) - 2 * np.dot(X, (means / covars).T) + np.dot(X ** 2, (1.0 / covars).T))
 80 |     return lpr
 81 | 
 82 | def _log_multivariate_normal_density_full(X, means, covars, min_covar=1.e-7):
 83 |     n_samples, n_dim = X.shape
 84 |     nmix = len(means)
 85 |     log_prob = np.empty((n_samples, nmix))
 86 |     for c, (mu, cv) in enumerate(zip(means, covars)):
 87 |         try:
 88 |             cv_chol = linalg.cholesky(cv, lower=True)
 89 |         except linalg.LinAlgError:
 90 |             cv_chol = linalg.cholesky(cv + min_covar * np.eye(n_dim), lower=True)
 91 |         cv_log_det = 2 * np.sum(np.log(np.diagonal(cv_chol)))
 92 |         cv_sol = linalg.solve_triangular(cv_chol, (X - mu).T, lower=True).T
 93 |         log_prob[:, c] = - .5 * (np.sum(cv_sol ** 2, axis=1) + n_dim * np.log(2 * np.pi) + cv_log_det)
 94 |     return log_prob
 95 | 
 96 | def logsumexp(arr, axis=0):
 97 |     arr = np.rollaxis(arr, axis)
 98 |     vmax = arr.max(axis=0)
 99 |     out = np.log(np.sum(np.exp(arr - vmax), axis=0))
100 |     out += vmax
101 |     return out
102 | 
103 | def pinvh(a, cond=None, rcond=None, lower=True):
104 |     a = np.asarray_chkfinite(a)
105 |     s, u = linalg.eigh(a, lower=lower)
106 |     if rcond is not None:
107 |         cond = rcond
108 |     if cond in [None, -1]:
109 |         t = u.dtype.char.lower()
110 |         factor = {'f': 1E3, 'd': 1E6}
111 |         cond = factor[t] * np.finfo(t).eps
112 |     above_cutoff = (abs(s) > cond * np.max(abs(s)))
113 |     psigma_diag = np.zeros_like(s)
114 |     psigma_diag[above_cutoff] = 1.0 / s[above_cutoff]
115 |     return np.dot(u * psigma_diag, np.conjugate(u).T)
116 | 
117 | class GMM():
118 |     def __init__(self, n_components=1, covariance_type='diag', thresh=1e-2, min_covar=1e-3, n_iter=100, n_init=1, params='wmc', init_params='wmc'):
119 |         self.n_components = n_components
120 |         self.covariance_type = covariance_type
121 |         self.thresh = thresh
122 |         self.min_covar = min_covar
123 |         self.n_iter = n_iter
124 |         self.n_init = n_init
125 |         self.params = params
126 |         self.init_params = init_params
127 |         if not covariance_type in ['spherical', 'tied', 'diag', 'full']: 
128 |             raise ValueError('Invalid value for covariance_type: %s' % covariance_type)
129 |         if n_init < 1:
130 |             raise ValueError('GMM estimation requires at least one run')
131 |         self.weights_ = np.ones(self.n_components) / self.n_components
132 |         self.converged_ = False
133 | 
134 |     def _get_covars(self):
135 |         if self.covariance_type == 'full':
136 |             return self.covars_
137 |         elif self.covariance_type == 'diag':
138 |             return [np.diag(cov) for cov in self.covars_]
139 |         elif self.covariance_type == 'tied':
140 |             return [self.covars_] * self.n_components
141 |         elif self.covariance_type == 'spherical':
142 |             return [np.diag(cov) for cov in self.covars_]
143 | 
144 |     def score_samples(self, X):
145 |         X = np.asarray(X)
146 |         if X.ndim == 1:
147 |             X = X[:, np.newaxis]
148 |         if X.size == 0:
149 |             return np.array([]), np.empty((0, self.n_components))
150 |         if X.shape[1] != self.means_.shape[1]:
151 |             raise ValueError('The shape of X  is not compatible with self')
152 |         lpr = (log_multivariate_normal_density(X, self.means_, self.covars_, self.covariance_type) + np.log(self.weights_))
153 |         logprob = logsumexp(lpr, axis=1)
154 |         responsibilities = np.exp(lpr - logprob[:, np.newaxis])
155 |         return logprob, responsibilities
156 | 
157 |     def score(self, X):
158 |         logprob, _ = self.score_samples(X)
159 |         return logprob
160 | 
161 |     def predict(self, X):
162 |         logprob, responsibilities = self.score_samples(X)
163 |         return responsibilities.argmax(axis=1)
164 | 
165 |     def predict_proba(self, X):
166 |         logprob, responsibilities = self.score_samples(X)
167 |         return responsibilities
168 | 
169 |     def fit(self, X):
170 |         X = np.asarray(X, dtype=np.float)
171 |         if X.ndim == 1:
172 |             X = X[:, np.newaxis]
173 |         if X.shape[0] < self.n_components:
174 |             raise ValueError('GMM estimation with %s components, but got only %s samples' % (self.n_components, X.shape[0]))
175 |         max_log_prob = -np.infty
176 |         print self.init_params
177 |         for _ in range(self.n_init):
178 |             if 'm' in self.init_params or not hasattr(self, 'means_'):
179 |                 self.means_ = cluster.KMeans(n_clusters=self.n_components).fit(X).cluster_centers_
180 |             if 'w' in self.init_params or not hasattr(self, 'weights_'):
181 |                 self.weights_ = np.tile(1.0 / self.n_components, self.n_components)
182 |             if 'c' in self.init_params or not hasattr(self, 'covars_'):
183 |                 cv = np.cov(X.T) + self.min_covar * np.eye(X.shape[1])
184 |                 if not cv.shape:
185 |                     cv.shape = (1, 1)
186 |                 self.covars_ = distribute_covar_matrix_to_match_covariance_type(cv, self.covariance_type, self.n_components)
187 |             # EM algorithms
188 |             log_likelihood = []
189 |             self.converged_ = False
190 |             for i in range(self.n_iter):
191 |                 # Expectation step
192 |                 curr_log_likelihood, responsibilities = self.score_samples(X)
193 |                 log_likelihood.append(curr_log_likelihood.sum())
194 |                 # Check for convergence
195 |                 if i > 0 and abs(log_likelihood[-1] - log_likelihood[-2]) < self.thresh:
196 |                     self.converged_ = True
197 |                     break
198 |                 # Maximization step
199 |                 self._do_mstep(X, responsibilities, self.params, self.min_covar)
200 |             # if the results are better, keep it
201 |             if self.n_iter:
202 |                 if log_likelihood[-1] > max_log_prob:
203 |                     max_log_prob = log_likelihood[-1]
204 |                     best_params = {'weights': self.weights_, 'means': self.means_, 'covars': self.covars_}
205 |         # check the existence of an init param that was not subject to likelihood computation issue
206 |         if np.isneginf(max_log_prob) and self.n_iter:
207 |             raise RuntimeError("EM algorithm was never able to compute a valid likelihood given initial parameters. Try different init parameters (or increasing n_init) or check for degenerate data.")
208 |         if self.n_iter:
209 |             self.covars_ = best_params['covars']
210 |             self.means_ = best_params['means']
211 |             self.weights_ = best_params['weights']
212 |         return self
213 | 
214 |     def _do_mstep(self, X, responsibilities, params, min_covar=0):
215 |         weights = responsibilities.sum(axis=0)
216 |         weighted_X_sum = np.dot(responsibilities.T, X)
217 |         inverse_weights = 1.0 / (weights[:, np.newaxis] + 10 * EPS)
218 |         if 'w' in params:
219 |             self.weights_ = (weights / (weights.sum() + 10 * EPS) + EPS)
220 |         if 'm' in params:
221 |             self.means_ = weighted_X_sum * inverse_weights
222 |         if 'c' in params:
223 |             covar_mstep_func = _covar_mstep_funcs[self.covariance_type]
224 |             self.covars_ = covar_mstep_func(
225 |                 self, X, responsibilities, weighted_X_sum, inverse_weights,
226 |                 min_covar)
227 |         return weights
228 | 
229 |     def _n_parameters(self):
230 |         ndim = self.means_.shape[1]
231 |         if self.covariance_type == 'full':
232 |             cov_params = self.n_components * ndim * (ndim + 1) / 2.
233 |         elif self.covariance_type == 'diag':
234 |             cov_params = self.n_components * ndim
235 |         elif self.covariance_type == 'tied':
236 |             cov_params = ndim * (ndim + 1) / 2.
237 |         elif self.covariance_type == 'spherical':
238 |             cov_params = self.n_components
239 |         mean_params = ndim * self.n_components
240 |         return int(cov_params + mean_params + self.n_components - 1)
241 | 
242 |     def bic(self, X):
243 |         return (-2 * self.score(X).sum() + self._n_parameters() * np.log(X.shape[0]))
244 | 
245 |     def aic(self, X):
246 |         return - 2 * self.score(X).sum() + 2 * self._n_parameters()
247 | 
248 | n_samples = 500
249 | np.random.seed(0)
250 | C = np.array([[0., -0.1], [1.7, .4]])
251 | X = np.r_[np.dot(np.random.randn(n_samples, 2), C), .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])]
252 | gmm = GMM(n_components=2, covariance_type='spherical')
253 | gmm.fit(X)
254 | clf, title = gmm, 'GMM'
255 | splot = plt.subplot(1, 1, 1)
256 | Y_ = clf.predict(X)
257 | for i, (mean, covar, color) in enumerate(zip(clf.means_, clf._get_covars(), itertools.cycle(['r', 'g']))):
258 |     v, w = linalg.eigh(covar)
259 |     u = w[0] / linalg.norm(w[0])
260 |     plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)
261 |     angle = np.arctan(u[1] / u[0])
262 |     angle = 180 * angle / np.pi
263 |     ell = mpl.patches.Ellipse(mean, v[0]*3, v[1]*3, 180 + angle, color=color)
264 |     ell.set_clip_box(splot.bbox)
265 |     ell.set_alpha(0.5)
266 |     splot.add_artist(ell)
267 | plt.xlim(-10, 10)
268 | plt.ylim(-3, 6)
269 | plt.xticks(())
270 | plt.yticks(())
271 | plt.title(title)
272 | plt.show()
273 | 


--------------------------------------------------------------------------------
/code/model_combine.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*- 
  2 | 
  3 | import glob
  4 | import math
  5 | import json
  6 | import random
  7 | import fileinput
  8 | 
  9 | # 时间粒度为10分钟
 10 | # 空间粒度为200米
 11 | # 20-24为工作日
 12 | 
 13 | def euclidean(p1, p2):
 14 |     return ((p1[0]-p2[0])**2+(p1[1]-p2[1])**2)**0.5
 15 | 
 16 | def gauss(y, mu, sigma):
 17 |     # print "gauss computation:", y, mu, sigma
 18 |     return 1./math.sqrt((sigma[0][0]*sigma[1][1]-sigma[0][1]*sigma[1][0]))*math.exp(-0.5/(sigma[0][0]*sigma[1][1]-sigma[0][1]*sigma[1][0])*(sigma[0][0]*(y[0]-mu[0])**2-(y[0]-mu[0])*(y[1]-mu[1])*(sigma[0][1]+sigma[1][0])+sigma[1][1]*(y[1]-mu[1])**2))
 19 | # print gauss([1,1],[0,0],[[1,0],[0,1]])
 20 | 
 21 | K, R, alpha, beta, iter_num, data = 5, 20, 0.2, 0.2, 20, []
 22 | 
 23 | def run_model(save=True):
 24 |     # 数据准备  
 25 |     mu_t = [[48,108], [108,48], [60,128], [128,60], [72,84]]
 26 |     sigma_t = [[[36,0],[36,0]], [[36,0],[36,0]], [[36,0],[36,0]], [[36,0],[36,0]], [[36,0],[36,0]]]
 27 |     assert K == len(mu_t) and K == len(sigma_t)
 28 |     print "Total Cluster: {0}".format(K)
 29 | 
 30 |     for line in fileinput.input("../data/stationary.txt"):
 31 |         st, ft, gx, gy = int(line.strip().split(" ")[3]), int(line.strip().split(" ")[4]), int(line.strip().split(" ")[5]), int(line.strip().split(" ")[6])
 32 |         ds = [((st-mu_t[k][0])**2+(ft-mu_t[k][1])**2)**0.5 for k in xrange(K)]
 33 |         data.append([st,ft,gx,gy,ds.index(min(ds)),-1])
 34 |     fileinput.close()
 35 | 
 36 |     # from sklearn import mixture
 37 |     # est = mixture.GMM(n_components=R, covariance_type="full")
 38 |     # est.fit([session[2:4] for session in data])
 39 |     # print [[int(i) for i in list(means)] for means in est.means_]
 40 |     rs = [[165, 87], [78, 68], [77, 59], [82, 98], [46, 94], [86, 68], [37, 65], [69, 57], [77, 78], [92, 25], [44, 14], [71, 84], [66, 79], [61, 70], [71, 28], [14, 128], [100, 75], [75, 63], [85, 34], [54, 76]]
 41 |     assert R == len(rs)
 42 |     print "Total Region: {0}".format(R)
 43 | 
 44 |     # 初值选取
 45 |     for sl in data:
 46 |         rd = [euclidean(sl[2:4], rs[r]) for r in xrange(R)]
 47 |         sl[-1] = rd.index(min(rd))
 48 | 
 49 |     # 初始化
 50 |     L = len(data)
 51 |     len_k = [float(len(filter(lambda x:x[4]==k, data))) for k in xrange(K)]
 52 |     len_k_r = [[float(len(filter(lambda x:x[4]==k and x[5]==r, data))) \
 53 |                     for r in xrange(R)] for k in xrange(K)]
 54 | 
 55 |     mu1_t = [[float(sum(map(lambda x:x[0],filter(lambda x:x[4]==k, data)))), \
 56 |               float(sum(map(lambda x:x[1],filter(lambda x:x[4]==k, data))))] \
 57 |                 for k in xrange(K)]
 58 |     mu2_t = [[float(sum(map(lambda x:x[0]**2,filter(lambda x:x[4]==k, data)))), \
 59 |               float(sum(map(lambda x:x[1]**2,filter(lambda x:x[4]==k, data)))), \
 60 |               float(sum(map(lambda x:x[0]*x[1],filter(lambda x:x[4]==k, data))))] \
 61 |                 for k in xrange(K)]
 62 |     mu_t = [[mu1_t[k][0]/len_k[k], mu1_t[k][1]/len_k[k]] for k in xrange(K)]
 63 |     sigma_t = [[[mu2_t[k][0]/len_k[k]-mu_t[k][0]**2,\
 64 |                 (mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1]],\
 65 |                [(mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1],\
 66 |                 mu2_t[k][1]/len_k[k]-mu_t[k][1]**2]] \
 67 |                     for k in xrange(K)]
 68 | 
 69 |     mu1_k_r = [[[float(sum(map(lambda x:x[2],filter(lambda x:x[4]==k and x[5]==r, data)))), \
 70 |                  float(sum(map(lambda x:x[3],filter(lambda x:x[4]==k and x[5]==r, data))))] \
 71 |                     for r in xrange(R)] for k in xrange(K)]
 72 |     mu2_k_r = [[[float(sum(map(lambda x:x[2]**2,filter(lambda x:x[4]==k and x[5]==r, data)))), \
 73 |                  float(sum(map(lambda x:x[3]**2,filter(lambda x:x[4]==k and x[5]==r, data)))), \
 74 |                  float(sum(map(lambda x:x[2]*x[3],filter(lambda x:x[4]==k and x[5]==r, data))))] \
 75 |                     for r in xrange(R)] for k in xrange(K)]
 76 |     mu_k_r = [[[mu1_k_r[k][r][0]/len_k_r[k][r], mu1_k_r[k][r][1]/len_k_r[k][r]] for r in xrange(R)] for k in xrange(K)]
 77 |     sigma_k_r = [[[[mu2_k_r[k][r][0]/len_k_r[k][r]-mu_k_r[k][r][0]**2,\
 78 |                     (mu2_k_r[k][r][2]-mu_k_r[k][r][0]*mu1_k_r[k][r][1]-mu_k_r[k][r][1]*mu1_k_r[k][r][0])/len_k_r[k][r]+mu_k_r[k][r][0]*mu_k_r[k][r][1]],\
 79 |                    [(mu2_k_r[k][r][2]-mu_k_r[k][r][0]*mu1_k_r[k][r][1]-mu_k_r[k][r][1]*mu1_k_r[k][r][0])/len_k_r[k][r]+mu_k_r[k][r][0]*mu_k_r[k][r][1],\
 80 |                     mu2_k_r[k][r][1]/len_k_r[k][r]-mu_k_r[k][r][1]**2]] \
 81 |                         for r in xrange(R)] for k in xrange(K)]
 82 | 
 83 |     # 迭代计算
 84 |     def find_index(X, a):
 85 |         for i in xrange(len(X)):
 86 |             for j in xrange(len(X[0])):
 87 |                 if X[i][j] == a:
 88 |                     return (i, j)
 89 | 
 90 |     for iter_curr in xrange(iter_num):
 91 |         likelihood = 0
 92 |         for i in xrange(L):
 93 |             item, old_k, old_r = data[i], data[i][4], data[i][5]
 94 |             probs = [[1.*len_k[k]/sum(len_k)*len_k_r[k][r]/sum(len_k_r[k])*\
 95 |                         gauss(item[0:2],mu_t[k],sigma_t[k])*\
 96 |                         gauss(item[2:4],mu_k_r[k][r],sigma_k_r[k][r]) \
 97 |                             for r in xrange(R)] for k in xrange(K)]
 98 |             prob_max = max([max(prob) for prob in probs])
 99 |             likelihood += -math.log10(sum([sum(prob) for prob in probs]))
100 |             new_k, new_r = find_index(probs, prob_max)
101 |             data[i][4], data[i][5] = new_k, new_r
102 |             len_k[old_k] -= 1; len_k[new_k] += 1
103 |             len_k_r[old_k][old_r] -= 1; len_k_r[new_k][new_r] += 1
104 |             mu1_t[old_k][0] -= item[0]; mu1_t[old_k][1] -= item[1]
105 |             mu1_t[new_k][0] += item[0]; mu1_t[new_k][1] += item[1]
106 |             mu2_t[old_k][0] -= item[0]**2; mu2_t[old_k][1] -= item[1]**2; mu2_t[old_k][2] -= item[0]*item[1]
107 |             mu2_t[new_k][0] += item[0]**2; mu2_t[new_k][1] += item[1]**2; mu2_t[new_k][2] += item[0]*item[1]
108 |             mu_t = [[mu1_t[k][0]/len_k[k], mu1_t[k][1]/len_k[k]] for k in xrange(K)]
109 |             sigma_t = [[[mu2_t[k][0]/len_k[k]-mu_t[k][0]**2,\
110 |                         (mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1]],\
111 |                        [(mu2_t[k][2]-mu_t[k][0]*mu1_t[k][1]-mu_t[k][1]*mu1_t[k][0])/len_k[k]+mu_t[k][0]*mu_t[k][1],\
112 |                         mu2_t[k][1]/len_k[k]-mu_t[k][1]**2]] \
113 |                             for k in xrange(K)]
114 |             mu1_k_r[old_k][old_r][0] -= item[2]; mu1_k_r[old_k][old_r][1] -= item[3]
115 |             mu1_k_r[new_k][new_r][0] += item[2]; mu1_k_r[new_k][new_r][1] += item[3]
116 |             mu2_k_r[old_k][old_r][0] -= item[2]**2; mu2_k_r[old_k][old_r][1] -= item[3]**2; mu2_k_r[old_k][old_r][2] -= item[2]*item[3];
117 |             mu2_k_r[new_k][new_r][0] += item[2]**2; mu2_k_r[new_k][new_r][1] += item[3]**2; mu2_k_r[new_k][new_r][2] += item[2]*item[3];
118 |             mu_k_r = [[[mu1_k_r[k][r][0]/len_k_r[k][r], mu1_k_r[k][r][1]/len_k_r[k][r]] for r in xrange(R)] for k in xrange(K)]
119 |             sigma_k_r = [[[[mu2_k_r[k][r][0]/len_k_r[k][r]-mu_k_r[k][r][0]**2,\
120 |                             (mu2_k_r[k][r][2]-mu_k_r[k][r][0]*mu1_k_r[k][r][1]-mu_k_r[k][r][1]*mu1_k_r[k][r][0])/len_k_r[k][r]+mu_k_r[k][r][0]*mu_k_r[k][r][1]],\
121 |                            [(mu2_k_r[k][r][2]-mu_k_r[k][r][0]*mu1_k_r[k][r][1]-mu_k_r[k][r][1]*mu1_k_r[k][r][0])/len_k_r[k][r]+mu_k_r[k][r][0]*mu_k_r[k][r][1],\
122 |                             mu2_k_r[k][r][1]/len_k_r[k][r]-mu_k_r[k][r][1]**2]] \
123 |                                 for r in xrange(R)] for k in xrange(K)]
124 |         print iter_curr, likelihood, len_k
125 | 
126 |         if save:
127 |             with open('model_save/iter_{0}.txt'.format(str(iter_curr).zfill(2)),'w') as f:
128 |                 f.write(json.dumps({"likelihood":likelihood,
129 |                                     "len_k":len_k,
130 |                                     "len_k_r":len_k_r,
131 |                                     "mu_t":mu_t,
132 |                                     "sigma_t":sigma_t,
133 |                                     "mu_k_r":mu_k_r,
134 |                                     "sigma_k_r":sigma_k_r}))
135 | 
136 | def plot_distribution(iter_curr):
137 |     from pylab import *
138 | 
139 |     param = json.loads(open('model_save/iter_{0}.txt'.format(iter_curr),'r').read())
140 |     len_k, len_k_r = param['len_k'], param['len_k_r']
141 |     mu_t, sigma_t = param['mu_t'], param['sigma_t']
142 |     mu_k_r, sigma_k_r = param['mu_k_r'], param['sigma_k_r']
143 | 
144 |     # 时间分布
145 |     plt.figure(figsize=(12,5))
146 |     norm1 = cm.colors.Normalize(vmax=0.0020, vmin=0)
147 |     for c, k in enumerate([4,0,1,2,3]):
148 |         matrix = [[0 for j in xrange(24*6)] for i in xrange(24*6)]
149 |         for ts in xrange(24*6):
150 |             for tf in xrange(24*6):
151 |                 matrix[ts][tf] = 1.*(len_k[k]/sum(len_k))*gauss([ts,tf],mu_t[k],sigma_t[k])
152 |         (X, Y), C = meshgrid(np.arange(24*6), np.arange(24*6)), np.array(matrix)
153 |         subplot(2,5,1+c)
154 |         cset = pcolormesh(X, Y, C.T, cmap=cm.get_cmap("OrRd"), norm=norm1)
155 |         plt.axis([0, 24*6, 0, 24*6])
156 |         plt.xticks(np.linspace(0,24*6,7))
157 |         plt.yticks(np.linspace(0,24*6,7))
158 |         if c == 0:
159 |             plt.xlabel('Session start time slot /10min')
160 |             plt.ylabel('Session end time slot /10min')
161 |     cax1 = axes([0.92, 0.54, 0.01, 0.35])
162 |     colorbar(cax=cax1)
163 |     # plt.axis('off')
164 | 
165 |     # 空间分布
166 |     subplots_adjust(hspace=0.4)
167 |     norm2 = cm.colors.Normalize(vmax=0.0040, vmin=0)
168 |     for c, k in enumerate([4,0,1,2,3]):
169 |         matrix = [[0 for j in xrange(150)] for i in xrange(225)]
170 |         for gx in xrange(225):
171 |             for gy in xrange(150):
172 |                 matrix[gx][gy] = 1.*(len_k[k]/sum(len_k))*sum([1.*(len_k_r[k][r]/sum(len_k_r[k]))*gauss([gx,gy],mu_k_r[k][r],sigma_k_r[k][r]) for r in xrange(R)])
173 |         (X, Y), C = meshgrid(np.arange(100), np.arange(100)), np.array(matrix)[20:120,20:120]
174 |         subplot(2,5,6+c)
175 |         cset = pcolormesh(X, Y, C.T, cmap=cm.get_cmap("OrRd"), norm=norm2)
176 |         plt.axis([0, 100-1, 0, 100-1])
177 |         plt.xticks(np.linspace(0,100,6))
178 |         plt.yticks(np.linspace(0,100,6))
179 |         if c == 0:
180 |             plt.xlabel('Longitude grid index /200m')
181 |             plt.ylabel('Latitude grid index /200m')
182 |     subplots_adjust(bottom=0.1, left=0.06, right=0.9, top=0.9)
183 |     cax2 = axes([0.92, 0.09, 0.01, 0.35])
184 |     colorbar(cax=cax2)
185 |     # plt.axis('off')
186 |     # show()
187 |     for postfix in ('eps','png'):
188 |         savefig('../figure/{0}/05.{0}'.format(postfix))
189 | 
190 | def plot_iteration_likelihood():
191 |     from pylab import *
192 | 
193 |     iterations, likelihoods = [], []
194 |     for iteration, filename in enumerate(sorted(glob.glob(r"model_save/iter_*.txt"))):
195 |         likelihood = json.loads(open(filename,'r').read()).get("likelihood",0)
196 |         iterations.append(iteration)
197 |         likelihoods.append(likelihood/10**4)
198 | 
199 |     fig = plt.figure()
200 |     ax1 = fig.add_subplot(111)
201 |     plot(iterations, likelihoods, 'k-', label="Likelihood", linewidth=2)
202 |     plt.xlabel('Number for iteration')
203 |     plt.ylabel('$-10^{-4} \\times$ log likelihood')
204 |     handles, labels = ax1.get_legend_handles_labels()
205 |     ax1.legend(handles, labels)
206 |     # show()
207 |     for postfix in ('eps','png'):
208 |         savefig('../figure/{0}/06.{0}'.format(postfix))
209 | 
210 | def compute_error(iter_curr):
211 |     import numpy as np
212 | 
213 |     param = json.loads(open('model_save/iter_{0}.txt'.format(iter_curr),'r').read())
214 |     len_k, len_k_r = param['len_k'], param['len_k_r']
215 |     mu_t, sigma_t = param['mu_t'], param['sigma_t']
216 |     mu_k_r, sigma_k_r = param['mu_k_r'], param['sigma_k_r']
217 | 
218 |     # 时间分布
219 |     matrix1 = [[0 for j in xrange(24*6)] for i in xrange(24*6)]
220 |     for line in fileinput.input("../data/stationary.txt"):
221 |         st, ft = int(line.strip().split(" ")[3]), int(line.strip().split(" ")[4])
222 |         matrix1[st][ft] += 1
223 |     fileinput.close()
224 |     matrix2 = [[0 for j in xrange(24*6)] for i in xrange(24*6)]
225 |     for k in xrange(K):
226 |         for st in xrange(24*6):
227 |             for ft in xrange(24*6):
228 |                 matrix2[st][ft] += 1.*(len_k[k]/sum(len_k))*gauss([st,ft],mu_t[k],sigma_t[k])
229 |     matrix1 = 1.*np.array(matrix1)/np.array(matrix1).sum()
230 |     matrix2 = 1.*np.array(matrix2)/np.array(matrix2).sum()
231 |     print "Temporal reconstruction accuracy:", 1-(abs(matrix1-matrix2)[40:60,100:140].sum()+abs(matrix1-matrix2)[100:140,40:60].sum())
232 | 
233 |     # 空间分布
234 |     matrix1 = [[0 for j in xrange(150)] for i in xrange(225)]
235 |     for line in fileinput.input("../data/stationary.txt"):
236 |         gx, gy = int(line.strip().split(" ")[5]), int(line.strip().split(" ")[6])
237 |         matrix1[gx][gy] += 1
238 |     fileinput.close()
239 |     matrix2 = [[0 for j in xrange(150)] for i in xrange(225)]
240 |     for k in xrange(K):
241 |         for r in xrange(R):
242 |             for gx in xrange(225):
243 |                 for gy in xrange(150):
244 |                     matrix2[gx][gy] += 1.*(len_k[k]/sum(len_k))*(len_k_r[k][r]/sum(len_k_r[k]))*gauss([gx,gy],mu_k_r[k][r],sigma_k_r[k][r])
245 |     matrix1 = 1.*np.array(matrix1)/np.array(matrix1).sum()
246 |     matrix2 = 1.*np.array(matrix2)/np.array(matrix2).sum()
247 |     print "Spatial reconstruction accuracy:", 1-abs(matrix1-matrix2)[50:90,50:90].sum()
248 | 
249 | 
250 | if __name__ == "__main__":
251 |     # run_model(save=True)
252 |     plot_distribution(19)
253 |     # plot_iteration_likelihood()
254 |     # compute_error(19)
255 | 
256 | # Likelihood
257 | # model:     2636999
258 | # baseline1: 3336792
259 | # baseline2: 2644190
260 | 
261 | # model:
262 | # Temporal reconstruction accuracy: 0.867786928604
263 | # Spatial reconstruction accuracy: 0.746619059294
264 | # baseline1:
265 | # Temporal reconstruction accuracy: 0.861796706542
266 | # Spatial reconstruction accuracy: 0.659659953592
267 | # baseline2:
268 | # Temporal reconstruction accuracy: 0.868461451694
269 | # Spatial reconstruction accuracy: 0.719261119801
270 | 
271 | 


--------------------------------------------------------------------------------