├── config
    ├── default.yml
    ├── model_config.yml
    └── test_params.yml
├── _config.yml
├── img
    ├── posterior.png
    ├── rbf_kernel.png
    ├── linear_kernel.png
    ├── periodic_kernel.png
    ├── prior_vs_posterior.png
    └── gaussian_process_samples.png
├── .gitignore
├── mod
    ├── __init__.py
    └── config
    │   ├── __init__.py
    │   └── config_loader.py
├── core
    ├── __init__.py
    ├── gaussian_process
    │   ├── __init__.py
    │   ├── kernels.py
    │   └── sampling.py
    └── gaussian_distribution
    │   ├── __init__.py
    │   └── prob_dist_func.py
├── src
    ├── settings.py
    └── gpr_modeling.py
└── README.md


/config/default.yml:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/config/model_config.yml:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/config/test_params.yml:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-minimal
2 | markdown: kramdown
3 | 


--------------------------------------------------------------------------------
/img/posterior.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ulti-Dreisteine/gaussian-process-regression/HEAD/img/posterior.png


--------------------------------------------------------------------------------
/img/rbf_kernel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ulti-Dreisteine/gaussian-process-regression/HEAD/img/rbf_kernel.png


--------------------------------------------------------------------------------
/img/linear_kernel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ulti-Dreisteine/gaussian-process-regression/HEAD/img/linear_kernel.png


--------------------------------------------------------------------------------
/img/periodic_kernel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ulti-Dreisteine/gaussian-process-regression/HEAD/img/periodic_kernel.png


--------------------------------------------------------------------------------
/img/prior_vs_posterior.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ulti-Dreisteine/gaussian-process-regression/HEAD/img/prior_vs_posterior.png


--------------------------------------------------------------------------------
/img/gaussian_process_samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ulti-Dreisteine/gaussian-process-regression/HEAD/img/gaussian_process_samples.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea/
 2 | tmp/
 3 | trash/
 4 | .DS_Store/
 5 | *.xml
 6 | *.iml
 7 | *.pyc
 8 | 
 9 | .git/
10 | .vscode/
11 | __pycache__/
12 | cache/
13 | mod/logs/
14 | pkg/
15 | trash/


--------------------------------------------------------------------------------
/mod/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/09/01 09:49:06
 4 | 
 5 | @File -> __init__.py
 6 | 
 7 | @Author: luolei
 8 | 
 9 | @Email: dreisteine262@163.com
10 | 
11 | @Describe: 初始化
12 | """
13 | 
14 | __all__ = []


--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/09/01 10:08:25
 4 | 
 5 | @File -> __init__.py
 6 | 
 7 | @Author: luolei
 8 | 
 9 | @Email: dreisteine262@163.com
10 | 
11 | @Describe: 初始化
12 | """
13 | 
14 | __all__ = []


--------------------------------------------------------------------------------
/mod/config/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/02/27 21:14:34
 4 | 
 5 | @File -> __init__.py
 6 | 
 7 | @Author: luolei
 8 | 
 9 | @Email: dreisteine262@163.com
10 | 
11 | @Describe: 初始化
12 | """
13 | 
14 | __all__ = []


--------------------------------------------------------------------------------
/core/gaussian_process/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/09/01 10:17:50
 4 | 
 5 | @File -> __init__.py
 6 | 
 7 | @Author: luolei
 8 | 
 9 | @Email: dreisteine262@163.com
10 | 
11 | @Describe: 初始化
12 | """
13 | 
14 | from .kernels import calKernelFunc
15 | 
16 | __all__ = ['calKernelFunc']


--------------------------------------------------------------------------------
/core/gaussian_distribution/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/09/01 10:15:27
 4 | 
 5 | @File -> __init__.py
 6 | 
 7 | @Author: luolei
 8 | 
 9 | @Email: dreisteine262@163.com
10 | 
11 | @Describe: 初始化
12 | """
13 | 
14 | from .prob_dist_func import univar_gaussian, multivar_gaussian
15 | 
16 | __all__ = ['univar_gaussian', 'multivar_gaussian']


--------------------------------------------------------------------------------
/src/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2020/1/21 下午3:01
 4 | 
 5 | @Project -> File: pollution-forecast-offline-training-version-2 -> settings.py
 6 | 
 7 | @Author: luolei
 8 | 
 9 | @Email: dreisteine262@163.com
10 | 
11 | @Describe: 默认设置
12 | """
13 | 
14 | import sys
15 | import os
16 | 
# Absolute path two levels above this file; appended to sys.path so that the
# project-local import below resolves when this module is loaded directly.
BASE_DIR = os.path.abspath(os.path.join(os.path.abspath(__file__), '../' * 2))
sys.path.append(BASE_DIR)

from mod.config.config_loader import config_loader

# Re-export the values exposed by the shared config_loader instance.
PROJ_DIR, PROJ_CMAP = config_loader.proj_dir, config_loader.proj_cmap
plt = config_loader.proj_plt

# Load the project configuration dictionaries.
ENC_CONFIG = config_loader.environ_config
MODEL_CONFIG = config_loader.model_config
TEST_PARAMS = config_loader.test_params

# ---- Environment variables ------------------------------------------------------------------------

# ---- Model parameters -----------------------------------------------------------------------------

# ---- Test parameters ------------------------------------------------------------------------------

# ---- Common helper functions ----------------------------------------------------------------------
37 | 


--------------------------------------------------------------------------------
/core/gaussian_process/kernels.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2020/1/30 18:04
 4 | 
 5 | @Project -> File: gaussian-process-regression -> kernels.py
 6 | 
 7 | @Author: luolei
 8 | 
 9 | @Email: dreisteine262@163.com
10 | 
11 | @Describe: 高斯过程核函数
12 | """
13 | 
14 | import numpy as np
15 | 
16 | 
def _rbf_kernel(x_a, x_b, sigma: float = 0.8, l: float = 0.5):
    """
    RBF (a.k.a. exponentiated quadratic) kernel:
    k = sigma^2 * exp(-norm(x_a - x_b)^2 / (2 * l^2))
    :param x_a: sample location a
    :param x_b: sample location b
    :param sigma: float, amplitude parameter (enters the formula as sigma^2)
    :param l: float, length-scale parameter
    :return: kernel value k
    """
    squared_distance = pow(np.linalg.norm(x_a - x_b), 2)
    amplitude = pow(sigma, 2)
    return amplitude * np.exp(-squared_distance / (2 * pow(l, 2)))
29 | 
30 | 
def _periodic_kernel(x_a, x_b, sigma: float = 0.8, l: float = 0.5, p: float = 0.5):
    """
    Periodic kernel:
    k = sigma^2 * exp(-(2 / l^2) * sin(pi / p * |x_a - x_b|)^2)
    :param x_a: sample location a
    :param x_b: sample location b
    :param sigma: float, amplitude parameter (enters the formula as sigma^2)
    :param l: float > 0.0, length-scale parameter
    :param p: float > 0.0, period parameter
    :return: kernel value k
    """
    assert l > 0.0 and p > 0.0
    phase = np.pi / p * np.abs(x_a - x_b)
    exponent = -2 / pow(l, 2) * pow(np.sin(phase), 2)
    return pow(sigma, 2) * np.exp(exponent)
45 | 
46 | 
def _linear_kernel(x_a, x_b, sigma: float = 0.8, sigma_b: float = 0.5, c: float = 0.5):
    """
    Linear kernel:
    k = sigma_b^2 + sigma^2 * (x_a - c)(x_b - c)
    :param x_a: sample location a
    :param x_b: sample location b
    :param sigma: float, slope amplitude parameter (enters as sigma^2)
    :param sigma_b: float, constant-term parameter (enters as sigma_b^2)
    :param c: float, offset subtracted from both locations
    :return: kernel value k
    """
    shifted_a = x_a - c
    shifted_b = x_b - c
    return pow(sigma_b, 2) + pow(sigma, 2) * shifted_a * shifted_b
59 | 
60 | 
def calKernelFunc(x_a, x_b, kernel_name, **kwargs):
    """
    Evaluate the kernel selected by name for a pair of locations.
    :param x_a: sample location a
    :param x_b: sample location b
    :param kernel_name: one of 'RBF', 'linear' or 'periodic'
    :param kwargs: keyword parameters forwarded unchanged to the chosen kernel
    :return: kernel value
    :raises ValueError: if kernel_name is not one of the supported names
    """
    if kernel_name == 'RBF':
        return _rbf_kernel(x_a, x_b, **kwargs)
    if kernel_name == 'linear':
        return _linear_kernel(x_a, x_b, **kwargs)
    if kernel_name == 'periodic':
        return _periodic_kernel(x_a, x_b, **kwargs)
    raise ValueError('Unknown kernel func name "{}".'.format(kernel_name))
75 | 


--------------------------------------------------------------------------------
/core/gaussian_distribution/prob_dist_func.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2020/2/20 21:07
 4 | 
 5 | @Project -> File: gaussian-process-regression -> prob_dist_func.py
 6 | 
 7 | @Author: luolei
 8 | 
 9 | @Email: dreisteine262@163.com
10 | 
11 | @Describe: 概率密度函数
12 | """
13 | 
14 | import numpy as np
15 | 
16 | 
def univar_gaussian(x, mu, sigma):
    """
    Probability density function of the one-dimensional Gaussian distribution.
    :param x: float or array like, evaluation point(s)
    :param mu: float, mean
    :param sigma: float, standard deviation
    :return: density value(s) at x
    """
    norm_const = 1 / (sigma * np.sqrt(2 * np.pi))
    squared_dev = np.power(x - mu, 2)
    return norm_const * np.exp(-squared_dev / (2 * np.power(sigma, 2)))
27 | 
28 | 
def multivar_gaussian(x, mu, Sigma):
    """
    Probability density function of the multivariate Gaussian distribution.
    :param x: array like, evaluation point (flattened to a column vector)
    :param mu: array like, mean vector (flattened to a column vector)
    :param Sigma: array like, square covariance matrix of shape (dim, dim)
    :return: np.ndarray of shape (1,) holding the density at x
    :raises ValueError: if the shape of mu or Sigma does not match the dimension of x
    """
    x = np.asarray(x, dtype=float).reshape(-1, 1)
    mu = np.asarray(mu, dtype=float).reshape(-1, 1)
    Sigma = np.asarray(Sigma, dtype=float)
    dim_x = x.shape[0]

    # Explicit check instead of `assert` inside a bare `except`: assertions are
    # stripped under `python -O`, which silently removed this validation.
    if mu.shape[0] != dim_x or Sigma.shape != (dim_x, dim_x):
        raise ValueError('The shape of mu or Sigma does not correspond to the dimension of x')

    diff = x - mu
    # Solve Sigma * z = diff rather than forming the explicit inverse.
    quad_form = np.dot(diff.T, np.linalg.solve(Sigma, diff))
    norm_const = np.power(2 * np.pi, dim_x / 2) * np.power(np.linalg.det(Sigma), 0.5)
    f = np.exp(-0.5 * quad_form) / norm_const
    # f has shape (1, 1); the first row is returned, i.e. an array of shape (1,).
    return f[0]
50 | 
51 | 
if __name__ == '__main__':
    import sys
    import os

    # Put the directory three levels above this file on sys.path so that the
    # project packages imported below resolve when this file is run directly.
    BASE_DIR = os.path.abspath(os.path.join(os.path.abspath(__file__), '../' * 3))
    sys.path.append(BASE_DIR)

    from src.settings import plt

    # ---- One-dimensional Gaussian ---------------------------------------------------------------

    mu = 0.0
    sigma = 1.0
    x = np.arange(-10.0, 10.0 + 0.1, 0.1).reshape(-1, 1)
    y = univar_gaussian(x, mu, sigma)

    plt.figure('One Dimensional PDF')
    plt.plot(x, y)

    # ---- Two-dimensional Gaussian ---------------------------------------------------------------

    # The covariance helper lives in core.gaussian_process.sampling as calCovMatrix
    # and takes kernel parameters as individual keyword arguments.
    from core.gaussian_process.sampling import calCovMatrix

    mu = [3.0, 1.0]
    idx = [0.0, 1.0]
    kernel_name = 'RBF'
    kernel_params = {'sigma': 2.0, 'l': 1.0}
    Sigma = calCovMatrix(idx, kernel_name=kernel_name, **kernel_params)

    x = np.arange(-10.0, 10.0 + 0.2, 0.2).reshape(-1, 1)
    y = np.arange(-10.0, 10.0 + 0.2, 0.2).reshape(-1, 1)
    mesh_x, mesh_y = np.meshgrid(x, y)
    coords = np.dstack((mesh_x, mesh_y))
    # Evaluate the density at every (x, y) grid point; each call returns shape (1,),
    # so the result is reshaped back to the grid shape.
    pdf = np.apply_along_axis(lambda point: multivar_gaussian(point, mu, Sigma), 2, coords)
    pdf = pdf.reshape(mesh_x.shape)

    plt.figure('Two Dimensional PDF')
    plt.contourf(mesh_x, mesh_y, pdf, cmap='Blues')
90 | 	


--------------------------------------------------------------------------------
/src/gpr_modeling.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on 2021/09/01 10:33:40
 4 | 
 5 | @File -> gpr_modeling.py
 6 | 
 7 | @Author: luolei
 8 | 
 9 | @Email: dreisteine262@163.com
10 | 
11 | @Describe: 高斯过程回归建模
12 | """
13 | 
14 | import numpy as np
15 | import sys
16 | import os
17 | 
18 | BASE_DIR = os.path.abspath(os.path.join(os.path.abspath(__file__), '../' * 2))
19 | sys.path.append(BASE_DIR)
20 | 
21 | from src.settings import PROJ_DIR, plt
22 | from core.gaussian_process.sampling import calCovMatrix, genGaussianProcessSamples
23 | 
if __name__ == '__main__':
    # Time grid on which the process is sampled and plotted.
    t_series = np.arange(0, 10, 0.1)

    fig, axs = plt.subplots(2, 1, figsize=(8, 6))

    # ---- Prior distribution -----------------------------------------------------------------------

    prior_mean = np.zeros([t_series.shape[0], 1])
    prior_cov = calCovMatrix(t_series, kernel_name='RBF')
    samples_prior = genGaussianProcessSamples(t_series, prior_mean, prior_cov, samples_n=500)

    # Plot every prior draw in grey and the sample mean in blue.
    ax_prior = axs[0]
    for draw in samples_prior.T:
        ax_prior.plot(t_series, draw, c='grey', alpha=0.1)
    ax_prior.plot(t_series, np.mean(samples_prior, axis=1), c='b')
    ax_prior.set_ylabel('$y$')
    ax_prior.set_title('Prior Distribution', fontsize=15)

    # ---- Posterior sampling -----------------------------------------------------------------------

    # Observed locations and values.
    t_obs = np.array([1.1, 1.0, 4.0, 6.0, 7.0, 7.5])
    x_obs = np.array([1.0, 1.0, 0.5, 1.0, 2.0, 2.0])
    N_obs = x_obs.shape[0]

    # Joint prior over observed locations followed by the plotting grid.
    t_total = np.hstack((t_obs, t_series))
    mu_prior = np.zeros_like(t_total)
    C_prior = calCovMatrix(t_total, kernel_name='RBF')

    # Partition the joint prior: index 1 = observed part, index 2 = unobserved part.
    mu_1 = mu_prior[:N_obs].reshape(-1, 1)
    mu_2 = mu_prior[N_obs:].reshape(-1, 1)
    C_11, C_12 = C_prior[:N_obs, :N_obs], C_prior[:N_obs, N_obs:]
    C_21, C_22 = C_prior[N_obs:, :N_obs], C_prior[N_obs:, N_obs:]

    # Conditional mean and covariance of part 2 given part 1.
    gain = np.dot(C_21, np.linalg.inv(C_11))
    mu_post = mu_2 + np.dot(gain, x_obs.reshape(-1, 1) - mu_1)
    sigma_post = C_22 - np.dot(gain, C_12)

    samples_post = genGaussianProcessSamples(t_series, mu_post, sigma_post, samples_n=500)

    # Plot posterior draws, the observations, and the sample mean.
    ax_post = axs[1]
    for i, draw in enumerate(samples_post.T):
        ax_post.plot(t_series, draw, c='grey', alpha=0.1, zorder=-i)
    for t_i, x_i in zip(t_obs, x_obs):
        ax_post.scatter(t_i, x_i, s=60, c='k')
    ax_post.plot(t_series, np.mean(samples_post, axis=1), c='b', zorder=0)
    ax_post.set_xlabel('$t$')
    ax_post.set_ylabel('$y$')
    ax_post.set_title('Posterior Distribution', fontsize=15)

    fig.tight_layout()
    fig.savefig(os.path.join(PROJ_DIR, 'img/prior_vs_posterior.png'), dpi=450)
80 | 


--------------------------------------------------------------------------------
/core/gaussian_process/sampling.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on 2021/09/01 10:37:22
  4 | 
  5 | @File -> sampling.py
  6 | 
  7 | @Author: luolei
  8 | 
  9 | @Email: dreisteine262@163.com
 10 | 
 11 | @Describe: 
 12 | """
 13 | 
 14 | import numpy as np
 15 | import sys
 16 | import os
 17 | 
 18 | BASE_DIR = os.path.abspath(os.path.join(os.path.abspath(__file__), '../' * 3))
 19 | sys.path.append(BASE_DIR)
 20 | 
 21 | from core.gaussian_process.kernels import calKernelFunc
 22 | 
 23 | __doc__ = """
 24 |     参考文献: 
 25 |         https://www.jgoertler.com/visual-exploration-gaussian-processes/#GaussianProcesses
 26 | 		https://blog.csdn.net/shenxiaolu1984/article/details/50386518
 27 | """
 28 | 
 29 | def calCovMatrix(t_series: np.ndarray, kernel_name: str, **kernel_params):
 30 | 	"""计算协方差矩阵C"""
 31 | 	# 生成采样点序列x.
 32 | 	N = len(t_series)
 33 | 	
 34 | 	# 通过高斯核函数计算采样点之间的相关函数矩阵C.
 35 | 	C = np.zeros([N, N])
 36 | 	for i in range(N):
 37 | 		for j in range(i, N):
 38 | 			t_i, t_j = t_series[i], t_series[j]
 39 | 			C[i, j] = calKernelFunc(t_i, t_j, kernel_name, **kernel_params)
 40 | 	C = C + np.tril(C.T, -1)  # **高斯核函数具有对称性质, 所以此处为矩阵与转置矩阵下三角之和
 41 | 	return C
 42 | 
 43 | 
 44 | def _execGPRSampling(t_series, mu, C) -> np.ndarray:
 45 | 	"""
 46 | 	根据给定的时间, 各时间步之间计算所得均值和协方差矩阵进行'单次'高斯过程采样
 47 | 	这部分过程和原理可以参考以下介绍高斯过程采样算法的材料:
 48 | 	:param t_series: 时间值序列
 49 | 	:param mu: 均值向量
 50 | 	:param C: 协方差矩阵
 51 | 	
 52 | 	Example:
 53 | 	------
 54 | 	t_series = np.arange(0, 10, 0.1)
 55 | 	x_series = gpr_sampling(t_series, mu, C)
 56 | 	"""
 57 | 	t_series = t_series.flatten()
 58 | 	
 59 | 	# 对C进行SVD分解.
 60 | 	U, sigmas, _ = np.linalg.svd(C)
 61 | 	S = np.diag(sigmas)  # 向量转为对角矩阵
 62 | 	
 63 | 	# 生成N个独立同分布高斯随机变量.
 64 | 	y_series = np.random.normal(loc = 0.0, scale = 1.0, size = (len(t_series),))
 65 | 	x_series = np.dot(np.dot(U, np.sqrt(S)), y_series.reshape(-1, 1))
 66 | 	
 67 | 	# 加上均值, 得到样本向量.
 68 | 	x_series += mu
 69 | 		
 70 | 	return x_series
 71 | 
 72 | 
 73 | def genGaussianProcessSamples(t_series: np.ndarray, mu: np.ndarray, C: np.ndarray, samples_n):
 74 | 	"""
 75 | 	生成高斯过程样本集
 76 | 	:param t_series: 时刻series, shape = (Nt, 1)
 77 | 	:param mu: np.array, 均值向量, shape = (Nt, 1)
 78 | 	:param C: np.array, 协方差矩阵, shape = (Nt, N)
 79 | 	:param sample_n: int, 采样数
 80 | 	:return samples, shape = (Nt, Ns)
 81 | 
 82 | 	Example:
 83 | 	------
 84 | 	t_series = np.array([0.0, 1.0, 3.0, 4.0, 7.0])
 85 |     mu = np.zeros_like(t_series)
 86 |     C = np.random.random((t_series.shape[0], t_series.shape[0]))
 87 |     samples = genGaussianProcessSamples(t_series, mu, C, samples_n = 10)
 88 | 	"""
 89 | 	t_series = t_series.reshape(-1, 1)
 90 | 	mu = mu.reshape(-1, 1)
 91 | 	
 92 |     # 维数检查.
 93 | 	dims = [t_series.shape[0], mu.shape[0], C.shape[0], C.shape[1]]
 94 | 	assert len(set(dims)) == 1
 95 | 	
 96 | 	samples = None
 97 | 	for i in range(samples_n):
 98 | 		x = _execGPRSampling(t_series, mu, C)
 99 | 		if i == 0:
100 | 			samples = x
101 | 		else:
102 | 			samples = np.hstack((samples, x))
103 | 	return samples
104 | 
105 | 
if __name__ == '__main__':
    # Placeholder: running this module directly does nothing.
    ...
108 | 
109 | 
110 |     


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <script type="text/x-mathjax-config">
  2 |     MathJax.Hub.Config({
  3 |       tex2jax: {
  4 |         skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'],
  5 |         inlineMath: [['$','$']]
  6 |       }
  7 |     });
  8 | </script>
  9 | <script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script>
 10 | 
 11 | 
 12 | ![封面](img/prior_vs_posterior.png)
 13 | 
 14 | # 高斯过程回归
 15 | ##  Gaussian Process Regression
 16 | 
 17 | 
 18 | ***
 19 | ## 一、高斯分布
 20 | 
 21 | 高斯过程（Gaussian Process, GP）是随机过程之一，是一系列符合正态分布的随机变量在一指数集（index set）内的集合。该解释中的“指数”可以理解为“维度“，按照机器学习的角度，各个指数上的随机变量可以对应地理解为各个维度上的特征。
 22 | 
 23 | ***1.1 一元高斯分布***  
 24 | 
 25 | $$
 26 | X \sim N(\mu, \sigma^2)
 27 | $$
 28 | 
 29 | 其概率密度函数为：
 30 | 
 31 | $$
 32 | f(x) = \frac{1}{\sigma \sqrt{2 \pi}} \exp({-(x - \mu)^2} / ({2 \sigma^2}))
 33 | $$
 34 | 
 35 | 标准正态分布：
 36 | 
 37 | $$
 38 | \mu = 0, \sigma = 1
 39 | $$
 40 | 
 41 | 正态分布具有如下性质：
 42 | 
 43 | 1. 如果$X \sim N(\mu, \sigma^2)$，且$a$、$b$均为实数，则$aX + b \sim N(a \mu + b, (a \sigma)^2)$；
 44 | 
 45 | 2. 如果$X \sim N(\mu_x, \sigma_x^2)$与$Y \sim N(\mu_y, \sigma_y^2)$独立，则：
 46 | 
 47 |    1. $U = X + Y \sim N(\mu_x + \mu_y, \sigma_x^2 + \sigma_y^2)$；
 48 |    2. $V = X - Y \sim N(\mu_x - \mu_y, \sigma_x^2 + \sigma_y^2)$；
 49 | 
 50 | 3. 若以上$X$与$Y$相互独立，且均值均为0（即$\mu_x = \mu_y = 0$），则：
 51 | 
 52 |    1. $XY$符合以下概率密度分布：
 53 | 
 54 |       $$
 55 |       p(z)=\frac{1}{\pi \sigma_x \sigma_y}K_0(\frac{|z|}{\sigma_x \sigma_y})
 56 |       $$
 57 | 
 58 |       其中$K_0$为修正贝塞尔函数；
 59 | 
 60 |    2. $X/Y$符合柯西分布：
 61 | 
 62 |       $$
 63 |       X/Y \sim {\rm Cauchy}(0, \sigma_x / \sigma_y)
 64 |       $$
 65 | 
 66 | 4. 若$X_1, ..., X_n$各自独立，且均符合标准正态分布$N(0, 1)$，则$X_1^2 + X_2^2 + ... + X_n^2$符合自由度为$n$的卡方分布；
 67 | 
 68 | 
 69 | ***1.2 二元高斯分布***
 70 | 
 71 | $$
 72 | f(x,y) = A \exp (-(\frac{(x - x_0)^2}{2\sigma_x^2} + \frac{(y - y_0)^2}{2\sigma_y^2}))
 73 | $$
 74 | 
 75 | ***1.3 多元高斯分布***
 76 | 
 77 | $$
 78 | p(x) = \frac{1}{(2\pi)^{n/2} |\Sigma|^{1/2}} \ \exp(-\frac{1}{2} (x - \mu)^T \Sigma^{-1} (x - \mu))
 79 | $$
 80 | 
 81 | 其中，$\mu$为各随机变量的均值组成的$n \times 1$向量，$\Sigma$表示随机变量间的$n \times n$协方差矩阵，正定。
 82 | 
 83 | 
 84 | ***
 85 | ## 二、多元高斯分布的条件概率密度
 86 | 
 87 | 令随机向量$X = [x_1, x_2, ..., x_n]$服从多元高斯分布$X \sim N(\mu, \Sigma)$，令$X_1 = [x_1, ..., x_m]$为已经观测变量，$X_2 = [x_{m+1}, ..., x_n]$为未知变量，则：
 88 | 
 89 | $$
 90 | \begin{aligned}
 91 | X = \left(
 92 | 	\begin{array}{c}
 93 | 	X_1 \\
 94 | 	X_2
 95 | 	\end{array}
 96 | \right)
 97 | \end{aligned}
 98 | $$
 99 | 
100 | 从而有：
101 | 
102 | $$
103 | \begin{aligned}
104 | \mu = \left(
105 | 	\begin{array}{c}
106 | 	\mu_1 \\
107 | 	\mu_2
108 | 	\end{array}
109 | \right)
110 | \end{aligned}
111 | $$
112 | 
113 | $$
114 | \begin{aligned}
115 | \Sigma = \left[
116 | 	\begin{array}{cc}
117 | 	\Sigma_{11} & \Sigma_{12} \\
118 | 	\Sigma_{21} & \Sigma_{22}
119 | 	\end{array}
120 | \right]
121 | \end{aligned}
122 | $$
123 | 
124 | 给定$X_1$求$X_2$的后验分布（这部分推导可以从相关文献中查到，此处略）：
125 | 
126 | $$
127 | \mu_{2|1} = \mu_2 + \Sigma_{21} \Sigma_{11}^{-1}(X_1 - \mu_1)
128 | $$
129 | 
130 | $$
131 | \Sigma_{2|1} = \Sigma_{22} - \Sigma_{21} \Sigma_{11}^{-1} \Sigma_{12}
132 | $$
133 | 
134 | 
135 | ***
136 | ## 三、高斯过程回归
137 | 
138 | 设随机变量$X = [x_1, x_2, x_3, ..., x_n]^T$，其服从正态分布：
139 | $$
140 | X \sim N(\mu, \Sigma)
141 | $$
142 | 其中$\mu = [\mu_1, \mu_2, ..., \mu_n]^T$为均值向量，$\Sigma$是这$n$个特征之间的协方差矩阵，将$\Sigma$展开有：
143 | 
144 | $$
145 | \Sigma =
146 | \left[
147 | 	\begin{matrix}
148 | 		cov_{1, 1} & cov_{1, 2} & \cdots & cov_{1, n} \\
149 | 		cov_{2, 1} & cov_{2, 2} & \cdots & cov_{2, n} \\
150 | 		\vdots & \vdots & \ddots & \vdots \\
151 | 		cov_{n, 1} & cov_{n, 2} & \cdots & cov_{n, n} \\
152 | 	\end{matrix}
153 | \right]
154 | $$
155 | 
156 | 其中$cov_{i,j}$表示特征$i$和特征$j$之间的协方差（covariance）。
157 | 
158 | 高斯过程样本与一般机器学习的样本区别在于，高斯过程中样本各特征之间存在相关关系，这种相关关系是通过协方差矩阵$\Sigma$来体现的。比如在一些时间序列模型里面，各个变量输出的时间序列在时间前后都会体现出一种相关性（比如平滑过渡等），这种模型输出就很适合使用高斯过程来模拟。
159 | 
160 | 
161 | ***3.1 协方差矩阵计算***  
162 | $\Sigma$可以通过高斯过程核进行求解，常见的高斯过程核有：
163 | 
164 | *RBF kernel:*
165 | 
166 | $$
167 | k = \sigma^2 \exp(-\frac{||t_a - t_b||^2}{2l^2})
168 | $$
169 | 
170 | ![RBF核](img/rbf_kernel.png)
171 | 
172 | *periodic kernel:*
173 | 
174 | $$
175 | k = \sigma^2 \exp(-\frac{2}{l^2} \sin^2(\frac{\pi}{p}|t_a - t_b|))
176 | $$
177 | 
178 | ![periodic核](img/periodic_kernel.png)
179 | 
180 | *linear_kernel:*
181 | 
182 | $$
183 | k = \sigma_b^2 + \sigma^2 * (t_a - c)(t_b - c)
184 | $$
185 | 
186 | ![linear核](img/linear_kernel.png)
187 | 
188 | 这样当知道两个随机变量指数$t_a$和$t_b$后，便可通过核函数计算两个变量间的协方差。如果对所有随机变量均进行上述计算便可获得协方差矩阵$\Sigma$。有了协方差矩阵$\Sigma$后便可对高斯过程进行采样（一般认为高斯过程先验分布均值$\mu$应无偏为0）。
189 | 
190 | 
191 | 
192 | ***3.2 高斯过程采样***  
193 | 获得了各随机变量$x$的均值信息$\mu$和联合分布的协方差矩阵$\Sigma$后，便可对该高斯过程进行随机采样。采样步骤如下：
194 | 
195 | 1. 首先对协方差矩阵$\Sigma$进行SVD分解，获得矩阵$\rm U$、$\rm S$和$\rm V$；
196 | 
197 | 2. 生成$N$个独立同分布的高斯随机变量（均值为0，标准差为1），组成向量$y$；
198 | 
199 | 3. 按照如下公式获得高斯过程样本：
200 | 
201 |    $$
202 |    x = \mu + {\rm U} \sqrt{\rm S}y
203 |    $$
204 | 
205 | 采样结果如下图所示，图中每条灰色曲线便对应一条高斯过程样本（$n=100$）,蓝色曲线表示样本均值，因为我们设定先验分布各维度上均值$\mu_i=0$，所以蓝色曲线在0附近波动。
206 | 
207 | ![高斯过程采样](img/gaussian_process_samples.png)
208 | 
209 | 
210 | ***3.3 后验分布和采样***  
211 | 3.2中获得的高斯过程样本为先验样本。但是当我们在某些指数$t$上获得了一批观测样本后，这批观测样本将有助于我们对其他指数集上的样本分布进行估计（后验）。我们将这批已观测指数集设为$X_1$，未观测到的指数集设为$X_2$。接下来便可使用第二节中的方法获得在$X_2$上样本分布后验概率参数$\mu_{2|1}$和$\Sigma_{2|1}$，最后重新对$X_2$上的随机变量进行采样。下图显示了后验分布样本：
212 | 
213 | ![](img/posterior.png)
214 | 


--------------------------------------------------------------------------------
/mod/config/config_loader.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on 2021/02/27 21:13:39
  4 | 
  5 | @File -> config_loader.py
  6 | 
  7 | @Author: luolei
  8 | 
  9 | @Email: dreisteine262@163.com
 10 | 
 11 | @Describe: 项目配置工具
 12 | """
 13 | 
 14 | import logging
 15 | 
 16 | logging.basicConfig(level=logging.INFO)
 17 | 
 18 | import matplotlib.pyplot as plt
 19 | import logging.config
 20 | import logging
 21 | import yaml
 22 | import sys
 23 | import os
 24 | 
# Add the parent of this file's directory to the module search path.
sys.path.append(os.path.join(os.path.dirname(__file__), '../'))

# Font sizes; only BIGGER_SIZE is referenced by the rc settings below.
SMALL_SIZE = 6
MEDIUM_SIZE = 8
BIGGER_SIZE = 10

# Global matplotlib defaults applied as an import-time side effect of this module.
plt.rc('font', size=BIGGER_SIZE, family='Times New Roman')
plt.rc('axes', titlesize=BIGGER_SIZE)
plt.rc('axes', labelsize=BIGGER_SIZE)
plt.rc('xtick', labelsize=BIGGER_SIZE)
plt.rc('ytick', labelsize=BIGGER_SIZE)
plt.rc('legend', fontsize=BIGGER_SIZE)
plt.rc('figure', titlesize=20)
plt.rc('mathtext', fontset = 'cm')
 39 | 
 40 | _color_map = {
 41 |     'blue': '#1f77b4',  # 蓝色
 42 |     'orange': '#ff7f0e',  # 黄橙色
 43 |     'green': '#2ca02c',  # 绿色
 44 |     'red': '#d62728',  # 红色
 45 |     'purple': '#9467bd',  # 紫色
 46 |     'cyan': '#17becf',  # 青色
 47 |     'grey': '#7f7f7f',  # 灰色
 48 |     'black': 'k',  # 黑色
 49 |     'white': 'w',
 50 | 
 51 |     # 类似色搭配互补色, 同一色系list中颜色由亮到暗排列.
 52 |     'similar-complement-cmap': {
 53 |             'greens': ['#5ED1BA', '#34D1B2', '#00A383', '#1F7A68', '#006A55'],
 54 |             'reds': ['#F97083', '#F93E58', '#F30021', '#B62E40s', '#9E0016'],
 55 |             'yellows': ['#FFCB73', '#FFB840', '#FFA100', '#BF8A30', '#A66900'],
 56 |             'oranges': ['#FFAA73', '#FF8B40', '#FF6400', '#BF6830', '#A64100'],
 57 |     }
 58 | }
 59 | 
 60 | 
 61 | def _load_yml(fp: str) -> dict:
 62 |     with open(fp, 'r', encoding='utf-8') as f:
 63 |         conf = yaml.load(f, Loader=yaml.Loader)  # yaml.FullLoader
 64 | 
 65 |     if conf is None:
 66 |         conf = {}
 67 | 
 68 |     return conf
 69 | 
 70 | 
 71 | class ConfigLoader(object):
 72 |     """项目配置装载器"""
 73 | 
 74 |     def __init__(self):
 75 |         self._get_proj_root_dir()
 76 |         self._config_path = os.path.join(self.proj_dir, 'config/')
 77 |         self._set_proj_cmap()
 78 |         self._load_model_config()
 79 |         self._load_environ_config()
 80 |         self._load_test_params()
 81 | 
 82 |     def _get_proj_root_dir(self):
 83 |         """获取项目根目录"""
 84 |         self._proj_dir = os.path.abspath(
 85 |             os.path.join(os.path.dirname(__file__), '../../'))
 86 | 
 87 |     @property
 88 |     def proj_dir(self):
 89 |         return self._proj_dir
 90 | 
 91 |     def _set_proj_cmap(self):
 92 |         """设置项目颜色方案"""
 93 |         self._proj_cmap = _color_map
 94 | 
 95 |     @property
 96 |     def proj_cmap(self):
 97 |         return self._proj_cmap
 98 | 
 99 |     def _load_model_config(self):
100 |         """载入模型参数配置文件"""
101 |         self._model_config_path = os.path.join(
102 |             self._config_path, 'model_config.yml')
103 |         self._model_config = _load_yml(self._model_config_path)
104 | 
105 |     @property
106 |     def proj_plt(self):
107 |         return plt
108 | 
109 |     @property
110 |     def model_config(self):
111 |         return self._model_config
112 | 
113 |     def _load_environ_config(self):
114 |         """载入环境变量配置"""
115 |         # 读取本地文件中的环境变量设置.
116 |         # 如果本地config中有master.yml则优先使用, 否则使用default.yml, 否则为空字典.
117 |         _environ_config_path_ = None
118 |         for _file_name in ['master.yml', 'default.yml']:
119 |             if _file_name in os.listdir(self._config_path):
120 |                 print('Use environmental variables in {}'.format(_file_name))
121 |                 _environ_config_path_ = os.path.join(
122 |                     self._config_path, _file_name)
123 |                 break
124 | 
125 |         if _environ_config_path_ is None:
126 |             self._local_environ_config = {}
127 |         else:
128 |             self._local_environ_config = _load_yml(_environ_config_path_)
129 | 
130 |         # 线上环境变量注入.
131 |         # 如果存在可注入环境变量, 则采用注入值, 否则采用环境变量配置文件中的值.
132 |         self._environ_config = self._local_environ_config
133 |         for key in self._local_environ_config.keys():
134 |             if key in os.environ.keys():
135 |                 self._environ_config.update({key: os.environ[key]})
136 | 
137 |     @property
138 |     def environ_config(self):
139 |         return self._environ_config
140 | 
141 |     def _load_test_params(self):
142 |         _test_params_path = os.path.join(self._config_path, 'test_params.yml')
143 |         self._test_params = _load_yml(_test_params_path)
144 | 
145 |     @property
146 |     def test_params(self):
147 |         return self._test_params
148 | 
149 |     def set_logging(self):
150 |         """日志配置"""
151 |         # 检查本地是否有日志目录, 若没有则创建.
152 |         if 'logs' not in os.listdir(self.proj_dir):
153 |             os.mkdir(os.path.join(self.proj_dir, 'logs/'))
154 | 
155 |         # 配置日志.
156 |         try:
157 |             _log_config = self._model_config['logging']
158 |         except Exception as e:
159 |             raise RuntimeError(
160 |                 'Cannot load logging params in model_config.yml, {}'.format(e))
161 | 
162 |         logging.config.dictConfig(_log_config)
163 | 
164 | 
# Module-level shared instance; constructing it reads the config files at import time.
config_loader = ConfigLoader()
166 | 


--------------------------------------------------------------------------------