├── .gitignore
├── LearningWithExpertKnowledge
├── data
│   ├── asian.csv
│   ├── asian.png
│   ├── asian_expert.csv
│   ├── asian_expert.xlsx
│   └── data.xlsx
├── estimator.py
├── expert.py
├── graph.py
├── log.txt
├── readme.md
├── results
│   ├── dag7.xlsx
│   ├── dag8.xlsx
│   └── dag9.xlsx
├── rpv_data
│   ├── data3-密码为实验室位置6位小写.zip
│   ├── expert_knowledge.xlsx
│   ├── expert_knowledge2.xlsx
│   ├── expert_knowledge3.xlsx
│   └── expert_knowledge4.xlsx
└── run
    ├── DAG.png
    ├── dag7.xlsx
    ├── log.txt
    ├── run0.py
    ├── run1.py
    ├── run2.py
    ├── run3.py
    ├── run4.py
    ├── run5.py
    ├── run6.py
    ├── run7.py
    ├── run8.py
    └── run9.py
├── main.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
/.idea/
/.vscode/
/venv/
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
*.csv
*.CSV

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version
.idea/
# celery beat schedule file
celerybeat-schedule
workspace.xml
# SageMath parsed files
*.sage.py
.idea/workspace.xml
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/LearningWithExpertKnowledge/data/asian.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Howardhuang98/Bayesian_network_learning/0c5703b3eae5a71dbc97acd5dd48aa818c61ac4f/LearningWithExpertKnowledge/data/asian.png
--------------------------------------------------------------------------------
/LearningWithExpertKnowledge/data/asian_expert.csv:
--------------------------------------------------------------------------------
,smoke,bronc,lung,asia,tub,either,dysp,xray
smoke,0,0.8,0.9,0.1,0.1,0.1,0.1,0.1
bronc,0.1,0,0,0,0.1,0.1,0.7,0.1
lung,0.1,0.2,0,0.2,0.3,0.8,0.1,0.1
asia,0.2,0.1,0.1,0,0.8,0.3,0.5,0.2
tub,0.1,0.5,0.3,0.3,0,0.7,0.1,0.2
either,0.2,0.3,0.1,0.2,0.2,0,0.8,0.6
dysp,0,0,0.1,0.1,0.3,0,0,0
xray,0.1,0.1,0.2,0,0,0,0.2,0
--------------------------------------------------------------------------------
/LearningWithExpertKnowledge/data/asian_expert.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Howardhuang98/Bayesian_network_learning/0c5703b3eae5a71dbc97acd5dd48aa818c61ac4f/LearningWithExpertKnowledge/data/asian_expert.xlsx
--------------------------------------------------------------------------------
/LearningWithExpertKnowledge/data/data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Howardhuang98/Bayesian_network_learning/0c5703b3eae5a71dbc97acd5dd48aa818c61ac4f/LearningWithExpertKnowledge/data/data.xlsx
--------------------------------------------------------------------------------
/LearningWithExpertKnowledge/estimator.py:
--------------------------------------------------------------------------------
import logging
from collections import deque
from itertools import permutations

import networkx as nx
import numpy as np
import pandas as pd
from tqdm import trange

from LearningWithExpertKnowledge.expert import ExpertKnowledge
from LearningWithExpertKnowledge.graph import DAG


class Estimator:
    def __init__(self, data: pd.DataFrame, expert: ExpertKnowledge, k=10000):
        self.data = data
        self.expert = expert
        self.k = k
        self.DAG = DAG()
        self.vars = data.columns
        self.state_names = {
            var: self._collect_state_names(var) for var in self.vars
        }
        # Check that the expert knowledge covers the same variables as the data columns
        for var in self.vars:
            if var not in expert.variables:
                raise ValueError("Expert knowledge does not match the data!")
        # Logging setup
        logging.basicConfig(filename='log.txt', level=0, filemode="w", format="")
        logging.info("***** Log file *****")
        logging.info("Data preview:")
        logging.info(self.data.head(5))
        logging.info("Expert knowledge preview:")
        logging.info(self.expert.data)

    def _collect_state_names(self, variable):
        """
        Collect the state names of the given variable.
        :param variable:
        :return:
        """
        states = sorted(list(self.data.loc[:, variable].dropna().unique()))
        return states

    def state_counts(self, variable, parents=None):
        """
        Count how often each state of `variable` occurs, optionally conditional on the parents' states.
        :param variable:
        :param parents:
        :return:
        """
        if parents is None:
            parents = []
        parents = list(parents)

        # ignores either any row containing NaN, or only those where the variable or its parents is NaN
        data = self.data

        if not parents:
            # count how often each state of 'variable' occurred
            state_count_data = data.loc[:, variable].value_counts()
            state_counts = (
                state_count_data.reindex(self.state_names[variable]).fillna(0).to_frame()
            )

        else:
            parents_states = [self.state_names[parent] for parent in parents]
            # count how often each state of 'variable' occurred, conditional on parents' states
            state_count_data = (
                data.groupby([variable] + parents).size().unstack(parents)
            )
            if not isinstance(state_count_data.columns, pd.MultiIndex):
                state_count_data.columns = pd.MultiIndex.from_arrays(
                    [state_count_data.columns]
                )

            # reindex rows & columns to sort them and to add missing ones
            # missing row    = some state of 'variable' did not occur in data
            # missing column = some state configuration of current 'variable's parents
            #                  did not occur in data
            row_index = self.state_names[variable]
            column_index = pd.MultiIndex.from_product(parents_states, names=parents)
            state_counts = state_count_data.reindex(
                index=row_index, columns=column_index
            ).fillna(0)

        return state_counts

    def expert_score(self, variable, parents):
        """
        Expert-scoring component.
        :param variable:
        :param parents:
        :return:
        """
        parents = set(parents)
        sample_size = len(self.data)
        # Compute the expert score
        score = 1
        for node in self.vars:
            if node == variable:
                continue
            thinks = self.expert.think(variable, node)
            if node in parents:
                score *= thinks[1]
            else:
                score *= thinks[2]
        # Polarize the score:
        # if every expert entry were the uninformative 0.333, then score = 0.333 ** (len(self.vars) - 1)
        zero_point = 0.333 ** (len(self.vars) - 1)
        if score > zero_point:
            score = 10e17 / (1 - zero_point) * (score - zero_point)
        else:
            score = (-10e17) / (0 - zero_point) * (score - zero_point)

        # Account for the sample size:
        score *= self.k / sample_size

        return score

    def score_function(self, variable, parents):
        """
        Score of the local structure: data log-likelihood plus the expert score.
        :param variable:
        :param parents: must be a list
        :return:
        """
        var_states = self.state_names[variable]
        var_cardinality = len(var_states)
        state_counts = self.state_counts(variable, parents)
        sample_size = len(self.data)
        num_parents_states = float(state_counts.shape[1])

        counts = np.asarray(state_counts)
        log_likelihoods = np.zeros_like(counts, dtype=np.float_)

        # Compute the log-counts
        np.log(counts, out=log_likelihoods, where=counts > 0)

        # Compute the log-conditional sample size
        log_conditionals = np.sum(counts, axis=0, dtype=np.float_)
        np.log(log_conditionals, out=log_conditionals, where=log_conditionals > 0)
        # Compute the log-likelihoods
        log_likelihoods -= log_conditionals
        log_likelihoods *= counts

        likelihood_score = np.sum(log_likelihoods)

        expert_score = self.expert_score(variable=variable, parents=parents)
        score = likelihood_score + expert_score
        logging.info("Local structure of {} with parents {} scored: {} + {} = {}".format(
            variable, parents, likelihood_score, expert_score, score))

        return score

    def legal_operations(self, tabu_list):
        tabu_list = set(tabu_list)
        potential_new_edges = (
            set(permutations(self.vars, 2))
            - set(self.DAG.edges())
            - set([(Y, X) for (X, Y) in self.DAG.edges()])
        )
        for (X, Y) in potential_new_edges:
            # Check if adding (X, Y) will create a cycle.
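            # Adding X -> Y closes a cycle exactly when a directed path from Y back to X
            # already exists, so the new edge is only considered if no such path is found.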
            if not nx.has_path(self.DAG, Y, X):
                operation = ("+", (X, Y))
                if operation not in tabu_list:
                    old_parents = self.DAG.get_parents(Y)
                    new_parents = old_parents + [X]
                    score_delta = self.score_function(Y, new_parents) - self.score_function(Y, old_parents)
                    yield (operation, score_delta)

        for (X, Y) in self.DAG.edges():
            operation = ("-", (X, Y))
            if operation not in tabu_list:
                old_parents = self.DAG.get_parents(Y)
                new_parents = old_parents[:]
                new_parents.remove(X)
                score_delta = self.score_function(Y, new_parents) - self.score_function(Y, old_parents)
                yield (operation, score_delta)

        for (X, Y) in self.DAG.edges():
            # Check if flipping creates any cycles
            if not any(
                map(lambda path: len(path) > 2, nx.all_simple_paths(self.DAG, X, Y))
            ):
                operation = ("flip", (X, Y))
                if operation not in tabu_list:
                    old_X_parents = self.DAG.get_parents(X)
                    old_Y_parents = self.DAG.get_parents(Y)
                    new_X_parents = old_X_parents + [Y]
                    new_Y_parents = old_Y_parents[:]
                    new_Y_parents.remove(X)
                    score_delta = (
                        self.score_function(X, new_X_parents)
                        + self.score_function(Y, new_Y_parents)
                        - self.score_function(X, old_X_parents)
                        - self.score_function(Y, old_Y_parents)
                    )
                    yield (operation, score_delta)

    def run(self, epsilon=1e-4, max_iter=1e6):
        """
        Greedy hill-climbing search with a tabu list.
        :param epsilon:
        :param max_iter:
        :return:
        """
        ########
        # Initial checks: omitted
        ########
        # Initialization
        start_dag = self.DAG
        start_dag.add_nodes_from(self.vars)
        tabu_list = deque(maxlen=100)
        current_model = start_dag
        # In each iteration, find the best (operation, score_delta)
        iteration = trange(int(max_iter))
        for _ in iteration:
            logging.debug(current_model.edges)
            best_operation, best_score_delta = max(
                self.legal_operations(tabu_list),
                key=lambda t: t[1],
                default=(None, None),
            )
            logging.info("Best operation found: {}".format(best_operation))
            if best_operation is None or best_score_delta < epsilon:
                break
            elif best_operation[0] == "+":
                current_model.add_edge(*best_operation[1])
                tabu_list.append(("-", best_operation[1]))
            elif best_operation[0] == "-":
                current_model.remove_edge(*best_operation[1])
                tabu_list.append(("+", best_operation[1]))
            elif best_operation[0] == "flip":
                X, Y = best_operation[1]
                current_model.remove_edge(X, Y)
                current_model.add_edge(Y, X)
                tabu_list.append(best_operation)
        return current_model

    def mic_of_edge(self, u, v):
        """
        Compute the correlation between a pair of nodes with the maximal information coefficient (MIC).
        Reference: Detecting novel associations in large data sets [J]. Science, 2011, 334(6062): 1518-1524.
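        Note: not implemented yet; the method body below is currently a stub.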
        :param u:
        :param v:
        :return:
        """
        pass

    def corr_of_edges(self, u, v):
        """
        Compute the correlation coefficient between two nodes.
        Note: the correlation coefficient measures how strongly two random variables X and Y are
        related and takes values in [-1, 1]. The larger its absolute value, the stronger the
        relationship; when X and Y are linearly related it equals 1 (positive linear relation)
        or -1 (negative linear relation).
        :param u:
        :param v:
        :return:
        """
        var1 = self.data[u].values
        var2 = self.data[v].values
        corr = np.corrcoef(var1, var2)[0][1]
        return corr

    def add_weight_to_edges(self):
        """
        Add a weight to every edge based on its correlation, transformed so that:
        100: farthest, weakest correlation
        0:   closest, strongest correlation
        :return:
        """
        if not self.DAG.edges:
            print("No edge was found!")
            return None
        for edge in self.DAG.edges:
            weight = (1 - abs(self.corr_of_edges(edge[0], edge[1]))) * 100
            self.DAG[edge[0]][edge[1]]["weight"] = weight

    def importance_of_node(self, node):
        """
        Compute the importance of the given node (node-contraction method).
        Reference: 复杂网络中节点重要度评估的节点收缩方法 [D]. 2006.
        (A node-contraction method for evaluating node importance in complex networks.)
        :param node:
        :return:
        """
        # Compute the distance matrix
        distance_matrix = nx.floyd_warshall_numpy(self.DAG, weight="weight")
        # Compute the cohesion of the initial network
        where_are_inf = np.isinf(distance_matrix)
        _distance_matrix = distance_matrix
        _distance_matrix[where_are_inf] = 0
        cohesion_of_initial_network = (len(self.DAG.nodes) - 1) / _distance_matrix.sum()
        # Contract the node (not implemented yet):
        # contracting a node is equivalent to setting the distance from all of its neighbours to the node to 0

    def centrality_of_nodes(self):
        centrality = nx.katz_centrality(self.DAG, weight="weight")
        return centrality


if __name__ == '__main__':
    chen_data = pd.DataFrame({
        "A": [0, 0.8, 0, 0.3],
        "B": [0.1, 0, 0.3, 0.9],
        "C": [1, 0.2, 0, 0.1],
        "D": [0.3, 0.2, 0.1, 0]
    }, index=["A", "B", "C", "D"])
    print(chen_data)
    chen = ExpertKnowledge(data=chen_data)
    data = pd.read_excel(r"./data/data.xlsx")
    a = Estimator(data=data, expert=chen)
    a.run()
    print(a.corr_of_edges('A', 'B'))
    a.add_weight_to_edges()
    print(a.DAG.edges.data())
    print(a.centrality_of_nodes())
--------------------------------------------------------------------------------
/LearningWithExpertKnowledge/expert.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | """
4 | Expert knowledge is recorded in tabular form.
5 | The representation of the expert knowledge:
6 | the row variable points to the column variable!
7 |    A    B    C    D
8 | A  0    0.1  0.5  0.3
9 | B  0.8  0    0.2  0.2
10 | C  0.7  0.3  0    0.1
11 | D  0.3  0.9  0.1  0
12 | """
13 | 
14 | 
15 | class ExpertKnowledge:
16 |     def __init__(self, data: pd.DataFrame):
17 |         self.data = data
18 |         self.variables = data.columns
19 |         # Ideally a check should be added here to ensure values[i,j] + values[j,i] <= 1
20 |         #
21 |         # To be added
22 |         #
23 | 
24 |     def think(self, u, v):
25 |         """
26 |         专家对于u->v,u<-v,u>